diff mbox

[v3,1/3] dma: Support multiple interleaved frames with non-contiguous memory

Message ID 1392465619-27614-2-git-send-email-sthokal@xilinx.com (mailing list archive)
State Superseded
Headers show

Commit Message

Srikanth Thokala Feb. 15, 2014, noon UTC
The current implementation of interleaved DMA API support multiple
frames only when the memory is contiguous by incrementing src_start/
dst_start members of interleaved template.

But, when the memory is non-contiguous it will restrict slave device
to not submit multiple frames in a batch.  This patch handles this
issue by allowing the slave device to send array of interleaved dma
templates each having a different memory location.

Signed-off-by: Srikanth Thokala <sthokal@xilinx.com>
---
 Documentation/dmaengine.txt              |    2 +-
 drivers/dma/imx-dma.c                    |    3 ++-
 drivers/dma/sirf-dma.c                   |    3 ++-
 drivers/media/platform/m2m-deinterlace.c |    2 +-
 include/linux/dmaengine.h                |    6 +++---
 5 files changed, 9 insertions(+), 7 deletions(-)

Comments

Vinod Koul Feb. 17, 2014, 8:43 a.m. UTC | #1
On Sat, Feb 15, 2014 at 05:30:17PM +0530, Srikanth Thokala wrote:
> The current implementation of interleaved DMA API support multiple
> frames only when the memory is contiguous by incrementing src_start/
> dst_start members of interleaved template.
> 
> But, when the memory is non-contiguous it will restrict slave device
> to not submit multiple frames in a batch.  This patch handles this
> issue by allowing the slave device to send array of interleaved dma
> templates each having a different memory location.
This seems to be missing the numbers of templates you are sending, wouldnt this
require sending ARRAY_SiZE too?

And why send double pointer?
Srikanth Thokala Feb. 17, 2014, 9:29 a.m. UTC | #2
On Mon, Feb 17, 2014 at 2:13 PM, Vinod Koul <vinod.koul@intel.com> wrote:
> On Sat, Feb 15, 2014 at 05:30:17PM +0530, Srikanth Thokala wrote:
>> The current implementation of interleaved DMA API support multiple
>> frames only when the memory is contiguous by incrementing src_start/
>> dst_start members of interleaved template.
>>
>> But, when the memory is non-contiguous it will restrict slave device
>> to not submit multiple frames in a batch.  This patch handles this
>> issue by allowing the slave device to send array of interleaved dma
>> templates each having a different memory location.
> This seems to be missing the numbers of templates you are sending, wouldnt this
> require sending ARRAY_SiZE too?
>
> And why send double pointer?

Array size is not required, when we pass the double pointer.  The last
element would be
pointed to NULL and we could get the number of templates from this condition.
Here is an example snippet,

In slave device driver,

        struct dma_interleaved_template **xts;

        xts = kcalloc(frm_cnt+1, sizeof(struct
dma_interleaved_template *), GFP_KERNEL);
        /* Error check for xts */
        for (i = 0; i < frm_cnt; i++) {
                xts[i] = kmalloc(sizeof(struct
dma_interleaved_template), GFP_KERNEL);
                /* Error check for xts[i] */
        }
        xts[i] = NULL;

In DMA engine driver,  we could get the number of frames by,

        for (; xts[frmno] != NULL; frmno++);

I felt this way is simpler than adding an extra argument to the API.
Please let me know
your opinion and suggest me a better way.

Thanks
Srikanth

>
> --
> ~Vinod
>
>>
>> Signed-off-by: Srikanth Thokala <sthokal@xilinx.com>
>> ---
>>  Documentation/dmaengine.txt              |    2 +-
>>  drivers/dma/imx-dma.c                    |    3 ++-
>>  drivers/dma/sirf-dma.c                   |    3 ++-
>>  drivers/media/platform/m2m-deinterlace.c |    2 +-
>>  include/linux/dmaengine.h                |    6 +++---
>>  5 files changed, 9 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
>> index 879b6e3..c642614 100644
>> --- a/Documentation/dmaengine.txt
>> +++ b/Documentation/dmaengine.txt
>> @@ -94,7 +94,7 @@ The slave DMA usage consists of following steps:
>>               size_t period_len, enum dma_data_direction direction);
>>
>>       struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>> -             struct dma_chan *chan, struct dma_interleaved_template *xt,
>> +             struct dma_chan *chan, struct dma_interleaved_template **xts,
>>               unsigned long flags);
>>
>>     The peripheral driver is expected to have mapped the scatterlist for
>> diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
>> index 6f9ac20..e2c52ce 100644
>> --- a/drivers/dma/imx-dma.c
>> +++ b/drivers/dma/imx-dma.c
>> @@ -954,12 +954,13 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_memcpy(
>>  }
>>
>>  static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved(
>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>       unsigned long flags)
>>  {
>>       struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
>>       struct imxdma_engine *imxdma = imxdmac->imxdma;
>>       struct imxdma_desc *desc;
>> +     struct dma_interleaved_template *xt = *xts;
>>
>>       dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx dst_start=0x%llx\n"
>>               "   src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n", __func__,
>> diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
>> index d4d3a31..b6a150b 100644
>> --- a/drivers/dma/sirf-dma.c
>> +++ b/drivers/dma/sirf-dma.c
>> @@ -509,12 +509,13 @@ sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
>>  }
>>
>>  static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>       unsigned long flags)
>>  {
>>       struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
>>       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
>>       struct sirfsoc_dma_desc *sdesc = NULL;
>> +     struct dma_interleaved_template *xt = *xts;
>>       unsigned long iflags;
>>       int ret;
>>
>> diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c
>> index 6bb86b5..468110a 100644
>> --- a/drivers/media/platform/m2m-deinterlace.c
>> +++ b/drivers/media/platform/m2m-deinterlace.c
>> @@ -343,7 +343,7 @@ static void deinterlace_issue_dma(struct deinterlace_ctx *ctx, int op,
>>       ctx->xt->dst_sgl = true;
>>       flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
>>
>> -     tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags);
>> +     tx = dmadev->device_prep_interleaved_dma(chan, &ctx->xt, flags);
>>       if (tx == NULL) {
>>               v4l2_warn(&pcdev->v4l2_dev, "DMA interleaved prep error\n");
>>               return;
>> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
>> index c5c92d5..2f77a9a 100644
>> --- a/include/linux/dmaengine.h
>> +++ b/include/linux/dmaengine.h
>> @@ -675,7 +675,7 @@ struct dma_device {
>>               size_t period_len, enum dma_transfer_direction direction,
>>               unsigned long flags, void *context);
>>       struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>> -             struct dma_chan *chan, struct dma_interleaved_template *xt,
>> +             struct dma_chan *chan, struct dma_interleaved_template **xts,
>>               unsigned long flags);
>>       int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
>>               unsigned long arg);
>> @@ -752,10 +752,10 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
>>  }
>>
>>  static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
>> -             struct dma_chan *chan, struct dma_interleaved_template *xt,
>> +             struct dma_chan *chan, struct dma_interleaved_template **xts,
>>               unsigned long flags)
>>  {
>> -     return chan->device->device_prep_interleaved_dma(chan, xt, flags);
>> +     return chan->device->device_prep_interleaved_dma(chan, xts, flags);
>>  }
>>
>>  static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps)
>> --
>> 1.7.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe dmaengine" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> --
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lars-Peter Clausen Feb. 17, 2014, 9:35 a.m. UTC | #3
On 02/17/2014 10:29 AM, Srikanth Thokala wrote:
> On Mon, Feb 17, 2014 at 2:13 PM, Vinod Koul <vinod.koul@intel.com> wrote:
>> On Sat, Feb 15, 2014 at 05:30:17PM +0530, Srikanth Thokala wrote:
>>> The current implementation of interleaved DMA API support multiple
>>> frames only when the memory is contiguous by incrementing src_start/
>>> dst_start members of interleaved template.
>>>
>>> But, when the memory is non-contiguous it will restrict slave device
>>> to not submit multiple frames in a batch.  This patch handles this
>>> issue by allowing the slave device to send array of interleaved dma
>>> templates each having a different memory location.
>> This seems to be missing the numbers of templates you are sending, wouldnt this
>> require sending ARRAY_SiZE too?
>>
>> And why send double pointer?
>
> Array size is not required, when we pass the double pointer.  The last
> element would be
> pointed to NULL and we could get the number of templates from this condition.
> Here is an example snippet,
>
> In slave device driver,
>
>          struct dma_interleaved_template **xts;
>
>          xts = kcalloc(frm_cnt+1, sizeof(struct
> dma_interleaved_template *), GFP_KERNEL);
>          /* Error check for xts */
>          for (i = 0; i < frm_cnt; i++) {
>                  xts[i] = kmalloc(sizeof(struct
> dma_interleaved_template), GFP_KERNEL);
>                  /* Error check for xts[i] */
>          }
>          xts[i] = NULL;
>
> In DMA engine driver,  we could get the number of frames by,
>
>          for (; xts[frmno] != NULL; frmno++);
>
> I felt this way is simpler than adding an extra argument to the API.
> Please let me know
> your opinion and suggest me a better way.

I think Vinod's suggestion of passing in an array of interleaved_templates 
and the size of the array is better than what you are currently doing.

Btw. you also need to update the current implementations and users of the 
API accordingly.

>
>>
>> --
>> ~Vinod
>>
>>>
>>> Signed-off-by: Srikanth Thokala <sthokal@xilinx.com>
>>> ---
>>>   Documentation/dmaengine.txt              |    2 +-
>>>   drivers/dma/imx-dma.c                    |    3 ++-
>>>   drivers/dma/sirf-dma.c                   |    3 ++-
>>>   drivers/media/platform/m2m-deinterlace.c |    2 +-
>>>   include/linux/dmaengine.h                |    6 +++---
>>>   5 files changed, 9 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
>>> index 879b6e3..c642614 100644
>>> --- a/Documentation/dmaengine.txt
>>> +++ b/Documentation/dmaengine.txt
>>> @@ -94,7 +94,7 @@ The slave DMA usage consists of following steps:
>>>                size_t period_len, enum dma_data_direction direction);
>>>
>>>        struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>>> -             struct dma_chan *chan, struct dma_interleaved_template *xt,
>>> +             struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>                unsigned long flags);
>>>
>>>      The peripheral driver is expected to have mapped the scatterlist for
>>> diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
>>> index 6f9ac20..e2c52ce 100644
>>> --- a/drivers/dma/imx-dma.c
>>> +++ b/drivers/dma/imx-dma.c
>>> @@ -954,12 +954,13 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_memcpy(
>>>   }
>>>
>>>   static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved(
>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>        unsigned long flags)
>>>   {
>>>        struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
>>>        struct imxdma_engine *imxdma = imxdmac->imxdma;
>>>        struct imxdma_desc *desc;
>>> +     struct dma_interleaved_template *xt = *xts;
>>>
>>>        dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx dst_start=0x%llx\n"
>>>                "   src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n", __func__,
>>> diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
>>> index d4d3a31..b6a150b 100644
>>> --- a/drivers/dma/sirf-dma.c
>>> +++ b/drivers/dma/sirf-dma.c
>>> @@ -509,12 +509,13 @@ sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
>>>   }
>>>
>>>   static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>        unsigned long flags)
>>>   {
>>>        struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
>>>        struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
>>>        struct sirfsoc_dma_desc *sdesc = NULL;
>>> +     struct dma_interleaved_template *xt = *xts;
>>>        unsigned long iflags;
>>>        int ret;
>>>
>>> diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c
>>> index 6bb86b5..468110a 100644
>>> --- a/drivers/media/platform/m2m-deinterlace.c
>>> +++ b/drivers/media/platform/m2m-deinterlace.c
>>> @@ -343,7 +343,7 @@ static void deinterlace_issue_dma(struct deinterlace_ctx *ctx, int op,
>>>        ctx->xt->dst_sgl = true;
>>>        flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
>>>
>>> -     tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags);
>>> +     tx = dmadev->device_prep_interleaved_dma(chan, &ctx->xt, flags);
>>>        if (tx == NULL) {
>>>                v4l2_warn(&pcdev->v4l2_dev, "DMA interleaved prep error\n");
>>>                return;
>>> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
>>> index c5c92d5..2f77a9a 100644
>>> --- a/include/linux/dmaengine.h
>>> +++ b/include/linux/dmaengine.h
>>> @@ -675,7 +675,7 @@ struct dma_device {
>>>                size_t period_len, enum dma_transfer_direction direction,
>>>                unsigned long flags, void *context);
>>>        struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>>> -             struct dma_chan *chan, struct dma_interleaved_template *xt,
>>> +             struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>                unsigned long flags);
>>>        int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
>>>                unsigned long arg);
>>> @@ -752,10 +752,10 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
>>>   }
>>>
>>>   static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
>>> -             struct dma_chan *chan, struct dma_interleaved_template *xt,
>>> +             struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>                unsigned long flags)
>>>   {
>>> -     return chan->device->device_prep_interleaved_dma(chan, xt, flags);
>>> +     return chan->device->device_prep_interleaved_dma(chan, xts, flags);
>>>   }
>>>
>>>   static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps)
>>> --
>>> 1.7.9.5
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe dmaengine" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>> --
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srikanth Thokala Feb. 17, 2014, 9:42 a.m. UTC | #4
On Mon, Feb 17, 2014 at 3:05 PM, Lars-Peter Clausen <lars@metafoo.de> wrote:
> On 02/17/2014 10:29 AM, Srikanth Thokala wrote:
>>
>> On Mon, Feb 17, 2014 at 2:13 PM, Vinod Koul <vinod.koul@intel.com> wrote:
>>>
>>> On Sat, Feb 15, 2014 at 05:30:17PM +0530, Srikanth Thokala wrote:
>>>>
>>>> The current implementation of interleaved DMA API support multiple
>>>> frames only when the memory is contiguous by incrementing src_start/
>>>> dst_start members of interleaved template.
>>>>
>>>> But, when the memory is non-contiguous it will restrict slave device
>>>> to not submit multiple frames in a batch.  This patch handles this
>>>> issue by allowing the slave device to send array of interleaved dma
>>>> templates each having a different memory location.
>>>
>>> This seems to be missing the numbers of templates you are sending,
>>> wouldnt this
>>> require sending ARRAY_SiZE too?
>>>
>>> And why send double pointer?
>>
>>
>> Array size is not required, when we pass the double pointer.  The last
>> element would be
>> pointed to NULL and we could get the number of templates from this
>> condition.
>> Here is an example snippet,
>>
>> In slave device driver,
>>
>>          struct dma_interleaved_template **xts;
>>
>>          xts = kcalloc(frm_cnt+1, sizeof(struct
>> dma_interleaved_template *), GFP_KERNEL);
>>          /* Error check for xts */
>>          for (i = 0; i < frm_cnt; i++) {
>>                  xts[i] = kmalloc(sizeof(struct
>> dma_interleaved_template), GFP_KERNEL);
>>                  /* Error check for xts[i] */
>>          }
>>          xts[i] = NULL;
>>
>> In DMA engine driver,  we could get the number of frames by,
>>
>>          for (; xts[frmno] != NULL; frmno++);
>>
>> I felt this way is simpler than adding an extra argument to the API.
>> Please let me know
>> your opinion and suggest me a better way.
>
>
> I think Vinod's suggestion of passing in an array of interleaved_templates
> and the size of the array is better than what you are currently doing.

Ok, Lars.  I will update with this in my v4. Thanks.

>
> Btw. you also need to update the current implementations and users of the
> API accordingly.

Yes, I have updated them in this patch.

Thanks
Srikanth


>
>
>>
>>>
>>> --
>>> ~Vinod
>>>
>>>>
>>>> Signed-off-by: Srikanth Thokala <sthokal@xilinx.com>
>>>> ---
>>>>   Documentation/dmaengine.txt              |    2 +-
>>>>   drivers/dma/imx-dma.c                    |    3 ++-
>>>>   drivers/dma/sirf-dma.c                   |    3 ++-
>>>>   drivers/media/platform/m2m-deinterlace.c |    2 +-
>>>>   include/linux/dmaengine.h                |    6 +++---
>>>>   5 files changed, 9 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
>>>> index 879b6e3..c642614 100644
>>>> --- a/Documentation/dmaengine.txt
>>>> +++ b/Documentation/dmaengine.txt
>>>> @@ -94,7 +94,7 @@ The slave DMA usage consists of following steps:
>>>>                size_t period_len, enum dma_data_direction direction);
>>>>
>>>>        struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>> *xt,
>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>> **xts,
>>>>                unsigned long flags);
>>>>
>>>>      The peripheral driver is expected to have mapped the scatterlist
>>>> for
>>>> diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
>>>> index 6f9ac20..e2c52ce 100644
>>>> --- a/drivers/dma/imx-dma.c
>>>> +++ b/drivers/dma/imx-dma.c
>>>> @@ -954,12 +954,13 @@ static struct dma_async_tx_descriptor
>>>> *imxdma_prep_dma_memcpy(
>>>>   }
>>>>
>>>>   static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved(
>>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>>        unsigned long flags)
>>>>   {
>>>>        struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
>>>>        struct imxdma_engine *imxdma = imxdmac->imxdma;
>>>>        struct imxdma_desc *desc;
>>>> +     struct dma_interleaved_template *xt = *xts;
>>>>
>>>>        dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx
>>>> dst_start=0x%llx\n"
>>>>                "   src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n",
>>>> __func__,
>>>> diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
>>>> index d4d3a31..b6a150b 100644
>>>> --- a/drivers/dma/sirf-dma.c
>>>> +++ b/drivers/dma/sirf-dma.c
>>>> @@ -509,12 +509,13 @@ sirfsoc_dma_tx_status(struct dma_chan *chan,
>>>> dma_cookie_t cookie,
>>>>   }
>>>>
>>>>   static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
>>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>>        unsigned long flags)
>>>>   {
>>>>        struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
>>>>        struct sirfsoc_dma_chan *schan =
>>>> dma_chan_to_sirfsoc_dma_chan(chan);
>>>>        struct sirfsoc_dma_desc *sdesc = NULL;
>>>> +     struct dma_interleaved_template *xt = *xts;
>>>>        unsigned long iflags;
>>>>        int ret;
>>>>
>>>> diff --git a/drivers/media/platform/m2m-deinterlace.c
>>>> b/drivers/media/platform/m2m-deinterlace.c
>>>> index 6bb86b5..468110a 100644
>>>> --- a/drivers/media/platform/m2m-deinterlace.c
>>>> +++ b/drivers/media/platform/m2m-deinterlace.c
>>>> @@ -343,7 +343,7 @@ static void deinterlace_issue_dma(struct
>>>> deinterlace_ctx *ctx, int op,
>>>>        ctx->xt->dst_sgl = true;
>>>>        flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
>>>>
>>>> -     tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags);
>>>> +     tx = dmadev->device_prep_interleaved_dma(chan, &ctx->xt, flags);
>>>>        if (tx == NULL) {
>>>>                v4l2_warn(&pcdev->v4l2_dev, "DMA interleaved prep
>>>> error\n");
>>>>                return;
>>>> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
>>>> index c5c92d5..2f77a9a 100644
>>>> --- a/include/linux/dmaengine.h
>>>> +++ b/include/linux/dmaengine.h
>>>> @@ -675,7 +675,7 @@ struct dma_device {
>>>>                size_t period_len, enum dma_transfer_direction direction,
>>>>                unsigned long flags, void *context);
>>>>        struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>> *xt,
>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>> **xts,
>>>>                unsigned long flags);
>>>>        int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd
>>>> cmd,
>>>>                unsigned long arg);
>>>> @@ -752,10 +752,10 @@ static inline struct dma_async_tx_descriptor
>>>> *dmaengine_prep_dma_cyclic(
>>>>   }
>>>>
>>>>   static inline struct dma_async_tx_descriptor
>>>> *dmaengine_prep_interleaved_dma(
>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>> *xt,
>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>> **xts,
>>>>                unsigned long flags)
>>>>   {
>>>> -     return chan->device->device_prep_interleaved_dma(chan, xt, flags);
>>>> +     return chan->device->device_prep_interleaved_dma(chan, xts,
>>>> flags);
>>>>   }
>>>>
>>>>   static inline int dma_get_slave_caps(struct dma_chan *chan, struct
>>>> dma_slave_caps *caps)
>>>> --
>>>> 1.7.9.5
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe dmaengine" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>> --
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel"
>>> in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lars-Peter Clausen Feb. 17, 2014, 9:44 a.m. UTC | #5
On 02/17/2014 10:42 AM, Srikanth Thokala wrote:
> On Mon, Feb 17, 2014 at 3:05 PM, Lars-Peter Clausen <lars@metafoo.de> wrote:
>> On 02/17/2014 10:29 AM, Srikanth Thokala wrote:
>>>
>>> On Mon, Feb 17, 2014 at 2:13 PM, Vinod Koul <vinod.koul@intel.com> wrote:
>>>>
>>>> On Sat, Feb 15, 2014 at 05:30:17PM +0530, Srikanth Thokala wrote:
>>>>>
>>>>> The current implementation of interleaved DMA API support multiple
>>>>> frames only when the memory is contiguous by incrementing src_start/
>>>>> dst_start members of interleaved template.
>>>>>
>>>>> But, when the memory is non-contiguous it will restrict slave device
>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>> templates each having a different memory location.
>>>>
>>>> This seems to be missing the numbers of templates you are sending,
>>>> wouldnt this
>>>> require sending ARRAY_SiZE too?
>>>>
>>>> And why send double pointer?
>>>
>>>
>>> Array size is not required, when we pass the double pointer.  The last
>>> element would be
>>> pointed to NULL and we could get the number of templates from this
>>> condition.
>>> Here is an example snippet,
>>>
>>> In slave device driver,
>>>
>>>           struct dma_interleaved_template **xts;
>>>
>>>           xts = kcalloc(frm_cnt+1, sizeof(struct
>>> dma_interleaved_template *), GFP_KERNEL);
>>>           /* Error check for xts */
>>>           for (i = 0; i < frm_cnt; i++) {
>>>                   xts[i] = kmalloc(sizeof(struct
>>> dma_interleaved_template), GFP_KERNEL);
>>>                   /* Error check for xts[i] */
>>>           }
>>>           xts[i] = NULL;
>>>
>>> In DMA engine driver,  we could get the number of frames by,
>>>
>>>           for (; xts[frmno] != NULL; frmno++);
>>>
>>> I felt this way is simpler than adding an extra argument to the API.
>>> Please let me know
>>> your opinion and suggest me a better way.
>>
>>
>> I think Vinod's suggestion of passing in an array of interleaved_templates
>> and the size of the array is better than what you are currently doing.
>
> Ok, Lars.  I will update with this in my v4. Thanks.
>
>>
>> Btw. you also need to update the current implementations and users of the
>> API accordingly.
>
> Yes, I have updated them in this patch.

But you didn't update them accordingly to your proposed semantics. The 
caller didn't NULL terminate the array and the drivers did blindly assume 
there will always be exactly one transfer.

>
> Thanks
> Srikanth
>
>
>>
>>
>>>
>>>>
>>>> --
>>>> ~Vinod
>>>>
>>>>>
>>>>> Signed-off-by: Srikanth Thokala <sthokal@xilinx.com>
>>>>> ---
>>>>>    Documentation/dmaengine.txt              |    2 +-
>>>>>    drivers/dma/imx-dma.c                    |    3 ++-
>>>>>    drivers/dma/sirf-dma.c                   |    3 ++-
>>>>>    drivers/media/platform/m2m-deinterlace.c |    2 +-
>>>>>    include/linux/dmaengine.h                |    6 +++---
>>>>>    5 files changed, 9 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
>>>>> index 879b6e3..c642614 100644
>>>>> --- a/Documentation/dmaengine.txt
>>>>> +++ b/Documentation/dmaengine.txt
>>>>> @@ -94,7 +94,7 @@ The slave DMA usage consists of following steps:
>>>>>                 size_t period_len, enum dma_data_direction direction);
>>>>>
>>>>>         struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>>> *xt,
>>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>>> **xts,
>>>>>                 unsigned long flags);
>>>>>
>>>>>       The peripheral driver is expected to have mapped the scatterlist
>>>>> for
>>>>> diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
>>>>> index 6f9ac20..e2c52ce 100644
>>>>> --- a/drivers/dma/imx-dma.c
>>>>> +++ b/drivers/dma/imx-dma.c
>>>>> @@ -954,12 +954,13 @@ static struct dma_async_tx_descriptor
>>>>> *imxdma_prep_dma_memcpy(
>>>>>    }
>>>>>
>>>>>    static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved(
>>>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>>>         unsigned long flags)
>>>>>    {
>>>>>         struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
>>>>>         struct imxdma_engine *imxdma = imxdmac->imxdma;
>>>>>         struct imxdma_desc *desc;
>>>>> +     struct dma_interleaved_template *xt = *xts;
>>>>>
>>>>>         dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx
>>>>> dst_start=0x%llx\n"
>>>>>                 "   src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n",
>>>>> __func__,
>>>>> diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
>>>>> index d4d3a31..b6a150b 100644
>>>>> --- a/drivers/dma/sirf-dma.c
>>>>> +++ b/drivers/dma/sirf-dma.c
>>>>> @@ -509,12 +509,13 @@ sirfsoc_dma_tx_status(struct dma_chan *chan,
>>>>> dma_cookie_t cookie,
>>>>>    }
>>>>>
>>>>>    static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
>>>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>>>         unsigned long flags)
>>>>>    {
>>>>>         struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
>>>>>         struct sirfsoc_dma_chan *schan =
>>>>> dma_chan_to_sirfsoc_dma_chan(chan);
>>>>>         struct sirfsoc_dma_desc *sdesc = NULL;
>>>>> +     struct dma_interleaved_template *xt = *xts;
>>>>>         unsigned long iflags;
>>>>>         int ret;
>>>>>
>>>>> diff --git a/drivers/media/platform/m2m-deinterlace.c
>>>>> b/drivers/media/platform/m2m-deinterlace.c
>>>>> index 6bb86b5..468110a 100644
>>>>> --- a/drivers/media/platform/m2m-deinterlace.c
>>>>> +++ b/drivers/media/platform/m2m-deinterlace.c
>>>>> @@ -343,7 +343,7 @@ static void deinterlace_issue_dma(struct
>>>>> deinterlace_ctx *ctx, int op,
>>>>>         ctx->xt->dst_sgl = true;
>>>>>         flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
>>>>>
>>>>> -     tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags);
>>>>> +     tx = dmadev->device_prep_interleaved_dma(chan, &ctx->xt, flags);
>>>>>         if (tx == NULL) {
>>>>>                 v4l2_warn(&pcdev->v4l2_dev, "DMA interleaved prep
>>>>> error\n");
>>>>>                 return;
>>>>> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
>>>>> index c5c92d5..2f77a9a 100644
>>>>> --- a/include/linux/dmaengine.h
>>>>> +++ b/include/linux/dmaengine.h
>>>>> @@ -675,7 +675,7 @@ struct dma_device {
>>>>>                 size_t period_len, enum dma_transfer_direction direction,
>>>>>                 unsigned long flags, void *context);
>>>>>         struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
>>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>>> *xt,
>>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>>> **xts,
>>>>>                 unsigned long flags);
>>>>>         int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd
>>>>> cmd,
>>>>>                 unsigned long arg);
>>>>> @@ -752,10 +752,10 @@ static inline struct dma_async_tx_descriptor
>>>>> *dmaengine_prep_dma_cyclic(
>>>>>    }
>>>>>
>>>>>    static inline struct dma_async_tx_descriptor
>>>>> *dmaengine_prep_interleaved_dma(
>>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>>> *xt,
>>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>>> **xts,
>>>>>                 unsigned long flags)
>>>>>    {
>>>>> -     return chan->device->device_prep_interleaved_dma(chan, xt, flags);
>>>>> +     return chan->device->device_prep_interleaved_dma(chan, xts,
>>>>> flags);
>>>>>    }
>>>>>
>>>>>    static inline int dma_get_slave_caps(struct dma_chan *chan, struct
>>>>> dma_slave_caps *caps)
>>>>> --
>>>>> 1.7.9.5
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe dmaengine" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>
>>>>
>>>> --
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel"
>>>> in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srikanth Thokala Feb. 17, 2014, 9:52 a.m. UTC | #6
On Mon, Feb 17, 2014 at 3:14 PM, Lars-Peter Clausen <lars@metafoo.de> wrote:
> On 02/17/2014 10:42 AM, Srikanth Thokala wrote:
>>
>> On Mon, Feb 17, 2014 at 3:05 PM, Lars-Peter Clausen <lars@metafoo.de>
>> wrote:
>>>
>>> On 02/17/2014 10:29 AM, Srikanth Thokala wrote:
>>>>
>>>>
>>>> On Mon, Feb 17, 2014 at 2:13 PM, Vinod Koul <vinod.koul@intel.com>
>>>> wrote:
>>>>>
>>>>>
>>>>> On Sat, Feb 15, 2014 at 05:30:17PM +0530, Srikanth Thokala wrote:
>>>>>>
>>>>>>
>>>>>> The current implementation of interleaved DMA API support multiple
>>>>>> frames only when the memory is contiguous by incrementing src_start/
>>>>>> dst_start members of interleaved template.
>>>>>>
>>>>>> But, when the memory is non-contiguous it will restrict slave device
>>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>>> templates each having a different memory location.
>>>>>
>>>>>
>>>>> This seems to be missing the numbers of templates you are sending,
>>>>> wouldnt this
>>>>> require sending ARRAY_SiZE too?
>>>>>
>>>>> And why send double pointer?
>>>>
>>>>
>>>>
>>>> Array size is not required, when we pass the double pointer.  The last
>>>> element would be
>>>> pointed to NULL and we could get the number of templates from this
>>>> condition.
>>>> Here is an example snippet,
>>>>
>>>> In slave device driver,
>>>>
>>>>           struct dma_interleaved_template **xts;
>>>>
>>>>           xts = kcalloc(frm_cnt+1, sizeof(struct
>>>> dma_interleaved_template *), GFP_KERNEL);
>>>>           /* Error check for xts */
>>>>           for (i = 0; i < frm_cnt; i++) {
>>>>                   xts[i] = kmalloc(sizeof(struct
>>>> dma_interleaved_template), GFP_KERNEL);
>>>>                   /* Error check for xts[i] */
>>>>           }
>>>>           xts[i] = NULL;
>>>>
>>>> In DMA engine driver,  we could get the number of frames by,
>>>>
>>>>           for (; xts[frmno] != NULL; frmno++);
>>>>
>>>> I felt this way is simpler than adding an extra argument to the API.
>>>> Please let me know
>>>> your opinion and suggest me a better way.
>>>
>>>
>>>
>>> I think Vinod's suggestion of passing in an array of
>>> interleaved_templates
>>> and the size of the array is better than what you are currently doing.
>>
>>
>> Ok, Lars.  I will update with this in my v4. Thanks.
>>
>>>
>>> Btw. you also need to update the current implementations and users of the
>>> API accordingly.
>>
>>
>> Yes, I have updated them in this patch.
>
>
> But you didn't update them accordingly to your proposed semantics. The
> caller didn't NULL terminate the array and the drivers did blindly assume
> there will always be exactly one transfer.

Ok, got it.  I will correct in v4.  Thanks for pointing this.

Srikanth

>
>
>>
>> Thanks
>> Srikanth
>>
>>
>>>
>>>
>>>>
>>>>>
>>>>> --
>>>>> ~Vinod
>>>>>
>>>>>>
>>>>>> Signed-off-by: Srikanth Thokala <sthokal@xilinx.com>
>>>>>> ---
>>>>>>    Documentation/dmaengine.txt              |    2 +-
>>>>>>    drivers/dma/imx-dma.c                    |    3 ++-
>>>>>>    drivers/dma/sirf-dma.c                   |    3 ++-
>>>>>>    drivers/media/platform/m2m-deinterlace.c |    2 +-
>>>>>>    include/linux/dmaengine.h                |    6 +++---
>>>>>>    5 files changed, 9 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
>>>>>> index 879b6e3..c642614 100644
>>>>>> --- a/Documentation/dmaengine.txt
>>>>>> +++ b/Documentation/dmaengine.txt
>>>>>> @@ -94,7 +94,7 @@ The slave DMA usage consists of following steps:
>>>>>>                 size_t period_len, enum dma_data_direction direction);
>>>>>>
>>>>>>         struct dma_async_tx_descriptor
>>>>>> *(*device_prep_interleaved_dma)(
>>>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>>>> *xt,
>>>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>>>> **xts,
>>>>>>                 unsigned long flags);
>>>>>>
>>>>>>       The peripheral driver is expected to have mapped the scatterlist
>>>>>> for
>>>>>> diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
>>>>>> index 6f9ac20..e2c52ce 100644
>>>>>> --- a/drivers/dma/imx-dma.c
>>>>>> +++ b/drivers/dma/imx-dma.c
>>>>>> @@ -954,12 +954,13 @@ static struct dma_async_tx_descriptor
>>>>>> *imxdma_prep_dma_memcpy(
>>>>>>    }
>>>>>>
>>>>>>    static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved(
>>>>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>>>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>>>>         unsigned long flags)
>>>>>>    {
>>>>>>         struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
>>>>>>         struct imxdma_engine *imxdma = imxdmac->imxdma;
>>>>>>         struct imxdma_desc *desc;
>>>>>> +     struct dma_interleaved_template *xt = *xts;
>>>>>>
>>>>>>         dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx
>>>>>> dst_start=0x%llx\n"
>>>>>>                 "   src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n",
>>>>>> __func__,
>>>>>> diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
>>>>>> index d4d3a31..b6a150b 100644
>>>>>> --- a/drivers/dma/sirf-dma.c
>>>>>> +++ b/drivers/dma/sirf-dma.c
>>>>>> @@ -509,12 +509,13 @@ sirfsoc_dma_tx_status(struct dma_chan *chan,
>>>>>> dma_cookie_t cookie,
>>>>>>    }
>>>>>>
>>>>>>    static struct dma_async_tx_descriptor
>>>>>> *sirfsoc_dma_prep_interleaved(
>>>>>> -     struct dma_chan *chan, struct dma_interleaved_template *xt,
>>>>>> +     struct dma_chan *chan, struct dma_interleaved_template **xts,
>>>>>>         unsigned long flags)
>>>>>>    {
>>>>>>         struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
>>>>>>         struct sirfsoc_dma_chan *schan =
>>>>>> dma_chan_to_sirfsoc_dma_chan(chan);
>>>>>>         struct sirfsoc_dma_desc *sdesc = NULL;
>>>>>> +     struct dma_interleaved_template *xt = *xts;
>>>>>>         unsigned long iflags;
>>>>>>         int ret;
>>>>>>
>>>>>> diff --git a/drivers/media/platform/m2m-deinterlace.c
>>>>>> b/drivers/media/platform/m2m-deinterlace.c
>>>>>> index 6bb86b5..468110a 100644
>>>>>> --- a/drivers/media/platform/m2m-deinterlace.c
>>>>>> +++ b/drivers/media/platform/m2m-deinterlace.c
>>>>>> @@ -343,7 +343,7 @@ static void deinterlace_issue_dma(struct
>>>>>> deinterlace_ctx *ctx, int op,
>>>>>>         ctx->xt->dst_sgl = true;
>>>>>>         flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
>>>>>>
>>>>>> -     tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags);
>>>>>> +     tx = dmadev->device_prep_interleaved_dma(chan, &ctx->xt, flags);
>>>>>>         if (tx == NULL) {
>>>>>>                 v4l2_warn(&pcdev->v4l2_dev, "DMA interleaved prep
>>>>>> error\n");
>>>>>>                 return;
>>>>>> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
>>>>>> index c5c92d5..2f77a9a 100644
>>>>>> --- a/include/linux/dmaengine.h
>>>>>> +++ b/include/linux/dmaengine.h
>>>>>> @@ -675,7 +675,7 @@ struct dma_device {
>>>>>>                 size_t period_len, enum dma_transfer_direction
>>>>>> direction,
>>>>>>                 unsigned long flags, void *context);
>>>>>>         struct dma_async_tx_descriptor
>>>>>> *(*device_prep_interleaved_dma)(
>>>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>>>> *xt,
>>>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>>>> **xts,
>>>>>>                 unsigned long flags);
>>>>>>         int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd
>>>>>> cmd,
>>>>>>                 unsigned long arg);
>>>>>> @@ -752,10 +752,10 @@ static inline struct dma_async_tx_descriptor
>>>>>> *dmaengine_prep_dma_cyclic(
>>>>>>    }
>>>>>>
>>>>>>    static inline struct dma_async_tx_descriptor
>>>>>> *dmaengine_prep_interleaved_dma(
>>>>>> -             struct dma_chan *chan, struct dma_interleaved_template
>>>>>> *xt,
>>>>>> +             struct dma_chan *chan, struct dma_interleaved_template
>>>>>> **xts,
>>>>>>                 unsigned long flags)
>>>>>>    {
>>>>>> -     return chan->device->device_prep_interleaved_dma(chan, xt,
>>>>>> flags);
>>>>>> +     return chan->device->device_prep_interleaved_dma(chan, xts,
>>>>>> flags);
>>>>>>    }
>>>>>>
>>>>>>    static inline int dma_get_slave_caps(struct dma_chan *chan, struct
>>>>>> dma_slave_caps *caps)
>>>>>> --
>>>>>> 1.7.9.5
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe dmaengine"
>>>>>> in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>
>>>>>
>>>>>
>>>>> --
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel"
>>>>> in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>> Please read the FAQ at  http://www.tux.org/lkml/
>>>
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel"
>>> in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jassi Brar Feb. 17, 2014, 9:57 a.m. UTC | #7
On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com> wrote:
> The current implementation of interleaved DMA API support multiple
> frames only when the memory is contiguous by incrementing src_start/
> dst_start members of interleaved template.
>
> But, when the memory is non-contiguous it will restrict slave device
> to not submit multiple frames in a batch.  This patch handles this
> issue by allowing the slave device to send array of interleaved dma
> templates each having a different memory location.
>
How fragmented could be memory in your case? Is it inefficient to
submit separate transfers for each segment/frame?
It will help if you could give a typical example (chunk size and gap
in bytes) of what you worry about.

Thanks,
Jassi
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srikanth Thokala Feb. 18, 2014, 11:28 a.m. UTC | #8
On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com> wrote:
>> The current implementation of interleaved DMA API support multiple
>> frames only when the memory is contiguous by incrementing src_start/
>> dst_start members of interleaved template.
>>
>> But, when the memory is non-contiguous it will restrict slave device
>> to not submit multiple frames in a batch.  This patch handles this
>> issue by allowing the slave device to send array of interleaved dma
>> templates each having a different memory location.
>>
> How fragmented could be memory in your case? Is it inefficient to
> submit separate transfers for each segment/frame?
> It will help if you could give a typical example (chunk size and gap
> in bytes) of what you worry about.

With scatter-gather engine feature in the hardware, submitting separate
transfers for each frame look inefficient. As an example, our DMA engine
supports up to 16 video frames, with each frame (a typical video frame
size) being contiguous in memory but frames are scattered into different
locations. We could not definitely submit frame by frame as it would be
software overhead (HW interrupting for each frame) resulting in video lags.

By this approach, it will allow slave device to submit multiple frames at
once.

Srikanth

>
> Thanks,
> Jassi
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jassi Brar Feb. 18, 2014, 4:50 p.m. UTC | #9
On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com> wrote:
> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>> The current implementation of interleaved DMA API support multiple
>>> frames only when the memory is contiguous by incrementing src_start/
>>> dst_start members of interleaved template.
>>>
>>> But, when the memory is non-contiguous it will restrict slave device
>>> to not submit multiple frames in a batch.  This patch handles this
>>> issue by allowing the slave device to send array of interleaved dma
>>> templates each having a different memory location.
>>>
>> How fragmented could be memory in your case? Is it inefficient to
>> submit separate transfers for each segment/frame?
>> It will help if you could give a typical example (chunk size and gap
>> in bytes) of what you worry about.
>
> With scatter-gather engine feature in the hardware, submitting separate
> transfers for each frame look inefficient. As an example, our DMA engine
> supports up to 16 video frames, with each frame (a typical video frame
> size) being contiguous in memory but frames are scattered into different
> locations. We could not definitely submit frame by frame as it would be
> software overhead (HW interrupting for each frame) resulting in video lags.
>
IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
inefficient at all. Even poor-latency audio would generate a higher
interrupt-rate. So the "inefficiency concern" doesn't seem valid to
me.

Not to mean we shouldn't strive to reduce the interrupt-rate further.
Another option is to emulate the ring-buffer scheme of ALSA.... which
should be possible since for a session of video playback the frame
buffers' locations wouldn't change.

Yet another option is to use the full potential of the
interleaved-xfer api as such. It seems you confuse a 'video frame'
with the interleaved-xfer api's 'frame'. They are different.

Assuming your one video frame is F bytes long and Gk is the gap in
bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
for i!=j
In the context of interleaved-xfer api, you have just 1 Frame of 16
chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
0<=k<15
So for your use-case .....
  dma_interleaved_template.numf = 1   /* just 1 frame */
  dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
   ...... //other parameters

You have 3 options to choose from and all should work just as fine.
Otherwise please state your problem in real numbers (video-frames'
size, count & gap in bytes).

-Jassi
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srikanth Thokala Feb. 18, 2014, 5:46 p.m. UTC | #10
On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com> wrote:
>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>> The current implementation of interleaved DMA API support multiple
>>>> frames only when the memory is contiguous by incrementing src_start/
>>>> dst_start members of interleaved template.
>>>>
>>>> But, when the memory is non-contiguous it will restrict slave device
>>>> to not submit multiple frames in a batch.  This patch handles this
>>>> issue by allowing the slave device to send array of interleaved dma
>>>> templates each having a different memory location.
>>>>
>>> How fragmented could be memory in your case? Is it inefficient to
>>> submit separate transfers for each segment/frame?
>>> It will help if you could give a typical example (chunk size and gap
>>> in bytes) of what you worry about.
>>
>> With scatter-gather engine feature in the hardware, submitting separate
>> transfers for each frame look inefficient. As an example, our DMA engine
>> supports up to 16 video frames, with each frame (a typical video frame
>> size) being contiguous in memory but frames are scattered into different
>> locations. We could not definitely submit frame by frame as it would be
>> software overhead (HW interrupting for each frame) resulting in video lags.
>>
> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
> inefficient at all. Even poor-latency audio would generate a higher
> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
> me.
>
> Not to mean we shouldn't strive to reduce the interrupt-rate further.
> Another option is to emulate the ring-buffer scheme of ALSA.... which
> should be possible since for a session of video playback the frame
> buffers' locations wouldn't change.
>
> Yet another option is to use the full potential of the
> interleaved-xfer api as such. It seems you confuse a 'video frame'
> with the interleaved-xfer api's 'frame'. They are different.
>
> Assuming your one video frame is F bytes long and Gk is the gap in
> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
> for i!=j
> In the context of interleaved-xfer api, you have just 1 Frame of 16
> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
> 0<=k<15
> So for your use-case .....
>   dma_interleaved_template.numf = 1   /* just 1 frame */
>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>    ...... //other parameters
>
> You have 3 options to choose from and all should work just as fine.
> Otherwise please state your problem in real numbers (video-frames'
> size, count & gap in bytes).

Initially I interpreted interleaved template the same.  But, Lars corrected me
in the subsequent discussion and let me put it here briefly,

In the interleaved template, each frame represents a line of size denoted by
chunk.size and the stride by icg.  'numf' represent number of frames i.e.
number of lines.

In video frame context,
chunk.size -> hsize
chunk.icg -> stride
numf -> vsize
and frame_size is always 1 as it will have only one chunk in a line.

So, the API would not allow to pass multiple frames and we came up with a
resolution to pass an array of interleaved template structs to handle this.

Srikanth

>
> -Jassi
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jassi Brar Feb. 18, 2014, 7:03 p.m. UTC | #11
On 18 February 2014 23:16, Srikanth Thokala <sthokal@xilinx.com> wrote:
> On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>> The current implementation of interleaved DMA API support multiple
>>>>> frames only when the memory is contiguous by incrementing src_start/
>>>>> dst_start members of interleaved template.
>>>>>
>>>>> But, when the memory is non-contiguous it will restrict slave device
>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>> templates each having a different memory location.
>>>>>
>>>> How fragmented could be memory in your case? Is it inefficient to
>>>> submit separate transfers for each segment/frame?
>>>> It will help if you could give a typical example (chunk size and gap
>>>> in bytes) of what you worry about.
>>>
>>> With scatter-gather engine feature in the hardware, submitting separate
>>> transfers for each frame look inefficient. As an example, our DMA engine
>>> supports up to 16 video frames, with each frame (a typical video frame
>>> size) being contiguous in memory but frames are scattered into different
>>> locations. We could not definitely submit frame by frame as it would be
>>> software overhead (HW interrupting for each frame) resulting in video lags.
>>>
>> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
>> inefficient at all. Even poor-latency audio would generate a higher
>> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
>> me.
>>
>> Not to mean we shouldn't strive to reduce the interrupt-rate further.
>> Another option is to emulate the ring-buffer scheme of ALSA.... which
>> should be possible since for a session of video playback the frame
>> buffers' locations wouldn't change.
>>
>> Yet another option is to use the full potential of the
>> interleaved-xfer api as such. It seems you confuse a 'video frame'
>> with the interleaved-xfer api's 'frame'. They are different.
>>
>> Assuming your one video frame is F bytes long and Gk is the gap in
>> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
>> for i!=j
>> In the context of interleaved-xfer api, you have just 1 Frame of 16
>> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
>> 0<=k<15
>> So for your use-case .....
>>   dma_interleaved_template.numf = 1   /* just 1 frame */
>>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>>    ...... //other parameters
>>
>> You have 3 options to choose from and all should work just as fine.
>> Otherwise please state your problem in real numbers (video-frames'
>> size, count & gap in bytes).
>
> Initially I interpreted interleaved template the same.  But, Lars corrected me
> in the subsequent discussion and let me put it here briefly,
>
> In the interleaved template, each frame represents a line of size denoted by
> chunk.size and the stride by icg.  'numf' represent number of frames i.e.
> number of lines.
>
> In video frame context,
> chunk.size -> hsize
> chunk.icg -> stride
> numf -> vsize
> and frame_size is always 1 as it will have only one chunk in a line.
>
But you said in your last post
  "with each frame (a typical video frame size) being contiguous in memory"
 ... which is not true from what you write above. Anyways, my first 2
suggestions still hold.

> So, the API would not allow to pass multiple frames and we came up with a
> resolution to pass an array of interleaved template structs to handle this.
>
Yeah the API doesn't allow such xfers that don't fall into any
'regular expression' of a transfer and also because no controller
natively supports such xfers -- your controller will break your
request up into 16 transfers and program them individually, right?
   BTW if you insist you could still express the 16 video frames as 1
interleaved-xfer frame with frame_size = (vsize + 1) * 16   ;)

Again, I would suggest you implement ring-buffer type scheme. Say
prepare 16 interleaved xfer templates and queue them. Upon each
xfer-done callback (i.e frame rendered), update the data and queue it
back. It might be much simpler for your actual case. At 30fps, 33ms to
queue a dma request should _not_ result in any frame-drop.

-Jassi
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srikanth Thokala Feb. 20, 2014, 9:24 a.m. UTC | #12
On Wed, Feb 19, 2014 at 12:33 AM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
> On 18 February 2014 23:16, Srikanth Thokala <sthokal@xilinx.com> wrote:
>> On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>>> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>>>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>>> The current implementation of interleaved DMA API support multiple
>>>>>> frames only when the memory is contiguous by incrementing src_start/
>>>>>> dst_start members of interleaved template.
>>>>>>
>>>>>> But, when the memory is non-contiguous it will restrict slave device
>>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>>> templates each having a different memory location.
>>>>>>
>>>>> How fragmented could be memory in your case? Is it inefficient to
>>>>> submit separate transfers for each segment/frame?
>>>>> It will help if you could give a typical example (chunk size and gap
>>>>> in bytes) of what you worry about.
>>>>
>>>> With scatter-gather engine feature in the hardware, submitting separate
>>>> transfers for each frame look inefficient. As an example, our DMA engine
>>>> supports up to 16 video frames, with each frame (a typical video frame
>>>> size) being contiguous in memory but frames are scattered into different
>>>> locations. We could not definitely submit frame by frame as it would be
>>>> software overhead (HW interrupting for each frame) resulting in video lags.
>>>>
>>> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
>>> inefficient at all. Even poor-latency audio would generate a higher
>>> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
>>> me.
>>>
>>> Not to mean we shouldn't strive to reduce the interrupt-rate further.
>>> Another option is to emulate the ring-buffer scheme of ALSA.... which
>>> should be possible since for a session of video playback the frame
>>> buffers' locations wouldn't change.
>>>
>>> Yet another option is to use the full potential of the
>>> interleaved-xfer api as such. It seems you confuse a 'video frame'
>>> with the interleaved-xfer api's 'frame'. They are different.
>>>
>>> Assuming your one video frame is F bytes long and Gk is the gap in
>>> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
>>> for i!=j
>>> In the context of interleaved-xfer api, you have just 1 Frame of 16
>>> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
>>> 0<=k<15
>>> So for your use-case .....
>>>   dma_interleaved_template.numf = 1   /* just 1 frame */
>>>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>>>    ...... //other parameters
>>>
>>> You have 3 options to choose from and all should work just as fine.
>>> Otherwise please state your problem in real numbers (video-frames'
>>> size, count & gap in bytes).
>>
>> Initially I interpreted interleaved template the same.  But, Lars corrected me
>> in the subsequent discussion and let me put it here briefly,
>>
>> In the interleaved template, each frame represents a line of size denoted by
>> chunk.size and the stride by icg.  'numf' represent number of frames i.e.
>> number of lines.
>>
>> In video frame context,
>> chunk.size -> hsize
>> chunk.icg -> stride
>> numf -> vsize
>> and frame_size is always 1 as it will have only one chunk in a line.
>>
> But you said in your last post
>   "with each frame (a typical video frame size) being contiguous in memory"
>  ... which is not true from what you write above. Anyways, my first 2
> suggestions still hold.

Yes, each video frame is contiguous and they can be scattered.

>
>> So, the API would not allow to pass multiple frames and we came up with a
>> resolution to pass an array of interleaved template structs to handle this.
>>
> Yeah the API doesn't allow such xfers that don't fall into any
> 'regular expression' of a transfer and also because no controller
> natively supports such xfers -- your controller will break your
> request up into 16 transfers and program them individually, right?

No, it will not program individually.  It has a scatter-gather engine where we
could update the current pointer to the first frame and tail pointer to the last
frame and hardware does the transfer and raise an interrupt when all the frames
are completed (ie. when current reaches tail).

>    BTW if you insist you could still express the 16 video frames as 1
> interleaved-xfer frame with frame_size = (vsize + 1) * 16   ;)
>
> Again, I would suggest you implement ring-buffer type scheme. Say
> prepare 16 interleaved xfer templates and queue them. Upon each
> xfer-done callback (i.e frame rendered), update the data and queue it
> back. It might be much simpler for your actual case. At 30fps, 33ms to
> queue a dma request should _not_ result in any frame-drop.

The driver has similar implementation, where each desc (handling 16 frames)
is pushed to pending queue and then to done queue.  Slave device still can
add desc's to the pending queue and whenever the transfer of 16 frames is
completed we move each desc to done list.

Srikanth

>
> -Jassi
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jassi Brar Feb. 20, 2014, 9:53 a.m. UTC | #13
On 20 February 2014 14:54, Srikanth Thokala <sthokal@xilinx.com> wrote:
> On Wed, Feb 19, 2014 at 12:33 AM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>> On 18 February 2014 23:16, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>> On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>>>> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>>>>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>>>> The current implementation of interleaved DMA API support multiple
>>>>>>> frames only when the memory is contiguous by incrementing src_start/
>>>>>>> dst_start members of interleaved template.
>>>>>>>
>>>>>>> But, when the memory is non-contiguous it will restrict slave device
>>>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>>>> templates each having a different memory location.
>>>>>>>
>>>>>> How fragmented could be memory in your case? Is it inefficient to
>>>>>> submit separate transfers for each segment/frame?
>>>>>> It will help if you could give a typical example (chunk size and gap
>>>>>> in bytes) of what you worry about.
>>>>>
>>>>> With scatter-gather engine feature in the hardware, submitting separate
>>>>> transfers for each frame look inefficient. As an example, our DMA engine
>>>>> supports up to 16 video frames, with each frame (a typical video frame
>>>>> size) being contiguous in memory but frames are scattered into different
>>>>> locations. We could not definitely submit frame by frame as it would be
>>>>> software overhead (HW interrupting for each frame) resulting in video lags.
>>>>>
>>>> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
>>>> inefficient at all. Even poor-latency audio would generate a higher
>>>> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
>>>> me.
>>>>
>>>> Not to mean we shouldn't strive to reduce the interrupt-rate further.
>>>> Another option is to emulate the ring-buffer scheme of ALSA.... which
>>>> should be possible since for a session of video playback the frame
>>>> buffers' locations wouldn't change.
>>>>
>>>> Yet another option is to use the full potential of the
>>>> interleaved-xfer api as such. It seems you confuse a 'video frame'
>>>> with the interleaved-xfer api's 'frame'. They are different.
>>>>
>>>> Assuming your one video frame is F bytes long and Gk is the gap in
>>>> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
>>>> for i!=j
>>>> In the context of interleaved-xfer api, you have just 1 Frame of 16
>>>> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
>>>> 0<=k<15
>>>> So for your use-case .....
>>>>   dma_interleaved_template.numf = 1   /* just 1 frame */
>>>>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>>>>    ...... //other parameters
>>>>
>>>> You have 3 options to choose from and all should work just as fine.
>>>> Otherwise please state your problem in real numbers (video-frames'
>>>> size, count & gap in bytes).
>>>
>>> Initially I interpreted interleaved template the same.  But, Lars corrected me
>>> in the subsequent discussion and let me put it here briefly,
>>>
>>> In the interleaved template, each frame represents a line of size denoted by
>>> chunk.size and the stride by icg.  'numf' represent number of frames i.e.
>>> number of lines.
>>>
>>> In video frame context,
>>> chunk.size -> hsize
>>> chunk.icg -> stride
>>> numf -> vsize
>>> and frame_size is always 1 as it will have only one chunk in a line.
>>>
>> But you said in your last post
>>   "with each frame (a typical video frame size) being contiguous in memory"
>>  ... which is not true from what you write above. Anyways, my first 2
>> suggestions still hold.
>
> Yes, each video frame is contiguous and they can be scattered.
>
I assume by contiguous frame you mean as in framebuffer?  Which is an
array of bytes.
If yes, then you should do as I suggest first,  frame_size=16  and numf=1.

If no, then it seems you are already doing the right thing.... the
ring-buffer scheme. Please share some stats how the current api is
causing you overhead because that is a very common case (many
controllers support LLI) and you have 467ms (@30fps with 16-frames
ring-buffer) to queue in before you see any frame drop.

Regards,
Jassi
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jassi Brar Feb. 24, 2014, 2:09 a.m. UTC | #14
On 21 February 2014 23:37, Srikanth Thokala <sthokal@xilinx.com> wrote:
> On Thu, Feb 20, 2014 at 3:23 PM, Jassi Brar <jaswinder.singh@linaro.org>
> wrote:
>> On 20 February 2014 14:54, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>> On Wed, Feb 19, 2014 at 12:33 AM, Jassi Brar <jaswinder.singh@linaro.org>
>>> wrote:
>>>> On 18 February 2014 23:16, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>> On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar
>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com>
>>>>>> wrote:
>>>>>>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar
>>>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com>
>>>>>>>> wrote:
>>>>>>>>> The current implementation of interleaved DMA API support multiple
>>>>>>>>> frames only when the memory is contiguous by incrementing
>>>>>>>>> src_start/
>>>>>>>>> dst_start members of interleaved template.
>>>>>>>>>
>>>>>>>>> But, when the memory is non-contiguous it will restrict slave
>>>>>>>>> device
>>>>>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>>>>>> templates each having a different memory location.
>>>>>>>>>
>>>>>>>> How fragmented could be memory in your case? Is it inefficient to
>>>>>>>> submit separate transfers for each segment/frame?
>>>>>>>> It will help if you could give a typical example (chunk size and gap
>>>>>>>> in bytes) of what you worry about.
>>>>>>>
>>>>>>> With scatter-gather engine feature in the hardware, submitting
>>>>>>> separate
>>>>>>> transfers for each frame look inefficient. As an example, our DMA
>>>>>>> engine
>>>>>>> supports up to 16 video frames, with each frame (a typical video
>>>>>>> frame
>>>>>>> size) being contiguous in memory but frames are scattered into
>>>>>>> different
>>>>>>> locations. We could not definitely submit frame by frame as it would
>>>>>>> be
>>>>>>> software overhead (HW interrupting for each frame) resulting in video
>>>>>>> lags.
>>>>>>>
>>>>>> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
>>>>>> inefficient at all. Even poor-latency audio would generate a higher
>>>>>> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
>>>>>> me.
>>>>>>
>>>>>> Not to mean we shouldn't strive to reduce the interrupt-rate further.
>>>>>> Another option is to emulate the ring-buffer scheme of ALSA.... which
>>>>>> should be possible since for a session of video playback the frame
>>>>>> buffers' locations wouldn't change.
>>>>>>
>>>>>> Yet another option is to use the full potential of the
>>>>>> interleaved-xfer api as such. It seems you confuse a 'video frame'
>>>>>> with the interleaved-xfer api's 'frame'. They are different.
>>>>>>
>>>>>> Assuming your one video frame is F bytes long and Gk is the gap in
>>>>>> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
>>>>>> for i!=j
>>>>>> In the context of interleaved-xfer api, you have just 1 Frame of 16
>>>>>> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
>>>>>> 0<=k<15
>>>>>> So for your use-case .....
>>>>>>   dma_interleaved_template.numf = 1   /* just 1 frame */
>>>>>>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>>>>>>    ...... //other parameters
>>>>>>
>>>>>> You have 3 options to choose from and all should work just as fine.
>>>>>> Otherwise please state your problem in real numbers (video-frames'
>>>>>> size, count & gap in bytes).
>>>>>
>>>>> Initially I interpreted interleaved template the same.  But, Lars
>>>>> corrected me
>>>>> in the subsequent discussion and let me put it here briefly,
>>>>>
>>>>> In the interleaved template, each frame represents a line of size
>>>>> denoted by
>>>>> chunk.size and the stride by icg.  'numf' represent number of frames
>>>>> i.e.
>>>>> number of lines.
>>>>>
>>>>> In video frame context,
>>>>> chunk.size -> hsize
>>>>> chunk.icg -> stride
>>>>> numf -> vsize
>>>>> and frame_size is always 1 as it will have only one chunk in a line.
>>>>>
>>>> But you said in your last post
>>>>   "with each frame (a typical video frame size) being contiguous in
>>>> memory"
>>>>  ... which is not true from what you write above. Anyways, my first 2
>>>> suggestions still hold.
>>>
>>> Yes, each video frame is contiguous and they can be scattered.
>>>
>> I assume by contiguous frame you mean as in framebuffer?  Which is an
>> array of bytes.
>> If yes, then you should do as I suggest first,  frame_size=16  and numf=1.
>
> I think am confusing you.  I would like to explain with an example.  Lets
> say
> each video frame is 4k size starting at address 0x10004000 (-0x10005000) and
> other frame at 0x20002000 (-0x20003000), and so on.
>
As I said plz dont confuse video frame with DMA frame.... in video
frame the stride is constant(zero or not) whereas in DMA context the
stride must be zero for the frame to be called contiguous.

> So, the frames are
> scattered in memory and as the template doesnt allow multiple src_start/
> dst_start we could not use single template to fill the HW descriptors (of
> frames).  So, I feel your suggestion might not work if the frames are
> scattered.
> Also, how could we get 'vsize' value in your approach?
>
The client driver(video driver) should know the frame parameters.
Practically you'll have to populate 16 transfer templates (frame
attributes and locations won't change for a session) and submit to be
transferred _cyclically_.

>  More importantly,
> we are overriding the semantics of interleaved template members.
>
Not at all. Interleaved-dma isn't meant for only constant stride/icg
transfers. Rather it's for identical frames with random strides.

>
>>
>> If no, then it seems you are already doing the right thing.... the
>> ring-buffer scheme. Please share some stats how the current api is
>> causing you overhead because that is a very common case (many
>> controllers support LLI) and you have 467ms (@30fps with 16-frames
>> ring-buffer) to queue in before you see any frame drop.
>
> As I mentioned earlier in the thread, our hardware has a SG engine where by
> we could send multiple frames in a batch.  Using the original implementation
> of interleaved API, we have three options to transfer.
>
> One is to send frame by frame to the hardware.  We get a async_tx desc for
> each frame and then we submit it to hardware triggering it to transfer this
> BD.
> We queue the next descriptor on the pending queue and whenever there is
> a completion interrupt we submit the next BD on this queue to hardware. In
> this implementation we are not efficiently using the SG engine in the
> hardware
> as we transferring frame by frame even HW allows us to transfer multiple
> frames.
>
Sending one frame at a time will likely cause jitters. That shouldn't be done.

> The second option is to queue all the BDs until the maximum frames that HW
> is
> capable to transfer in a batch and then submit to SG engine in the HW.  With
> this approach I feel there will be additional software overhead to track the
> number
> of maximum transfers and few additional cycles to release the cookie of each
> desc.
> Here, each desc represents a frame.
>
APIs are written for the gcd of h/w. We should optimize only for
majority and here isn't even anything gained after changing the api.
What you want to 'optimize' has been there since ever. Nobody
considers that overhead. BTW you don't even want to spend a few 'extra
cycles' but what about every other platform that doesn't support this
and will have to scan for such transfer requests?

> The last option is the current implementation of the driver along with this
> change in
> API.  It will allow us to send array of interleaved templates wherein we
> could allocate
> a single async desc which will handle multiple frames (or segments) and just
> submit
> this desc to HW.  Then we program the current to first frame, tail to last
> frame and
> HW will complete this transfer.  Here, each desc represents multiple frames.
>
> My point here is the driver should use hardware resources efficiently and I
> feel the
> driver will be inefficient if we dont use them.
>
I am afraid you are getting carried away by the 'awesomeness' of your
hardware. RingBuffers/Cyclic transfers  are meant for cases just like
yours.
Consider the following ... if you queue 16 frames and don't care to
track before all are transmitted, you'll have very high latency. Video
seeks will take unacceptably long and give the impression of a slow
system. Whereas if you get callbacks for each frame rendered, you
could updates frames from the next one thereby having very quick
response time.

Anyways there are already ways to achieve what you want (however a bad
idea that might be). So I am not convinced the change to api is any
useful (your hardware won't run any more efficiently with the change).

Regards,
Jassi
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srikanth Thokala Feb. 26, 2014, 2:21 p.m. UTC | #15
On Mon, Feb 24, 2014 at 7:39 AM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
> On 21 February 2014 23:37, Srikanth Thokala <sthokal@xilinx.com> wrote:
>> On Thu, Feb 20, 2014 at 3:23 PM, Jassi Brar <jaswinder.singh@linaro.org>
>> wrote:
>>> On 20 February 2014 14:54, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>> On Wed, Feb 19, 2014 at 12:33 AM, Jassi Brar <jaswinder.singh@linaro.org>
>>>> wrote:
>>>>> On 18 February 2014 23:16, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>>> On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar
>>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>>> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com>
>>>>>>> wrote:
>>>>>>>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar
>>>>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>>>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com>
>>>>>>>>> wrote:
>>>>>>>>>> The current implementation of interleaved DMA API support multiple
>>>>>>>>>> frames only when the memory is contiguous by incrementing
>>>>>>>>>> src_start/
>>>>>>>>>> dst_start members of interleaved template.
>>>>>>>>>>
>>>>>>>>>> But, when the memory is non-contiguous it will restrict slave
>>>>>>>>>> device
>>>>>>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>>>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>>>>>>> templates each having a different memory location.
>>>>>>>>>>
>>>>>>>>> How fragmented could be memory in your case? Is it inefficient to
>>>>>>>>> submit separate transfers for each segment/frame?
>>>>>>>>> It will help if you could give a typical example (chunk size and gap
>>>>>>>>> in bytes) of what you worry about.
>>>>>>>>
>>>>>>>> With scatter-gather engine feature in the hardware, submitting
>>>>>>>> separate
>>>>>>>> transfers for each frame look inefficient. As an example, our DMA
>>>>>>>> engine
>>>>>>>> supports up to 16 video frames, with each frame (a typical video
>>>>>>>> frame
>>>>>>>> size) being contiguous in memory but frames are scattered into
>>>>>>>> different
>>>>>>>> locations. We could not definitely submit frame by frame as it would
>>>>>>>> be
>>>>>>>> software overhead (HW interrupting for each frame) resulting in video
>>>>>>>> lags.
>>>>>>>>
>>>>>>> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
>>>>>>> inefficient at all. Even poor-latency audio would generate a higher
>>>>>>> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
>>>>>>> me.
>>>>>>>
>>>>>>> Not to mean we shouldn't strive to reduce the interrupt-rate further.
>>>>>>> Another option is to emulate the ring-buffer scheme of ALSA.... which
>>>>>>> should be possible since for a session of video playback the frame
>>>>>>> buffers' locations wouldn't change.
>>>>>>>
>>>>>>> Yet another option is to use the full potential of the
>>>>>>> interleaved-xfer api as such. It seems you confuse a 'video frame'
>>>>>>> with the interleaved-xfer api's 'frame'. They are different.
>>>>>>>
>>>>>>> Assuming your one video frame is F bytes long and Gk is the gap in
>>>>>>> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
>>>>>>> for i!=j
>>>>>>> In the context of interleaved-xfer api, you have just 1 Frame of 16
>>>>>>> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
>>>>>>> 0<=k<15
>>>>>>> So for your use-case .....
>>>>>>>   dma_interleaved_template.numf = 1   /* just 1 frame */
>>>>>>>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>>>>>>>    ...... //other parameters
>>>>>>>
>>>>>>> You have 3 options to choose from and all should work just as fine.
>>>>>>> Otherwise please state your problem in real numbers (video-frames'
>>>>>>> size, count & gap in bytes).
>>>>>>
>>>>>> Initially I interpreted interleaved template the same.  But, Lars
>>>>>> corrected me
>>>>>> in the subsequent discussion and let me put it here briefly,
>>>>>>
>>>>>> In the interleaved template, each frame represents a line of size
>>>>>> denoted by
>>>>>> chunk.size and the stride by icg.  'numf' represent number of frames
>>>>>> i.e.
>>>>>> number of lines.
>>>>>>
>>>>>> In video frame context,
>>>>>> chunk.size -> hsize
>>>>>> chunk.icg -> stride
>>>>>> numf -> vsize
>>>>>> and frame_size is always 1 as it will have only one chunk in a line.
>>>>>>
>>>>> But you said in your last post
>>>>>   "with each frame (a typical video frame size) being contiguous in
>>>>> memory"
>>>>>  ... which is not true from what you write above. Anyways, my first 2
>>>>> suggestions still hold.
>>>>
>>>> Yes, each video frame is contiguous and they can be scattered.
>>>>
>>> I assume by contiguous frame you mean as in framebuffer?  Which is an
>>> array of bytes.
>>> If yes, then you should do as I suggest first,  frame_size=16  and numf=1.
>>
>> I think am confusing you.  I would like to explain with an example.  Lets
>> say
>> each video frame is 4k size starting at address 0x10004000 (-0x10005000) and
>> other frame at 0x20002000 (-0x20003000), and so on.
>>
> As I said plz dont confuse video frame with DMA frame.... in video
> frame the stride is constant(zero or not) whereas in DMA context the
> stride must be zero for the frame to be called contiguous.
>
>> So, the frames are
>> scattered in memory and as the template doesnt allow multiple src_start/
>> dst_start we could not use single template to fill the HW descriptors (of
>> frames).  So, I feel your suggestion might not work if the frames are
>> scattered.
>> Also, how could we get 'vsize' value in your approach?
>>
> The client driver(video driver) should know the frame parameters.
> Practically you'll have to populate 16 transfer templates (frame
> attributes and locations won't change for a session) and submit to be
> transferred _cyclically_.
>
>>  More importantly,
>> we are overriding the semantics of interleaved template members.
>>
> Not at all. Interleaved-dma isn't meant for only constant stride/icg
> transfers. Rather it's for identical frames with random strides.
>
>>
>>>
>>> If no, then it seems you are already doing the right thing.... the
>>> ring-buffer scheme. Please share some stats how the current api is
>>> causing you overhead because that is a very common case (many
>>> controllers support LLI) and you have 467ms (@30fps with 16-frames
>>> ring-buffer) to queue in before you see any frame drop.
>>
>> As I mentioned earlier in the thread, our hardware has a SG engine where by
>> we could send multiple frames in a batch.  Using the original implementation
>> of interleaved API, we have three options to transfer.
>>
>> One is to send frame by frame to the hardware.  We get a async_tx desc for
>> each frame and then we submit it to hardware triggering it to transfer this
>> BD.
>> We queue the next descriptor on the pending queue and whenever there is
>> a completion interrupt we submit the next BD on this queue to hardware. In
>> this implementation we are not efficiently using the SG engine in the
>> hardware
>> as we transferring frame by frame even HW allows us to transfer multiple
>> frames.
>>
> Sending one frame at a time will likely cause jitters. That shouldn't be done.
>
>> The second option is to queue all the BDs until the maximum frames that HW
>> is
>> capable to transfer in a batch and then submit to SG engine in the HW.  With
>> this approach I feel there will be additional software overhead to track the
>> number
>> of maximum transfers and few additional cycles to release the cookie of each
>> desc.
>> Here, each desc represents a frame.
>>
> APIs are written for the gcd of h/w. We should optimize only for
> majority and here isn't even anything gained after changing the api.
> What you want to 'optimize' has been there since ever. Nobody
> considers that overhead. BTW you don't even want to spend a few 'extra
> cycles' but what about every other platform that doesn't support this
> and will have to scan for such transfer requests?
>
>> The last option is the current implementation of the driver along with this
>> change in
>> API.  It will allow us to send array of interleaved templates wherein we
>> could allocate
>> a single async desc which will handle multiple frames (or segments) and just
>> submit
>> this desc to HW.  Then we program the current to first frame, tail to last
>> frame and
>> HW will complete this transfer.  Here, each desc represents multiple frames.
>>
>> My point here is the driver should use hardware resources efficiently and I
>> feel the
>> driver will be inefficient if we dont use them.
>>
> I am afraid you are getting carried away by the 'awesomeness' of your
> hardware. RingBuffers/Cyclic transfers  are meant for cases just like
> yours.
> Consider the following ... if you queue 16 frames and don't care to
> track before all are transmitted, you'll have very high latency. Video
> seeks will take unacceptably long and give the impression of a slow
> system. Whereas if you get callbacks for each frame rendered, you
> could updates frames from the next one thereby having very quick
> response time.

Not at all, at least in our hardware, when we submit transfers it is most
likely to be completed. There are few errors, but mostly are recoverable
and it doesnt stop the transfer unless there is a critical error which needs
a reset to the whole system.  So, at least in my use case there will be
no latency.  I am not saying hardware is great, but it is the IP implementation.

Regarding this API change, I had earlier explained my use case in my
v2 thread.
Lars and Vinod came up with this resolution to allow array of
interleaved templates.
I also feel this is reasonable change for the subsystem.

Lars/Vinod, could you also comment on this thread for an early resolution?
Based on everyone's opinion I can take a call for my v4 patch.

Thanks
Srikanth

>
> Anyways there are already ways to achieve what you want (however a bad
> idea that might be). So I am not convinced the change to api is any
> useful (your hardware won't run any more efficiently with the change).
>
> Regards,
> Jassi
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jassi Brar Feb. 26, 2014, 3:02 p.m. UTC | #16
On 26 February 2014 23:21, Srikanth Thokala <sthokal@xilinx.com> wrote:
> On Mon, Feb 24, 2014 at 7:39 AM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>> On 21 February 2014 23:37, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>> On Thu, Feb 20, 2014 at 3:23 PM, Jassi Brar <jaswinder.singh@linaro.org>
>>> wrote:
>>>> On 20 February 2014 14:54, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>> On Wed, Feb 19, 2014 at 12:33 AM, Jassi Brar <jaswinder.singh@linaro.org>
>>>>> wrote:
>>>>>> On 18 February 2014 23:16, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>>>> On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar
>>>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>>>> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com>
>>>>>>>> wrote:
>>>>>>>>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar
>>>>>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>>>>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com>
>>>>>>>>>> wrote:
>>>>>>>>>>> The current implementation of interleaved DMA API support multiple
>>>>>>>>>>> frames only when the memory is contiguous by incrementing
>>>>>>>>>>> src_start/
>>>>>>>>>>> dst_start members of interleaved template.
>>>>>>>>>>>
>>>>>>>>>>> But, when the memory is non-contiguous it will restrict slave
>>>>>>>>>>> device
>>>>>>>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>>>>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>>>>>>>> templates each having a different memory location.
>>>>>>>>>>>
>>>>>>>>>> How fragmented could be memory in your case? Is it inefficient to
>>>>>>>>>> submit separate transfers for each segment/frame?
>>>>>>>>>> It will help if you could give a typical example (chunk size and gap
>>>>>>>>>> in bytes) of what you worry about.
>>>>>>>>>
>>>>>>>>> With scatter-gather engine feature in the hardware, submitting
>>>>>>>>> separate
>>>>>>>>> transfers for each frame look inefficient. As an example, our DMA
>>>>>>>>> engine
>>>>>>>>> supports up to 16 video frames, with each frame (a typical video
>>>>>>>>> frame
>>>>>>>>> size) being contiguous in memory but frames are scattered into
>>>>>>>>> different
>>>>>>>>> locations. We could not definitely submit frame by frame as it would
>>>>>>>>> be
>>>>>>>>> software overhead (HW interrupting for each frame) resulting in video
>>>>>>>>> lags.
>>>>>>>>>
>>>>>>>> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
>>>>>>>> inefficient at all. Even poor-latency audio would generate a higher
>>>>>>>> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
>>>>>>>> me.
>>>>>>>>
>>>>>>>> Not to mean we shouldn't strive to reduce the interrupt-rate further.
>>>>>>>> Another option is to emulate the ring-buffer scheme of ALSA.... which
>>>>>>>> should be possible since for a session of video playback the frame
>>>>>>>> buffers' locations wouldn't change.
>>>>>>>>
>>>>>>>> Yet another option is to use the full potential of the
>>>>>>>> interleaved-xfer api as such. It seems you confuse a 'video frame'
>>>>>>>> with the interleaved-xfer api's 'frame'. They are different.
>>>>>>>>
>>>>>>>> Assuming your one video frame is F bytes long and Gk is the gap in
>>>>>>>> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
>>>>>>>> for i!=j
>>>>>>>> In the context of interleaved-xfer api, you have just 1 Frame of 16
>>>>>>>> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
>>>>>>>> 0<=k<15
>>>>>>>> So for your use-case .....
>>>>>>>>   dma_interleaved_template.numf = 1   /* just 1 frame */
>>>>>>>>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>>>>>>>>    ...... //other parameters
>>>>>>>>
>>>>>>>> You have 3 options to choose from and all should work just as fine.
>>>>>>>> Otherwise please state your problem in real numbers (video-frames'
>>>>>>>> size, count & gap in bytes).
>>>>>>>
>>>>>>> Initially I interpreted interleaved template the same.  But, Lars
>>>>>>> corrected me
>>>>>>> in the subsequent discussion and let me put it here briefly,
>>>>>>>
>>>>>>> In the interleaved template, each frame represents a line of size
>>>>>>> denoted by
>>>>>>> chunk.size and the stride by icg.  'numf' represent number of frames
>>>>>>> i.e.
>>>>>>> number of lines.
>>>>>>>
>>>>>>> In video frame context,
>>>>>>> chunk.size -> hsize
>>>>>>> chunk.icg -> stride
>>>>>>> numf -> vsize
>>>>>>> and frame_size is always 1 as it will have only one chunk in a line.
>>>>>>>
>>>>>> But you said in your last post
>>>>>>   "with each frame (a typical video frame size) being contiguous in
>>>>>> memory"
>>>>>>  ... which is not true from what you write above. Anyways, my first 2
>>>>>> suggestions still hold.
>>>>>
>>>>> Yes, each video frame is contiguous and they can be scattered.
>>>>>
>>>> I assume by contiguous frame you mean as in framebuffer?  Which is an
>>>> array of bytes.
>>>> If yes, then you should do as I suggest first,  frame_size=16  and numf=1.
>>>
>>> I think am confusing you.  I would like to explain with an example.  Lets
>>> say
>>> each video frame is 4k size starting at address 0x10004000 (-0x10005000) and
>>> other frame at 0x20002000 (-0x20003000), and so on.
>>>
>> As I said plz dont confuse video frame with DMA frame.... in video
>> frame the stride is constant(zero or not) whereas in DMA context the
>> stride must be zero for the frame to be called contiguous.
>>
>>> So, the frames are
>>> scattered in memory and as the template doesnt allow multiple src_start/
>>> dst_start we could not use single template to fill the HW descriptors (of
>>> frames).  So, I feel your suggestion might not work if the frames are
>>> scattered.
>>> Also, how could we get 'vsize' value in your approach?
>>>
>> The client driver(video driver) should know the frame parameters.
>> Practically you'll have to populate 16 transfer templates (frame
>> attributes and locations won't change for a session) and submit to be
>> transferred _cyclically_.
>>
>>>  More importantly,
>>> we are overriding the semantics of interleaved template members.
>>>
>> Not at all. Interleaved-dma isn't meant for only constant stride/icg
>> transfers. Rather it's for identical frames with random strides.
>>
>>>
>>>>
>>>> If no, then it seems you are already doing the right thing.... the
>>>> ring-buffer scheme. Please share some stats how the current api is
>>>> causing you overhead because that is a very common case (many
>>>> controllers support LLI) and you have 467ms (@30fps with 16-frames
>>>> ring-buffer) to queue in before you see any frame drop.
>>>
>>> As I mentioned earlier in the thread, our hardware has a SG engine where by
>>> we could send multiple frames in a batch.  Using the original implementation
>>> of interleaved API, we have three options to transfer.
>>>
>>> One is to send frame by frame to the hardware.  We get a async_tx desc for
>>> each frame and then we submit it to hardware triggering it to transfer this
>>> BD.
>>> We queue the next descriptor on the pending queue and whenever there is
>>> a completion interrupt we submit the next BD on this queue to hardware. In
>>> this implementation we are not efficiently using the SG engine in the
>>> hardware
>>> as we transferring frame by frame even HW allows us to transfer multiple
>>> frames.
>>>
>> Sending one frame at a time will likely cause jitters. That shouldn't be done.
>>
>>> The second option is to queue all the BDs until the maximum frames that HW
>>> is
>>> capable to transfer in a batch and then submit to SG engine in the HW.  With
>>> this approach I feel there will be additional software overhead to track the
>>> number
>>> of maximum transfers and few additional cycles to release the cookie of each
>>> desc.
>>> Here, each desc represents a frame.
>>>
>> APIs are written for the gcd of h/w. We should optimize only for
>> majority and here isn't even anything gained after changing the api.
>> What you want to 'optimize' has been there since ever. Nobody
>> considers that overhead. BTW you don't even want to spend a few 'extra
>> cycles' but what about every other platform that doesn't support this
>> and will have to scan for such transfer requests?
>>
>>> The last option is the current implementation of the driver along with this
>>> change in
>>> API.  It will allow us to send array of interleaved templates wherein we
>>> could allocate
>>> a single async desc which will handle multiple frames (or segments) and just
>>> submit
>>> this desc to HW.  Then we program the current to first frame, tail to last
>>> frame and
>>> HW will complete this transfer.  Here, each desc represents multiple frames.
>>>
>>> My point here is the driver should use hardware resources efficiently and I
>>> feel the
>>> driver will be inefficient if we dont use them.
>>>
>> I am afraid you are getting carried away by the 'awesomeness' of your
>> hardware. RingBuffers/Cyclic transfers  are meant for cases just like
>> yours.
>> Consider the following ... if you queue 16 frames and don't care to
>> track before all are transmitted, you'll have very high latency. Video
>> seeks will take unacceptably long and give the impression of a slow
>> system. Whereas if you get callbacks for each frame rendered, you
>> could updates frames from the next one thereby having very quick
>> response time.
>
> Not at all, at least in our hardware, when we submit transfers it is most
> likely to be completed. There are few errors, but mostly are recoverable
> and it doesnt stop the transfer unless there is a critical error which needs
> a reset to the whole system.  So, at least in my use case there will be
> no latency.  I am not saying hardware is great, but it is the IP implementation.
>
I am not talking about the error cases.
Apply your patch locally so that you queue 16frames and not get
notified upon each frame 'rendered'...  now click on 'seek bar' of the
video player. See how slow it is to jump to play from the new
location.  Or if the frames are to be encoded after dma transfer...
see the 'padding' that would need to be done at the end.  These
concerns are common with audio subsystem using ring-buffers.
Anyways that is just FYI, I don't care how you implement your platform.

> Regarding this API change, I had earlier explained my use case in my
> v2 thread.
> Lars and Vinod came up with this resolution to allow array of
> interleaved templates.
> I also feel this is reasonable change for the subsystem.
>
I don't see you getting any better performance from your hardware and
I certainly don't find your usecase anything new (many dma controllers
support LLI and they work just fine as such). And I have already
explained how you could do, whatever you want, without this change.
So a polite NAK from me.
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srikanth Thokala Feb. 27, 2014, 6:04 p.m. UTC | #17
On Wed, Feb 26, 2014 at 8:32 PM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
> On 26 February 2014 23:21, Srikanth Thokala <sthokal@xilinx.com> wrote:
>> On Mon, Feb 24, 2014 at 7:39 AM, Jassi Brar <jaswinder.singh@linaro.org> wrote:
>>> On 21 February 2014 23:37, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>> On Thu, Feb 20, 2014 at 3:23 PM, Jassi Brar <jaswinder.singh@linaro.org>
>>>> wrote:
>>>>> On 20 February 2014 14:54, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>>> On Wed, Feb 19, 2014 at 12:33 AM, Jassi Brar <jaswinder.singh@linaro.org>
>>>>>> wrote:
>>>>>>> On 18 February 2014 23:16, Srikanth Thokala <sthokal@xilinx.com> wrote:
>>>>>>>> On Tue, Feb 18, 2014 at 10:20 PM, Jassi Brar
>>>>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>>>>> On 18 February 2014 16:58, Srikanth Thokala <sthokal@xilinx.com>
>>>>>>>>> wrote:
>>>>>>>>>> On Mon, Feb 17, 2014 at 3:27 PM, Jassi Brar
>>>>>>>>>> <jaswinder.singh@linaro.org> wrote:
>>>>>>>>>>> On 15 February 2014 17:30, Srikanth Thokala <sthokal@xilinx.com>
>>>>>>>>>>> wrote:
>>>>>>>>>>>> The current implementation of interleaved DMA API support multiple
>>>>>>>>>>>> frames only when the memory is contiguous by incrementing
>>>>>>>>>>>> src_start/
>>>>>>>>>>>> dst_start members of interleaved template.
>>>>>>>>>>>>
>>>>>>>>>>>> But, when the memory is non-contiguous it will restrict slave
>>>>>>>>>>>> device
>>>>>>>>>>>> to not submit multiple frames in a batch.  This patch handles this
>>>>>>>>>>>> issue by allowing the slave device to send array of interleaved dma
>>>>>>>>>>>> templates each having a different memory location.
>>>>>>>>>>>>
>>>>>>>>>>> How fragmented could be memory in your case? Is it inefficient to
>>>>>>>>>>> submit separate transfers for each segment/frame?
>>>>>>>>>>> It will help if you could give a typical example (chunk size and gap
>>>>>>>>>>> in bytes) of what you worry about.
>>>>>>>>>>
>>>>>>>>>> With scatter-gather engine feature in the hardware, submitting
>>>>>>>>>> separate
>>>>>>>>>> transfers for each frame look inefficient. As an example, our DMA
>>>>>>>>>> engine
>>>>>>>>>> supports up to 16 video frames, with each frame (a typical video
>>>>>>>>>> frame
>>>>>>>>>> size) being contiguous in memory but frames are scattered into
>>>>>>>>>> different
>>>>>>>>>> locations. We could not definitely submit frame by frame as it would
>>>>>>>>>> be
>>>>>>>>>> software overhead (HW interrupting for each frame) resulting in video
>>>>>>>>>> lags.
>>>>>>>>>>
>>>>>>>>> IIUIC, it is 30fps and one dma interrupt per frame ... it doesn't seem
>>>>>>>>> inefficient at all. Even poor-latency audio would generate a higher
>>>>>>>>> interrupt-rate. So the "inefficiency concern" doesn't seem valid to
>>>>>>>>> me.
>>>>>>>>>
>>>>>>>>> Not to mean we shouldn't strive to reduce the interrupt-rate further.
>>>>>>>>> Another option is to emulate the ring-buffer scheme of ALSA.... which
>>>>>>>>> should be possible since for a session of video playback the frame
>>>>>>>>> buffers' locations wouldn't change.
>>>>>>>>>
>>>>>>>>> Yet another option is to use the full potential of the
>>>>>>>>> interleaved-xfer api as such. It seems you confuse a 'video frame'
>>>>>>>>> with the interleaved-xfer api's 'frame'. They are different.
>>>>>>>>>
>>>>>>>>> Assuming your one video frame is F bytes long and Gk is the gap in
>>>>>>>>> bytes between end of frame [k] and start of frame [k+1] and  Gi != Gj
>>>>>>>>> for i!=j
>>>>>>>>> In the context of interleaved-xfer api, you have just 1 Frame of 16
>>>>>>>>> chunks. Each chunk is Fbytes and the inter-chunk-gap(ICG) is Gk  where
>>>>>>>>> 0<=k<15
>>>>>>>>> So for your use-case .....
>>>>>>>>>   dma_interleaved_template.numf = 1   /* just 1 frame */
>>>>>>>>>   dma_interleaved_template.frame_size = 16  /* containing 16 chunks */
>>>>>>>>>    ...... //other parameters
>>>>>>>>>
>>>>>>>>> You have 3 options to choose from and all should work just as fine.
>>>>>>>>> Otherwise please state your problem in real numbers (video-frames'
>>>>>>>>> size, count & gap in bytes).
>>>>>>>>
>>>>>>>> Initially I interpreted interleaved template the same.  But, Lars
>>>>>>>> corrected me
>>>>>>>> in the subsequent discussion and let me put it here briefly,
>>>>>>>>
>>>>>>>> In the interleaved template, each frame represents a line of size
>>>>>>>> denoted by
>>>>>>>> chunk.size and the stride by icg.  'numf' represent number of frames
>>>>>>>> i.e.
>>>>>>>> number of lines.
>>>>>>>>
>>>>>>>> In video frame context,
>>>>>>>> chunk.size -> hsize
>>>>>>>> chunk.icg -> stride
>>>>>>>> numf -> vsize
>>>>>>>> and frame_size is always 1 as it will have only one chunk in a line.
>>>>>>>>
>>>>>>> But you said in your last post
>>>>>>>   "with each frame (a typical video frame size) being contiguous in
>>>>>>> memory"
>>>>>>>  ... which is not true from what you write above. Anyways, my first 2
>>>>>>> suggestions still hold.
>>>>>>
>>>>>> Yes, each video frame is contiguous and they can be scattered.
>>>>>>
>>>>> I assume by contiguous frame you mean as in framebuffer?  Which is an
>>>>> array of bytes.
>>>>> If yes, then you should do as I suggest first,  frame_size=16  and numf=1.
>>>>
>>>> I think am confusing you.  I would like to explain with an example.  Lets
>>>> say
>>>> each video frame is 4k size starting at address 0x10004000 (-0x10005000) and
>>>> other frame at 0x20002000 (-0x20003000), and so on.
>>>>
>>> As I said plz dont confuse video frame with DMA frame.... in video
>>> frame the stride is constant(zero or not) whereas in DMA context the
>>> stride must be zero for the frame to be called contiguous.
>>>
>>>> So, the frames are
>>>> scattered in memory and as the template doesnt allow multiple src_start/
>>>> dst_start we could not use single template to fill the HW descriptors (of
>>>> frames).  So, I feel your suggestion might not work if the frames are
>>>> scattered.
>>>> Also, how could we get 'vsize' value in your approach?
>>>>
>>> The client driver(video driver) should know the frame parameters.
>>> Practically you'll have to populate 16 transfer templates (frame
>>> attributes and locations won't change for a session) and submit to be
>>> transferred _cyclically_.
>>>
>>>>  More importantly,
>>>> we are overriding the semantics of interleaved template members.
>>>>
>>> Not at all. Interleaved-dma isn't meant for only constant stride/icg
>>> transfers. Rather it's for identical frames with random strides.
>>>
>>>>
>>>>>
>>>>> If no, then it seems you are already doing the right thing.... the
>>>>> ring-buffer scheme. Please share some stats how the current api is
>>>>> causing you overhead because that is a very common case (many
>>>>> controllers support LLI) and you have 467ms (@30fps with 16-frames
>>>>> ring-buffer) to queue in before you see any frame drop.
>>>>
>>>> As I mentioned earlier in the thread, our hardware has a SG engine where by
>>>> we could send multiple frames in a batch.  Using the original implementation
>>>> of interleaved API, we have three options to transfer.
>>>>
>>>> One is to send frame by frame to the hardware.  We get a async_tx desc for
>>>> each frame and then we submit it to hardware triggering it to transfer this
>>>> BD.
>>>> We queue the next descriptor on the pending queue and whenever there is
>>>> a completion interrupt we submit the next BD on this queue to hardware. In
>>>> this implementation we are not efficiently using the SG engine in the
>>>> hardware
>>>> as we transferring frame by frame even HW allows us to transfer multiple
>>>> frames.
>>>>
>>> Sending one frame at a time will likely cause jitters. That shouldn't be done.
>>>
>>>> The second option is to queue all the BDs until the maximum frames that HW
>>>> is
>>>> capable to transfer in a batch and then submit to SG engine in the HW.  With
>>>> this approach I feel there will be additional software overhead to track the
>>>> number
>>>> of maximum transfers and few additional cycles to release the cookie of each
>>>> desc.
>>>> Here, each desc represents a frame.
>>>>
>>> APIs are written for the gcd of h/w. We should optimize only for
>>> majority and here isn't even anything gained after changing the api.
>>> What you want to 'optimize' has been there since ever. Nobody
>>> considers that overhead. BTW you don't even want to spend a few 'extra
>>> cycles' but what about every other platform that doesn't support this
>>> and will have to scan for such transfer requests?
>>>
>>>> The last option is the current implementation of the driver along with this
>>>> change in
>>>> API.  It will allow us to send array of interleaved templates wherein we
>>>> could allocate
>>>> a single async desc which will handle multiple frames (or segments) and just
>>>> submit
>>>> this desc to HW.  Then we program the current to first frame, tail to last
>>>> frame and
>>>> HW will complete this transfer.  Here, each desc represents multiple frames.
>>>>
>>>> My point here is the driver should use hardware resources efficiently and I
>>>> feel the
>>>> driver will be inefficient if we dont use them.
>>>>
>>> I am afraid you are getting carried away by the 'awesomeness' of your
>>> hardware. RingBuffers/Cyclic transfers  are meant for cases just like
>>> yours.
>>> Consider the following ... if you queue 16 frames and don't care to
>>> track before all are transmitted, you'll have very high latency. Video
>>> seeks will take unacceptably long and give the impression of a slow
>>> system. Whereas if you get callbacks for each frame rendered, you
>>> could updates frames from the next one thereby having very quick
>>> response time.
>>
>> Not at all, at least in our hardware, when we submit transfers it is most
>> likely to be completed. There are few errors, but mostly are recoverable
>> and it doesnt stop the transfer unless there is a critical error which needs
>> a reset to the whole system.  So, at least in my use case there will be
>> no latency.  I am not saying hardware is great, but it is the IP implementation.
>>
> I am not talking about the error cases.
> Apply your patch locally so that you queue 16frames and not get
> notified upon each frame 'rendered'...  now click on 'seek bar' of the
> video player. See how slow it is to jump to play from the new
> location.  Or if the frames are to be encoded after dma transfer...
> see the 'padding' that would need to be done at the end.  These
> concerns are common with audio subsystem using ring-buffers.
> Anyways that is just FYI, I don't care how you implement your platform.
>
>> Regarding this API change, I had earlier explained my use case in my
>> v2 thread.
>> Lars and Vinod came up with this resolution to allow array of
>> interleaved templates.
>> I also feel this is reasonable change for the subsystem.
>>
> I don't see you getting any better performance from your hardware and
> I certainly don't find your usecase anything new (many dma controllers
> support LLI and they work just fine as such). And I have already
> explained how you could do, whatever you want, without this change.
> So a polite NAK from me.

Ok.  I would go with your suggestion of having frame_size = 16 and will
send v4 dropping this change.

Thanks for the valuable suggestions.

Srikanth

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
index 879b6e3..c642614 100644
--- a/Documentation/dmaengine.txt
+++ b/Documentation/dmaengine.txt
@@ -94,7 +94,7 @@  The slave DMA usage consists of following steps:
 		size_t period_len, enum dma_data_direction direction);
 
 	struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
-		struct dma_chan *chan, struct dma_interleaved_template *xt,
+		struct dma_chan *chan, struct dma_interleaved_template **xts,
 		unsigned long flags);
 
    The peripheral driver is expected to have mapped the scatterlist for
diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
index 6f9ac20..e2c52ce 100644
--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -954,12 +954,13 @@  static struct dma_async_tx_descriptor *imxdma_prep_dma_memcpy(
 }
 
 static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved(
-	struct dma_chan *chan, struct dma_interleaved_template *xt,
+	struct dma_chan *chan, struct dma_interleaved_template **xts,
 	unsigned long flags)
 {
 	struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
 	struct imxdma_engine *imxdma = imxdmac->imxdma;
 	struct imxdma_desc *desc;
+	struct dma_interleaved_template *xt = *xts;
 
 	dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx dst_start=0x%llx\n"
 		"   src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n", __func__,
diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
index d4d3a31..b6a150b 100644
--- a/drivers/dma/sirf-dma.c
+++ b/drivers/dma/sirf-dma.c
@@ -509,12 +509,13 @@  sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
 }
 
 static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
-	struct dma_chan *chan, struct dma_interleaved_template *xt,
+	struct dma_chan *chan, struct dma_interleaved_template **xts,
 	unsigned long flags)
 {
 	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
 	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
 	struct sirfsoc_dma_desc *sdesc = NULL;
+	struct dma_interleaved_template *xt = *xts;
 	unsigned long iflags;
 	int ret;
 
diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c
index 6bb86b5..468110a 100644
--- a/drivers/media/platform/m2m-deinterlace.c
+++ b/drivers/media/platform/m2m-deinterlace.c
@@ -343,7 +343,7 @@  static void deinterlace_issue_dma(struct deinterlace_ctx *ctx, int op,
 	ctx->xt->dst_sgl = true;
 	flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
 
-	tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags);
+	tx = dmadev->device_prep_interleaved_dma(chan, &ctx->xt, flags);
 	if (tx == NULL) {
 		v4l2_warn(&pcdev->v4l2_dev, "DMA interleaved prep error\n");
 		return;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index c5c92d5..2f77a9a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -675,7 +675,7 @@  struct dma_device {
 		size_t period_len, enum dma_transfer_direction direction,
 		unsigned long flags, void *context);
 	struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
-		struct dma_chan *chan, struct dma_interleaved_template *xt,
+		struct dma_chan *chan, struct dma_interleaved_template **xts,
 		unsigned long flags);
 	int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 		unsigned long arg);
@@ -752,10 +752,10 @@  static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
 }
 
 static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
-		struct dma_chan *chan, struct dma_interleaved_template *xt,
+		struct dma_chan *chan, struct dma_interleaved_template **xts,
 		unsigned long flags)
 {
-	return chan->device->device_prep_interleaved_dma(chan, xt, flags);
+	return chan->device->device_prep_interleaved_dma(chan, xts, flags);
 }
 
 static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps)