Message ID | 20201214081310.10746-3-peter.ujfalusi@ti.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | dmaengine: ti: k3-udma: memcpy throughput improvement | expand |
On 14-12-20, 10:13, Peter Ujfalusi wrote: > The UDMA and BCDMA can provide higher throughput if the burst_size of the > channel is changed from it's default (which is 64 bytes) for Ultra-high > and high capacity channels. > > This performance benefit is even more visible when the buffers are aligned > with the burst_size configuration. > > The am654 does not have a way to change the burst size, but it is using > 64 bytes burst, so increasing the copy_align from 8 bytes to 64 (and > clients taking that into account) can increase the throughput as well. > > Numbers gathered on j721e: > echo 8000000 > /sys/module/dmatest/parameters/test_buf_size > echo 2000 > /sys/module/dmatest/parameters/timeout > echo 50 > /sys/module/dmatest/parameters/iterations > echo 1 > /sys/module/dmatest/parameters/max_channels > > Prior this patch: ~1.3 GB/s > After this patch: ~1.8 GB/s > with 1 byte alignment: ~1.7 GB/s > > Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com> > --- > drivers/dma/ti/k3-udma.c | 115 +++++++++++++++++++++++++++++++++++++-- > 1 file changed, 110 insertions(+), 5 deletions(-) > > diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c > index 87157cbae1b8..54e4ccb1b37e 100644 > --- a/drivers/dma/ti/k3-udma.c > +++ b/drivers/dma/ti/k3-udma.c > @@ -121,6 +121,11 @@ struct udma_oes_offsets { > #define UDMA_FLAG_PDMA_ACC32 BIT(0) > #define UDMA_FLAG_PDMA_BURST BIT(1) > #define UDMA_FLAG_TDTYPE BIT(2) > +#define UDMA_FLAG_BURST_SIZE BIT(3) > +#define UDMA_FLAGS_J7_CLASS (UDMA_FLAG_PDMA_ACC32 | \ > + UDMA_FLAG_PDMA_BURST | \ > + UDMA_FLAG_TDTYPE | \ > + UDMA_FLAG_BURST_SIZE) > > struct udma_match_data { > enum k3_dma_type type; > @@ -128,6 +133,7 @@ struct udma_match_data { > bool enable_memcpy_support; > u32 flags; > u32 statictr_z_mask; > + u8 burst_size[3]; > }; > > struct udma_soc_data { > @@ -436,6 +442,18 @@ static void k3_configure_chan_coherency(struct dma_chan *chan, u32 asel) > } > } > > +static u8 udma_get_chan_tpl_index(struct udma_tpl 
*tpl_map, int chan_id) > +{ > + int i; > + > + for (i = 0; i < tpl_map->levels; i++) { > + if (chan_id >= tpl_map->start_idx[i]) > + return i; > + } Braces seem not required > + > + return 0; > +} > + > static void udma_reset_uchan(struct udma_chan *uc) > { > memset(&uc->config, 0, sizeof(uc->config)); > @@ -1811,6 +1829,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) > const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; > struct udma_tchan *tchan = uc->tchan; > struct udma_rchan *rchan = uc->rchan; > + u8 burst_size = 0; > int ret = 0; > > /* Non synchronized - mem to mem type of transfer */ > @@ -1818,6 +1837,12 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) > struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; > struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 }; > > + if (ud->match_data->flags & UDMA_FLAG_BURST_SIZE) { > + u8 tpl = udma_get_chan_tpl_index(&ud->tchan_tpl, tchan->id); Can we define variable at function start please > + > + burst_size = ud->match_data->burst_size[tpl]; > + } > + > req_tx.valid_params = TISCI_UDMA_TCHAN_VALID_PARAMS; > req_tx.nav_id = tisci_rm->tisci_dev_id; > req_tx.index = tchan->id; > @@ -1825,6 +1850,10 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) > req_tx.tx_fetch_size = sizeof(struct cppi5_desc_hdr_t) >> 2; > req_tx.txcq_qnum = tc_ring; > req_tx.tx_atype = ud->atype; > + if (burst_size) { > + req_tx.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID; > + req_tx.tx_burst_size = burst_size; > + } > > ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx); > if (ret) { > @@ -1839,6 +1868,10 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) > req_rx.rxcq_qnum = tc_ring; > req_rx.rx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR; > req_rx.rx_atype = ud->atype; > + if (burst_size) { > + req_rx.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID; > + req_rx.rx_burst_size = burst_size; > + } > > ret = 
tisci_ops->rx_ch_cfg(tisci_rm->tisci, &req_rx); > if (ret) > @@ -1854,12 +1887,23 @@ static int bcdma_tisci_m2m_channel_config(struct udma_chan *uc) > const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; > struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; > struct udma_bchan *bchan = uc->bchan; > + u8 burst_size = 0; > int ret = 0; > > + if (ud->match_data->flags & UDMA_FLAG_BURST_SIZE) { > + u8 tpl = udma_get_chan_tpl_index(&ud->bchan_tpl, bchan->id); here as well > + > + burst_size = ud->match_data->burst_size[tpl]; > + } > + > req_tx.valid_params = TISCI_BCDMA_BCHAN_VALID_PARAMS; > req_tx.nav_id = tisci_rm->tisci_dev_id; > req_tx.extended_ch_type = TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_BCHAN; > req_tx.index = bchan->id; > + if (burst_size) { > + req_tx.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID; > + req_tx.tx_burst_size = burst_size; > + } > > ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx); > if (ret) > @@ -4167,6 +4211,11 @@ static struct udma_match_data am654_main_data = { > .psil_base = 0x1000, > .enable_memcpy_support = true, > .statictr_z_mask = GENMASK(11, 0), > + .burst_size = { > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* H Channels */ > + 0, /* No UH Channels */ > + }, > }; > > static struct udma_match_data am654_mcu_data = { > @@ -4174,38 +4223,63 @@ static struct udma_match_data am654_mcu_data = { > .psil_base = 0x6000, > .enable_memcpy_support = false, > .statictr_z_mask = GENMASK(11, 0), > + .burst_size = { > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* H Channels */ > + 0, /* No UH Channels */ > + }, > }; > > static struct udma_match_data j721e_main_data = { > .type = DMA_TYPE_UDMA, > .psil_base = 0x1000, > .enable_memcpy_support = true, > - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, > + .flags = UDMA_FLAGS_J7_CLASS, > .statictr_z_mask = 
GENMASK(23, 0), > + .burst_size = { > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES, /* H Channels */ > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES, /* UH Channels */ > + }, > }; > > static struct udma_match_data j721e_mcu_data = { > .type = DMA_TYPE_UDMA, > .psil_base = 0x6000, > .enable_memcpy_support = false, /* MEM_TO_MEM is slow via MCU UDMA */ > - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, > + .flags = UDMA_FLAGS_J7_CLASS, > .statictr_z_mask = GENMASK(23, 0), > + .burst_size = { > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES, /* H Channels */ > + 0, /* No UH Channels */ > + }, > }; > > static struct udma_match_data am64_bcdma_data = { > .type = DMA_TYPE_BCDMA, > .psil_base = 0x2000, /* for tchan and rchan, not applicable to bchan */ > .enable_memcpy_support = true, /* Supported via bchan */ > - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, > + .flags = UDMA_FLAGS_J7_CLASS, > .statictr_z_mask = GENMASK(23, 0), > + .burst_size = { > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ > + 0, /* No H Channels */ > + 0, /* No UH Channels */ > + }, > }; > > static struct udma_match_data am64_pktdma_data = { > .type = DMA_TYPE_PKTDMA, > .psil_base = 0x1000, > .enable_memcpy_support = false, /* PKTDMA does not support MEM_TO_MEM */ > - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, > + .flags = UDMA_FLAGS_J7_CLASS, > .statictr_z_mask = GENMASK(23, 0), > + .burst_size = { > + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ > + 0, /* No H Channels */ > + 0, /* No UH Channels */ > + }, > }; > > static const struct of_device_id udma_of_match[] = { > @@ -5045,6 +5119,35 @@ static void udma_dbg_summary_show(struct seq_file *s, > } > #endif /* CONFIG_DEBUG_FS */ > > +static enum dmaengine_alignment udma_get_copy_align(struct 
udma_dev *ud) > +{ > + const struct udma_match_data *match_data = ud->match_data; > + u8 tpl; > + > + if (!match_data->enable_memcpy_support) > + return DMAENGINE_ALIGN_8_BYTES; > + > + /* Get the highest TPL level the device supports for memcpy */ > + if (ud->bchan_cnt) { > + tpl = udma_get_chan_tpl_index(&ud->bchan_tpl, 0); > + } else if (ud->tchan_cnt) { > + tpl = udma_get_chan_tpl_index(&ud->tchan_tpl, 0); > + } else { > + return DMAENGINE_ALIGN_8_BYTES; > + } Braces seem not required > + > + switch (match_data->burst_size[tpl]) { > + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES: > + return DMAENGINE_ALIGN_256_BYTES; > + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES: > + return DMAENGINE_ALIGN_128_BYTES; > + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES: > + fallthrough; > + default: > + return DMAENGINE_ALIGN_64_BYTES; ah, we are supposed to have case at same indent as switch, pls run checkpatch to have these flagged off
Hi Vinod, On 1/12/21 12:16 PM, Vinod Koul wrote: > On 14-12-20, 10:13, Peter Ujfalusi wrote: >> The UDMA and BCDMA can provide higher throughput if the burst_size of the >> channel is changed from it's default (which is 64 bytes) for Ultra-high >> and high capacity channels. >> >> This performance benefit is even more visible when the buffers are aligned >> with the burst_size configuration. >> >> The am654 does not have a way to change the burst size, but it is using >> 64 bytes burst, so increasing the copy_align from 8 bytes to 64 (and >> clients taking that into account) can increase the throughput as well. >> >> Numbers gathered on j721e: >> echo 8000000 > /sys/module/dmatest/parameters/test_buf_size >> echo 2000 > /sys/module/dmatest/parameters/timeout >> echo 50 > /sys/module/dmatest/parameters/iterations >> echo 1 > /sys/module/dmatest/parameters/max_channels >> >> Prior this patch: ~1.3 GB/s >> After this patch: ~1.8 GB/s >> with 1 byte alignment: ~1.7 GB/s >> >> Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com> >> --- >> drivers/dma/ti/k3-udma.c | 115 +++++++++++++++++++++++++++++++++++++-- >> 1 file changed, 110 insertions(+), 5 deletions(-) >> >> diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c >> index 87157cbae1b8..54e4ccb1b37e 100644 >> --- a/drivers/dma/ti/k3-udma.c >> +++ b/drivers/dma/ti/k3-udma.c >> @@ -121,6 +121,11 @@ struct udma_oes_offsets { >> #define UDMA_FLAG_PDMA_ACC32 BIT(0) >> #define UDMA_FLAG_PDMA_BURST BIT(1) >> #define UDMA_FLAG_TDTYPE BIT(2) >> +#define UDMA_FLAG_BURST_SIZE BIT(3) >> +#define UDMA_FLAGS_J7_CLASS (UDMA_FLAG_PDMA_ACC32 | \ >> + UDMA_FLAG_PDMA_BURST | \ >> + UDMA_FLAG_TDTYPE | \ >> + UDMA_FLAG_BURST_SIZE) >> >> struct udma_match_data { >> enum k3_dma_type type; >> @@ -128,6 +133,7 @@ struct udma_match_data { >> bool enable_memcpy_support; >> u32 flags; >> u32 statictr_z_mask; >> + u8 burst_size[3]; >> }; >> >> struct udma_soc_data { >> @@ -436,6 +442,18 @@ static void 
k3_configure_chan_coherency(struct dma_chan *chan, u32 asel) >> } >> } >> >> +static u8 udma_get_chan_tpl_index(struct udma_tpl *tpl_map, int chan_id) >> +{ >> + int i; >> + >> + for (i = 0; i < tpl_map->levels; i++) { >> + if (chan_id >= tpl_map->start_idx[i]) >> + return i; >> + } > > Braces seem not required True, they are not strictly needed but I prefer to have them when I have any condition in the loop. >> + >> + return 0; >> +} >> + >> static void udma_reset_uchan(struct udma_chan *uc) >> { >> memset(&uc->config, 0, sizeof(uc->config)); >> @@ -1811,6 +1829,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) >> const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; >> struct udma_tchan *tchan = uc->tchan; >> struct udma_rchan *rchan = uc->rchan; >> + u8 burst_size = 0; >> int ret = 0; >> >> /* Non synchronized - mem to mem type of transfer */ >> @@ -1818,6 +1837,12 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) >> struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; >> struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 }; >> >> + if (ud->match_data->flags & UDMA_FLAG_BURST_SIZE) { >> + u8 tpl = udma_get_chan_tpl_index(&ud->tchan_tpl, tchan->id); > > Can we define variable at function start please The 'tpl' is only used within this if branch, it looks a bit cleaner imho, but if you insist, I can move the definition. ... >> +static enum dmaengine_alignment udma_get_copy_align(struct udma_dev *ud) >> +{ >> + const struct udma_match_data *match_data = ud->match_data; >> + u8 tpl; >> + >> + if (!match_data->enable_memcpy_support) >> + return DMAENGINE_ALIGN_8_BYTES; >> + >> + /* Get the highest TPL level the device supports for memcpy */ >> + if (ud->bchan_cnt) { >> + tpl = udma_get_chan_tpl_index(&ud->bchan_tpl, 0); >> + } else if (ud->tchan_cnt) { >> + tpl = udma_get_chan_tpl_index(&ud->tchan_tpl, 0); >> + } else { >> + return DMAENGINE_ALIGN_8_BYTES; >> + } > > Braces seem not required Very true. 
> >> + >> + switch (match_data->burst_size[tpl]) { >> + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES: >> + return DMAENGINE_ALIGN_256_BYTES; >> + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES: >> + return DMAENGINE_ALIGN_128_BYTES; >> + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES: >> + fallthrough; >> + default: >> + return DMAENGINE_ALIGN_64_BYTES; > > ah, we are supposed to have case at same indent as switch, pls run > checkpatch to have these flagged off Yes, they should be. The other me did a sloppy job for sure, this should have been screaming even without checkpatch... This has been done in a rush during the last days to close on the backlog item which got the most votes.
On 13-01-21, 09:39, Péter Ujfalusi wrote: > Hi Vinod, > > On 1/12/21 12:16 PM, Vinod Koul wrote: > > On 14-12-20, 10:13, Peter Ujfalusi wrote: > >> The UDMA and BCDMA can provide higher throughput if the burst_size of the > >> channel is changed from it's default (which is 64 bytes) for Ultra-high > >> and high capacity channels. > >> > >> This performance benefit is even more visible when the buffers are aligned > >> with the burst_size configuration. > >> > >> The am654 does not have a way to change the burst size, but it is using > >> 64 bytes burst, so increasing the copy_align from 8 bytes to 64 (and > >> clients taking that into account) can increase the throughput as well. > >> > >> Numbers gathered on j721e: > >> echo 8000000 > /sys/module/dmatest/parameters/test_buf_size > >> echo 2000 > /sys/module/dmatest/parameters/timeout > >> echo 50 > /sys/module/dmatest/parameters/iterations > >> echo 1 > /sys/module/dmatest/parameters/max_channels > >> > >> Prior this patch: ~1.3 GB/s > >> After this patch: ~1.8 GB/s > >> with 1 byte alignment: ~1.7 GB/s > >> > >> Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com> > >> --- > >> drivers/dma/ti/k3-udma.c | 115 +++++++++++++++++++++++++++++++++++++-- > >> 1 file changed, 110 insertions(+), 5 deletions(-) > >> > >> diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c > >> index 87157cbae1b8..54e4ccb1b37e 100644 > >> --- a/drivers/dma/ti/k3-udma.c > >> +++ b/drivers/dma/ti/k3-udma.c > >> @@ -121,6 +121,11 @@ struct udma_oes_offsets { > >> #define UDMA_FLAG_PDMA_ACC32 BIT(0) > >> #define UDMA_FLAG_PDMA_BURST BIT(1) > >> #define UDMA_FLAG_TDTYPE BIT(2) > >> +#define UDMA_FLAG_BURST_SIZE BIT(3) > >> +#define UDMA_FLAGS_J7_CLASS (UDMA_FLAG_PDMA_ACC32 | \ > >> + UDMA_FLAG_PDMA_BURST | \ > >> + UDMA_FLAG_TDTYPE | \ > >> + UDMA_FLAG_BURST_SIZE) > >> > >> struct udma_match_data { > >> enum k3_dma_type type; > >> @@ -128,6 +133,7 @@ struct udma_match_data { > >> bool enable_memcpy_support; > >> u32 flags; > >> u32 
statictr_z_mask; > >> + u8 burst_size[3]; > >> }; > >> > >> struct udma_soc_data { > >> @@ -436,6 +442,18 @@ static void k3_configure_chan_coherency(struct dma_chan *chan, u32 asel) > >> } > >> } > >> > >> +static u8 udma_get_chan_tpl_index(struct udma_tpl *tpl_map, int chan_id) > >> +{ > >> + int i; > >> + > >> + for (i = 0; i < tpl_map->levels; i++) { > >> + if (chan_id >= tpl_map->start_idx[i]) > >> + return i; > >> + } > > > > Braces seem not required > > True, they are not strictly needed but I prefer to have them when I have > any condition in the loop. ok > >> static void udma_reset_uchan(struct udma_chan *uc) > >> { > >> memset(&uc->config, 0, sizeof(uc->config)); > >> @@ -1811,6 +1829,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) > >> const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; > >> struct udma_tchan *tchan = uc->tchan; > >> struct udma_rchan *rchan = uc->rchan; > >> + u8 burst_size = 0; > >> int ret = 0; > >> > >> /* Non synchronized - mem to mem type of transfer */ > >> @@ -1818,6 +1837,12 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) > >> struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; > >> struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 }; > >> > >> + if (ud->match_data->flags & UDMA_FLAG_BURST_SIZE) { > >> + u8 tpl = udma_get_chan_tpl_index(&ud->tchan_tpl, tchan->id); > > > > Can we define variable at function start please > > The 'tpl' is only used within this if branch, it looks a bit cleaner > imho, but if you insist, I can move the definition. 
yeah lets be consistent and keep them at the start of the function please > >> + switch (match_data->burst_size[tpl]) { > >> + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES: > >> + return DMAENGINE_ALIGN_256_BYTES; > >> + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES: > >> + return DMAENGINE_ALIGN_128_BYTES; > >> + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES: > >> + fallthrough; > >> + default: > >> + return DMAENGINE_ALIGN_64_BYTES; > > > > ah, we are supposed to have case at same indent as switch, pls run > > checkpatch to have these flagged off > > Yes, they should be. > > The other me did a sloppy job for sure, this should have been screaming > even without checkpatch... > This has been done in a rush during the last days to close on the > backlog item which got the most votes. no worries, that is where reviews help :)
diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 87157cbae1b8..54e4ccb1b37e 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -121,6 +121,11 @@ struct udma_oes_offsets { #define UDMA_FLAG_PDMA_ACC32 BIT(0) #define UDMA_FLAG_PDMA_BURST BIT(1) #define UDMA_FLAG_TDTYPE BIT(2) +#define UDMA_FLAG_BURST_SIZE BIT(3) +#define UDMA_FLAGS_J7_CLASS (UDMA_FLAG_PDMA_ACC32 | \ + UDMA_FLAG_PDMA_BURST | \ + UDMA_FLAG_TDTYPE | \ + UDMA_FLAG_BURST_SIZE) struct udma_match_data { enum k3_dma_type type; @@ -128,6 +133,7 @@ struct udma_match_data { bool enable_memcpy_support; u32 flags; u32 statictr_z_mask; + u8 burst_size[3]; }; struct udma_soc_data { @@ -436,6 +442,18 @@ static void k3_configure_chan_coherency(struct dma_chan *chan, u32 asel) } } +static u8 udma_get_chan_tpl_index(struct udma_tpl *tpl_map, int chan_id) +{ + int i; + + for (i = 0; i < tpl_map->levels; i++) { + if (chan_id >= tpl_map->start_idx[i]) + return i; + } + + return 0; +} + static void udma_reset_uchan(struct udma_chan *uc) { memset(&uc->config, 0, sizeof(uc->config)); @@ -1811,6 +1829,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; struct udma_tchan *tchan = uc->tchan; struct udma_rchan *rchan = uc->rchan; + u8 burst_size = 0; int ret = 0; /* Non synchronized - mem to mem type of transfer */ @@ -1818,6 +1837,12 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 }; + if (ud->match_data->flags & UDMA_FLAG_BURST_SIZE) { + u8 tpl = udma_get_chan_tpl_index(&ud->tchan_tpl, tchan->id); + + burst_size = ud->match_data->burst_size[tpl]; + } + req_tx.valid_params = TISCI_UDMA_TCHAN_VALID_PARAMS; req_tx.nav_id = tisci_rm->tisci_dev_id; req_tx.index = tchan->id; @@ -1825,6 +1850,10 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) req_tx.tx_fetch_size = 
sizeof(struct cppi5_desc_hdr_t) >> 2; req_tx.txcq_qnum = tc_ring; req_tx.tx_atype = ud->atype; + if (burst_size) { + req_tx.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID; + req_tx.tx_burst_size = burst_size; + } ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx); if (ret) { @@ -1839,6 +1868,10 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) req_rx.rxcq_qnum = tc_ring; req_rx.rx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR; req_rx.rx_atype = ud->atype; + if (burst_size) { + req_rx.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID; + req_rx.rx_burst_size = burst_size; + } ret = tisci_ops->rx_ch_cfg(tisci_rm->tisci, &req_rx); if (ret) @@ -1854,12 +1887,23 @@ static int bcdma_tisci_m2m_channel_config(struct udma_chan *uc) const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; struct udma_bchan *bchan = uc->bchan; + u8 burst_size = 0; int ret = 0; + if (ud->match_data->flags & UDMA_FLAG_BURST_SIZE) { + u8 tpl = udma_get_chan_tpl_index(&ud->bchan_tpl, bchan->id); + + burst_size = ud->match_data->burst_size[tpl]; + } + req_tx.valid_params = TISCI_BCDMA_BCHAN_VALID_PARAMS; req_tx.nav_id = tisci_rm->tisci_dev_id; req_tx.extended_ch_type = TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_BCHAN; req_tx.index = bchan->id; + if (burst_size) { + req_tx.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID; + req_tx.tx_burst_size = burst_size; + } ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx); if (ret) @@ -4167,6 +4211,11 @@ static struct udma_match_data am654_main_data = { .psil_base = 0x1000, .enable_memcpy_support = true, .statictr_z_mask = GENMASK(11, 0), + .burst_size = { + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* H Channels */ + 0, /* No UH Channels */ + }, }; static struct udma_match_data am654_mcu_data = { @@ -4174,38 +4223,63 @@ static struct udma_match_data am654_mcu_data = { 
.psil_base = 0x6000, .enable_memcpy_support = false, .statictr_z_mask = GENMASK(11, 0), + .burst_size = { + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* H Channels */ + 0, /* No UH Channels */ + }, }; static struct udma_match_data j721e_main_data = { .type = DMA_TYPE_UDMA, .psil_base = 0x1000, .enable_memcpy_support = true, - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, + .flags = UDMA_FLAGS_J7_CLASS, .statictr_z_mask = GENMASK(23, 0), + .burst_size = { + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES, /* H Channels */ + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES, /* UH Channels */ + }, }; static struct udma_match_data j721e_mcu_data = { .type = DMA_TYPE_UDMA, .psil_base = 0x6000, .enable_memcpy_support = false, /* MEM_TO_MEM is slow via MCU UDMA */ - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, + .flags = UDMA_FLAGS_J7_CLASS, .statictr_z_mask = GENMASK(23, 0), + .burst_size = { + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES, /* H Channels */ + 0, /* No UH Channels */ + }, }; static struct udma_match_data am64_bcdma_data = { .type = DMA_TYPE_BCDMA, .psil_base = 0x2000, /* for tchan and rchan, not applicable to bchan */ .enable_memcpy_support = true, /* Supported via bchan */ - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, + .flags = UDMA_FLAGS_J7_CLASS, .statictr_z_mask = GENMASK(23, 0), + .burst_size = { + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ + 0, /* No H Channels */ + 0, /* No UH Channels */ + }, }; static struct udma_match_data am64_pktdma_data = { .type = DMA_TYPE_PKTDMA, .psil_base = 0x1000, .enable_memcpy_support = false, /* PKTDMA does not support MEM_TO_MEM */ - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, + .flags = 
UDMA_FLAGS_J7_CLASS, .statictr_z_mask = GENMASK(23, 0), + .burst_size = { + TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES, /* Normal Channels */ + 0, /* No H Channels */ + 0, /* No UH Channels */ + }, }; static const struct of_device_id udma_of_match[] = { @@ -5045,6 +5119,35 @@ static void udma_dbg_summary_show(struct seq_file *s, } #endif /* CONFIG_DEBUG_FS */ +static enum dmaengine_alignment udma_get_copy_align(struct udma_dev *ud) +{ + const struct udma_match_data *match_data = ud->match_data; + u8 tpl; + + if (!match_data->enable_memcpy_support) + return DMAENGINE_ALIGN_8_BYTES; + + /* Get the highest TPL level the device supports for memcpy */ + if (ud->bchan_cnt) { + tpl = udma_get_chan_tpl_index(&ud->bchan_tpl, 0); + } else if (ud->tchan_cnt) { + tpl = udma_get_chan_tpl_index(&ud->tchan_tpl, 0); + } else { + return DMAENGINE_ALIGN_8_BYTES; + } + + switch (match_data->burst_size[tpl]) { + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES: + return DMAENGINE_ALIGN_256_BYTES; + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES: + return DMAENGINE_ALIGN_128_BYTES; + case TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES: + fallthrough; + default: + return DMAENGINE_ALIGN_64_BYTES; + } +} + #define TI_UDMAC_BUSWIDTHS (BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \ BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | \ BIT(DMA_SLAVE_BUSWIDTH_3_BYTES) | \ @@ -5201,7 +5304,6 @@ static int udma_probe(struct platform_device *pdev) ud->ddev.dst_addr_widths = TI_UDMAC_BUSWIDTHS; ud->ddev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); ud->ddev.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST; - ud->ddev.copy_align = DMAENGINE_ALIGN_8_BYTES; ud->ddev.desc_metadata_modes = DESC_METADATA_CLIENT | DESC_METADATA_ENGINE; if (ud->match_data->enable_memcpy_support && @@ -5283,6 +5385,9 @@ static int udma_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&uc->tx_drain.work, udma_check_tx_completion); } + /* Configure the copy_align to the maximum burst size the device supports */ + ud->ddev.copy_align = 
udma_get_copy_align(ud); + ret = dma_async_device_register(&ud->ddev); if (ret) { dev_err(dev, "failed to register slave DMA engine: %d\n", ret);
The UDMA and BCDMA can provide higher throughput if the burst_size of the channel is changed from its default (which is 64 bytes) for Ultra-high and high capacity channels. This performance benefit is even more visible when the buffers are aligned with the burst_size configuration. The am654 does not have a way to change the burst size, but it is using 64 bytes burst, so increasing the copy_align from 8 bytes to 64 (and clients taking that into account) can increase the throughput as well. Numbers gathered on j721e: echo 8000000 > /sys/module/dmatest/parameters/test_buf_size echo 2000 > /sys/module/dmatest/parameters/timeout echo 50 > /sys/module/dmatest/parameters/iterations echo 1 > /sys/module/dmatest/parameters/max_channels Prior to this patch: ~1.3 GB/s After this patch: ~1.8 GB/s with 1 byte alignment: ~1.7 GB/s Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com> --- drivers/dma/ti/k3-udma.c | 115 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 5 deletions(-)