diff mbox series

[net-next,v3,4/4] net: ocelot: add FDMA support

Message ID 20211126172739.329098-5-clement.leger@bootlin.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series Add FDMA support on ocelot switch driver | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/apply fail Patch does not apply to net-next

Commit Message

Clément Léger Nov. 26, 2021, 5:27 p.m. UTC
Ethernet frames can be extracted or injected autonomously to or from
the device’s DDR3/DDR3L memory and/or PCIe memory space. Linked list
data structures in memory are used for injecting or extracting Ethernet
frames. The FDMA generates interrupts when frame extraction or
injection is done and when the linked lists need updating.

The FDMA is shared between all the ethernet ports of the switch and
uses a linked list of descriptors (DCB) to inject and extract packets.
Before adding descriptors, the FDMA channels must be stopped. It would
be inefficient to do that each time a descriptor would be added so the
channels are restarted only once they stopped.

Both channels uses ring-like structure to feed the DCBs to the FDMA.
head and tail are never touched by hardware and are completely handled
by the driver.

Co-developed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Clément Léger <clement.leger@bootlin.com>
---
 drivers/net/ethernet/mscc/Makefile         |   1 +
 drivers/net/ethernet/mscc/ocelot.c         |  43 +-
 drivers/net/ethernet/mscc/ocelot.h         |   1 +
 drivers/net/ethernet/mscc/ocelot_fdma.c    | 713 +++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_fdma.h    |  96 +++
 drivers/net/ethernet/mscc/ocelot_net.c     |  18 +-
 drivers/net/ethernet/mscc/ocelot_vsc7514.c |  13 +
 include/soc/mscc/ocelot.h                  |   4 +
 8 files changed, 869 insertions(+), 20 deletions(-)
 create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.c
 create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.h

Comments

kernel test robot Nov. 26, 2021, 8:04 p.m. UTC | #1
Hi "Clément,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on robh/for-next]
[also build test WARNING on net/master linus/master v5.16-rc2 next-20211126]
[cannot apply to net-next/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Cl-ment-L-ger/Add-FDMA-support-on-ocelot-switch-driver/20211127-013140
base:   https://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git for-next
config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20211127/202111270323.iOXcpsFC-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/07c95fe9105be293d2b7edf193e5fd139c70c194
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Cl-ment-L-ger/Add-FDMA-support-on-ocelot-switch-driver/20211127-013140
        git checkout 07c95fe9105be293d2b7edf193e5fd139c70c194
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=m68k SHELL=/bin/bash drivers/net/ethernet/mscc/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/net/ethernet/mscc/ocelot_fdma.c: In function 'ocelot_fdma_tx_cleanup':
>> drivers/net/ethernet/mscc/ocelot_fdma.c:306:22: warning: variable 'tmp_head' set but not used [-Wunused-but-set-variable]
     306 |         unsigned int tmp_head, new_null_llp_idx;
         |                      ^~~~~~~~


vim +/tmp_head +306 drivers/net/ethernet/mscc/ocelot_fdma.c

   302	
   303	static void ocelot_fdma_tx_cleanup(struct ocelot_fdma *fdma, int budget)
   304	{
   305		struct ocelot_fdma_ring *ring = &fdma->inj;
 > 306		unsigned int tmp_head, new_null_llp_idx;
   307		struct ocelot_fdma_dcb *dcb;
   308		bool end_of_list = false;
   309		int ret;
   310	
   311		spin_lock_bh(&fdma->xmit_lock);
   312	
   313		/* Purge the TX packets that have been sent up to the NULL llp or the
   314		 * end of done list.
   315		 */
   316		while (!ocelot_fdma_ring_empty(&fdma->inj)) {
   317			dcb = &ring->dcbs[ring->head];
   318			if (!(dcb->hw->stat & MSCC_FDMA_DCB_STAT_PD))
   319				break;
   320	
   321			tmp_head = ring->head;
   322			ring->head = ocelot_fdma_idx_incr(ring->head);
   323	
   324			dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
   325					 DMA_TO_DEVICE);
   326			napi_consume_skb(dcb->skb, budget);
   327	
   328			/* If we hit the NULL LLP, stop, we might need to reload FDMA */
   329			if (dcb->hw->llp == 0) {
   330				end_of_list = true;
   331				break;
   332			}
   333		}
   334	
   335		/* If there is still some DCBs to be processed by the FDMA or if the
   336		 * pending list is empty, there is no need to restart the FDMA.
   337		 */
   338		if (!end_of_list || ocelot_fdma_ring_empty(&fdma->inj))
   339			goto out_unlock;
   340	
   341		ret = ocelot_fdma_wait_chan_safe(fdma, MSCC_FDMA_INJ_CHAN);
   342		if (ret) {
   343			dev_warn(fdma->dev, "Failed to wait for TX channel to stop\n");
   344			goto out_unlock;
   345		}
   346	
   347		/* Set NULL LLP */
   348		new_null_llp_idx = ocelot_fdma_idx_decr(ring->tail);
   349		dcb = &ring->dcbs[new_null_llp_idx];
   350		dcb->hw->llp = 0;
   351	
   352		dcb = &ring->dcbs[ring->head];
   353		ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);
   354	
   355	out_unlock:
   356		spin_unlock_bh(&fdma->xmit_lock);
   357	}
   358	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Vladimir Oltean Nov. 27, 2021, 2:58 p.m. UTC | #2
On Fri, Nov 26, 2021 at 06:27:39PM +0100, Clément Léger wrote:
> Ethernet frames can be extracted or injected autonomously to or from
> the device’s DDR3/DDR3L memory and/or PCIe memory space. Linked list
> data structures in memory are used for injecting or extracting Ethernet
> frames. The FDMA generates interrupts when frame extraction or
> injection is done and when the linked lists need updating.
> 
> The FDMA is shared between all the ethernet ports of the switch and
> uses a linked list of descriptors (DCB) to inject and extract packets.
> Before adding descriptors, the FDMA channels must be stopped. It would
> be inefficient to do that each time a descriptor would be added so the
> channels are restarted only once they stopped.
> 
> Both channels uses ring-like structure to feed the DCBs to the FDMA.
> head and tail are never touched by hardware and are completely handled
> by the driver.
> 
> Co-developed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> Signed-off-by: Clément Léger <clement.leger@bootlin.com>
> ---

I need to ask, was there any change in performance, in one direction or
the other, between the ring and list based implementations?

>  drivers/net/ethernet/mscc/Makefile         |   1 +
>  drivers/net/ethernet/mscc/ocelot.c         |  43 +-
>  drivers/net/ethernet/mscc/ocelot.h         |   1 +
>  drivers/net/ethernet/mscc/ocelot_fdma.c    | 713 +++++++++++++++++++++
>  drivers/net/ethernet/mscc/ocelot_fdma.h    |  96 +++
>  drivers/net/ethernet/mscc/ocelot_net.c     |  18 +-
>  drivers/net/ethernet/mscc/ocelot_vsc7514.c |  13 +
>  include/soc/mscc/ocelot.h                  |   4 +
>  8 files changed, 869 insertions(+), 20 deletions(-)
>  create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.c
>  create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.h
> 
> diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile
> index 722c27694b21..d76a9b78b6ca 100644
> --- a/drivers/net/ethernet/mscc/Makefile
> +++ b/drivers/net/ethernet/mscc/Makefile
> @@ -11,5 +11,6 @@ mscc_ocelot_switch_lib-y := \
>  mscc_ocelot_switch_lib-$(CONFIG_BRIDGE_MRP) += ocelot_mrp.o
>  obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot.o
>  mscc_ocelot-y := \
> +	ocelot_fdma.o \
>  	ocelot_vsc7514.o \
>  	ocelot_net.o
> diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
> index 1f7c9ff18ac5..4b2460d232c2 100644
> --- a/drivers/net/ethernet/mscc/ocelot.c
> +++ b/drivers/net/ethernet/mscc/ocelot.c
> @@ -966,14 +966,37 @@ static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh)
>  	return 0;
>  }
>  
> -int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
> +void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
> +			     u64 timestamp)
>  {
>  	struct skb_shared_hwtstamps *shhwtstamps;
>  	u64 tod_in_ns, full_ts_in_ns;
> +	struct timespec64 ts;
> +
> +	if (!ocelot->ptp)
> +		return;
> +
> +	ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
> +
> +	tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
> +	if ((tod_in_ns & 0xffffffff) < timestamp)
> +		full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
> +				timestamp;
> +	else
> +		full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
> +				timestamp;
> +
> +	shhwtstamps = skb_hwtstamps(skb);
> +	memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
> +	shhwtstamps->hwtstamp = full_ts_in_ns;
> +}
> +EXPORT_SYMBOL(ocelot_ptp_rx_timestamp);

This split can very well be a separate patch, it's distracting.

> +
> +int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
> +{
>  	u64 timestamp, src_port, len;
>  	u32 xfh[OCELOT_TAG_LEN / 4];
>  	struct net_device *dev;
> -	struct timespec64 ts;
>  	struct sk_buff *skb;
>  	int sz, buf_len;
>  	u32 val, *buf;
> @@ -1029,21 +1052,7 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
>  		*buf = val;
>  	}
>  
> -	if (ocelot->ptp) {
> -		ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
> -
> -		tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
> -		if ((tod_in_ns & 0xffffffff) < timestamp)
> -			full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
> -					timestamp;
> -		else
> -			full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
> -					timestamp;
> -
> -		shhwtstamps = skb_hwtstamps(skb);
> -		memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
> -		shhwtstamps->hwtstamp = full_ts_in_ns;
> -	}
> +	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);
>  
>  	/* Everything we see on an interface that is in the HW bridge
>  	 * has already been forwarded.
> diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
> index e43da09b8f91..f1a7b403e221 100644
> --- a/drivers/net/ethernet/mscc/ocelot.h
> +++ b/drivers/net/ethernet/mscc/ocelot.h
> @@ -9,6 +9,7 @@
>  #define _MSCC_OCELOT_H_
>  
>  #include <linux/bitops.h>
> +#include <linux/dsa/ocelot.h>
>  #include <linux/etherdevice.h>
>  #include <linux/if_vlan.h>
>  #include <linux/net_tstamp.h>
> diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c
> new file mode 100644
> index 000000000000..e42c2c3ad273
> --- /dev/null
> +++ b/drivers/net/ethernet/mscc/ocelot_fdma.c
> @@ -0,0 +1,713 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
> +/*
> + * Microsemi SoCs FDMA driver
> + *
> + * Copyright (c) 2021 Microchip
> + */
> +
> +#include <linux/bitops.h>
> +#include <linux/dmapool.h>
> +#include <linux/dsa/ocelot.h>
> +#include <linux/netdevice.h>
> +#include <linux/of_platform.h>
> +#include <linux/skbuff.h>
> +
> +#include "ocelot_fdma.h"
> +#include "ocelot_qs.h"
> +
> +#define MSCC_FDMA_DCB_LLP(x)			((x) * 4 + 0x0)
> +#define MSCC_FDMA_DCB_LLP_PREV(x)		((x) * 4 + 0xA0)
> +
> +#define MSCC_FDMA_DCB_STAT_BLOCKO(x)		(((x) << 20) & GENMASK(31, 20))
> +#define MSCC_FDMA_DCB_STAT_BLOCKO_M		GENMASK(31, 20)
> +#define MSCC_FDMA_DCB_STAT_BLOCKO_X(x)		(((x) & GENMASK(31, 20)) >> 20)
> +#define MSCC_FDMA_DCB_STAT_PD			BIT(19)
> +#define MSCC_FDMA_DCB_STAT_ABORT		BIT(18)
> +#define MSCC_FDMA_DCB_STAT_EOF			BIT(17)
> +#define MSCC_FDMA_DCB_STAT_SOF			BIT(16)
> +#define MSCC_FDMA_DCB_STAT_BLOCKL_M		GENMASK(15, 0)
> +#define MSCC_FDMA_DCB_STAT_BLOCKL(x)		((x) & GENMASK(15, 0))
> +
> +#define MSCC_FDMA_CH_SAFE			0xcc
> +
> +#define MSCC_FDMA_CH_ACTIVATE			0xd0
> +
> +#define MSCC_FDMA_CH_DISABLE			0xd4
> +
> +#define MSCC_FDMA_EVT_ERR			0x164
> +
> +#define MSCC_FDMA_EVT_ERR_CODE			0x168
> +
> +#define MSCC_FDMA_INTR_LLP			0x16c
> +
> +#define MSCC_FDMA_INTR_LLP_ENA			0x170
> +
> +#define MSCC_FDMA_INTR_FRM			0x174
> +
> +#define MSCC_FDMA_INTR_FRM_ENA			0x178
> +
> +#define MSCC_FDMA_INTR_ENA			0x184
> +
> +#define MSCC_FDMA_INTR_IDENT			0x188
> +
> +#define MSCC_FDMA_INJ_CHAN			2
> +#define MSCC_FDMA_XTR_CHAN			0
> +
> +#define OCELOT_FDMA_RX_MTU			ETH_DATA_LEN
> +#define OCELOT_FDMA_WEIGHT			32

I guess you've reduced to half of NAPI_POLL_WEIGHT because the NET_RX
softirq is consuming too much CPU time with the default value? I don't
know if this is the productive thing to do with a very slow CPU that is
swamped with traffic, since you're practically leaving yourself exposed
to more interrupts, can somebody please chime in?

> +#define OCELOT_FDMA_RX_REFILL_COUNT		(OCELOT_FDMA_MAX_DCB / 2)

Unused. I suppose you wanted to refill more than once per NAPI poll
cycle (as you currently do in ocelot_fdma_rx_restart) but you didn't get
around to it? I think you should still do that, don't leave the RX ring
running dry.

> +
> +#define OCELOT_FDMA_CH_SAFE_TIMEOUT_MS		100
> +
> +#define OCELOT_FDMA_RX_EXTRA_SIZE \
> +				(OCELOT_TAG_LEN + ETH_FCS_LEN + ETH_HLEN)

Could all these macros belong to ocelot_fdma.h?

> +
> +static int ocelot_fdma_rx_buf_size(int mtu)
> +{
> +	return ALIGN(mtu + OCELOT_FDMA_RX_EXTRA_SIZE, 4);
> +}
> +
> +static void ocelot_fdma_writel(struct ocelot_fdma *fdma, u32 reg, u32 data)
> +{
> +	writel(data, fdma->base + reg);
> +}

Regmap is too slow for you, you're using direct I/O accessors now?

> +
> +static u32 ocelot_fdma_readl(struct ocelot_fdma *fdma, u32 reg)
> +{
> +	return readl(fdma->base + reg);
> +}
> +
> +static unsigned int ocelot_fdma_idx_incr(unsigned int idx)

Minor comment, but "inc" and "dec" are much more popular abbreviations.
Although the way in which you use them is not quite the same way in
which other drivers use them (something called "inc" would take a
reference on the number and actually increment it). So maybe "next" and
"prev"?

> +{
> +	idx++;
> +	if (idx == OCELOT_FDMA_MAX_DCB)
> +		idx = 0;
> +
> +	return idx;
> +}
> +
> +static unsigned int ocelot_fdma_idx_decr(unsigned int idx)
> +{
> +	if (idx == 0)
> +		idx = OCELOT_FDMA_MAX_DCB - 1;
> +	else
> +		idx--;
> +
> +	return idx;
> +}
> +
> +static int ocelot_fdma_tx_free_count(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->inj;
> +
> +	if (ring->tail >= ring->head)
> +		return OCELOT_FDMA_MAX_DCB - (ring->tail - ring->head) - 1;
> +	else
> +		return ring->head - ring->tail - 1;
> +}
> +
> +static bool ocelot_fdma_ring_empty(struct ocelot_fdma_ring *ring)
> +{
> +	return ring->head == ring->tail;
> +}
> +
> +static void ocelot_fdma_activate_chan(struct ocelot_fdma *fdma,
> +				      struct ocelot_fdma_dcb *dcb, int chan)
> +{
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_DCB_LLP(chan), dcb->hw_dma);
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_ACTIVATE, BIT(chan));
> +}
> +
> +static int ocelot_fdma_wait_chan_safe(struct ocelot_fdma *fdma, int chan)
> +{
> +	unsigned long timeout;
> +	u32 safe;
> +
> +	timeout = jiffies + msecs_to_jiffies(OCELOT_FDMA_CH_SAFE_TIMEOUT_MS);
> +	do {
> +		safe = ocelot_fdma_readl(fdma, MSCC_FDMA_CH_SAFE);
> +		if (safe & BIT(chan))
> +			return 0;

Pretty busy loop, and your timeout is 100 ms. Kinda nasty stuff for the
latency of your system.

> +	} while (time_after(jiffies, timeout));
> +
> +	return -ETIMEDOUT;
> +}
> +
> +static int ocelot_fdma_stop_channel(struct ocelot_fdma *fdma, int chan)
> +{
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_DISABLE, BIT(chan));
> +
> +	return ocelot_fdma_wait_chan_safe(fdma, chan);

Maybe for the extraction channel it would make sense to have an async
stop, meaning that you ask it to stop, then process the frames you've
got so far, and wait until the channel has completely stopped only when
you need to refill?

> +}
> +
> +static bool ocelot_fdma_dcb_set_data(struct ocelot_fdma *fdma,
> +				     struct ocelot_fdma_dcb *dcb,
> +				     struct sk_buff *skb,
> +				     size_t size, enum dma_data_direction dir)
> +{
> +	struct ocelot_fdma_dcb_hw_v2 *hw = dcb->hw;
> +	u32 offset;
> +
> +	dcb->skb = skb;
> +	dcb->mapped_size = size;
> +	dcb->mapping = dma_map_single(fdma->dev, skb->data, size, dir);
> +	if (unlikely(dma_mapping_error(fdma->dev, dcb->mapping)))
> +		return false;
> +
> +	offset = dcb->mapping & 0x3;
> +
> +	hw->llp = 0;
> +	hw->datap = ALIGN_DOWN(dcb->mapping, 4);
> +	hw->datal = ALIGN_DOWN(size, 4);
> +	hw->stat = MSCC_FDMA_DCB_STAT_BLOCKO(offset);
> +
> +	return true;
> +}
> +
> +static bool ocelot_fdma_rx_set_skb(struct ocelot_fdma *fdma,
> +				   struct ocelot_fdma_dcb *dcb,
> +				   struct sk_buff *skb, size_t size)
> +{
> +	return ocelot_fdma_dcb_set_data(fdma, dcb, skb, size,
> +					DMA_FROM_DEVICE);
> +}
> +
> +static bool ocelot_fdma_tx_dcb_set_skb(struct ocelot_fdma *fdma,
> +				       struct ocelot_fdma_dcb *dcb,
> +				       struct sk_buff *skb)
> +{
> +	if (!ocelot_fdma_dcb_set_data(fdma, dcb, skb, skb->len,
> +				      DMA_TO_DEVICE))
> +		return false;
> +
> +	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_BLOCKL(skb->len);
> +	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_SOF | MSCC_FDMA_DCB_STAT_EOF;
> +
> +	return true;
> +}
> +
> +static void ocelot_fdma_rx_restart(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct ocelot_fdma_dcb *dcb, *last_dcb;
> +	unsigned int idx;
> +	int ret;
> +	u32 llp;
> +
> +	/* Check if the FDMA hits the DCB with LLP == NULL */
> +	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_DCB_LLP(MSCC_FDMA_XTR_CHAN));
> +	if (llp)
> +		return;

I'm not sure why you're letting the hardware grind to a halt first,
before refilling? I think since the CPU is the bottleneck anyway, you
can stop the extraction channel at any time you want to refill.
A constant stream of less data might be better than a bursty one.
Or maybe I'm misunderstanding some of the details of the hardware.

> +
> +	ret = ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
> +	if (ret) {
> +		dev_warn(fdma->dev, "Unable to stop RX channel\n");

Rate limit these prints maybe.

> +		return;
> +	}
> +
> +	/* Chain the tail with the next DCB */
> +	dcb = &ring->dcbs[ring->tail];
> +	idx = ocelot_fdma_idx_incr(ring->tail);
> +	dcb->hw->llp = ring->dcbs[idx].hw_dma;
> +	dcb = &ring->dcbs[idx];
> +
> +	/* Place a NULL terminator in last DCB added (head - 1) */
> +	idx = ocelot_fdma_idx_decr(ring->head);
> +	last_dcb = &ring->dcbs[idx];
> +	last_dcb->hw->llp = 0;
> +	ring->tail = idx;
> +
> +	/* Finally reactivate the channel */
> +	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_XTR_CHAN);
> +}
> +
> +static bool ocelot_fdma_rx_get(struct ocelot_fdma *fdma, int budget)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct ocelot_fdma_dcb *dcb, *next_dcb;
> +	struct ocelot *ocelot = fdma->ocelot;
> +	struct net_device *ndev;
> +	struct sk_buff *skb;
> +	bool valid = true;
> +	u64 timestamp;
> +	u64 src_port;
> +	void *xfh;
> +	u32 stat;
> +
> +	/* We should not go past the tail */
> +	if (ring->head == ring->tail)
> +		return false;
> +
> +	dcb = &ring->dcbs[ring->head];
> +	stat = dcb->hw->stat;
> +	if (MSCC_FDMA_DCB_STAT_BLOCKL(stat) == 0)
> +		return false;
> +
> +	ring->head = ocelot_fdma_idx_incr(ring->head);
> +
> +	if (stat & MSCC_FDMA_DCB_STAT_ABORT || stat & MSCC_FDMA_DCB_STAT_PD)
> +		valid = false;
> +
> +	if (!(stat & MSCC_FDMA_DCB_STAT_SOF) ||
> +	    !(stat & MSCC_FDMA_DCB_STAT_EOF))
> +		valid = false;
> +
> +	dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
> +			 DMA_FROM_DEVICE);
> +
> +	skb = dcb->skb;
> +
> +	if (unlikely(!valid)) {
> +		dev_warn(fdma->dev, "Invalid packet\n");

Rate limit please, and try to show something which might be relevant to
why it is not valid.

> +		goto refill;
> +	}
> +
> +	xfh = skb->data;
> +	ocelot_xfh_get_src_port(xfh, &src_port);
> +
> +	if (WARN_ON(src_port >= ocelot->num_phys_ports))
> +		goto refill;
> +
> +	ndev = ocelot_port_to_netdev(ocelot, src_port);
> +	if (unlikely(!ndev))
> +		goto refill;
> +
> +	skb_put(skb, MSCC_FDMA_DCB_STAT_BLOCKL(stat) - ETH_FCS_LEN);
> +	skb_pull(skb, OCELOT_TAG_LEN);
> +
> +	skb->dev = ndev;
> +	skb->protocol = eth_type_trans(skb, skb->dev);
> +	skb->dev->stats.rx_bytes += skb->len;
> +	skb->dev->stats.rx_packets++;
> +
> +	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);

You forgot to extract the "timestamp" from the XFH, and are providing
junk from the kernel stack memory. Please make sure to test PTP.

> +
> +	if (!skb_defer_rx_timestamp(skb))
> +		netif_receive_skb(skb);
> +
> +	skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);
> +	if (!skb)
> +		return false;

See my comment below, on the ocelot_fdma_rx_skb_alloc() function, on why
I think you are making sub-optimal use of the ring concept.

> +
> +refill:
> +	if (!ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size))
> +		return false;
> +
> +	/* Chain the next DCB */
> +	next_dcb = &ring->dcbs[ring->head];
> +	dcb->hw->llp = next_dcb->hw_dma;
> +
> +	return true;
> +}
> +
> +static void ocelot_fdma_tx_cleanup(struct ocelot_fdma *fdma, int budget)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->inj;
> +	unsigned int tmp_head, new_null_llp_idx;
> +	struct ocelot_fdma_dcb *dcb;
> +	bool end_of_list = false;
> +	int ret;
> +
> +	spin_lock_bh(&fdma->xmit_lock);
> +
> +	/* Purge the TX packets that have been sent up to the NULL llp or the
> +	 * end of done list.
> +	 */
> +	while (!ocelot_fdma_ring_empty(&fdma->inj)) {

s/&fdma->inj/ring/

> +		dcb = &ring->dcbs[ring->head];
> +		if (!(dcb->hw->stat & MSCC_FDMA_DCB_STAT_PD))
> +			break;
> +
> +		tmp_head = ring->head;

Unused.

> +		ring->head = ocelot_fdma_idx_incr(ring->head);
> +
> +		dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
> +				 DMA_TO_DEVICE);
> +		napi_consume_skb(dcb->skb, budget);
> +
> +		/* If we hit the NULL LLP, stop, we might need to reload FDMA */
> +		if (dcb->hw->llp == 0) {
> +			end_of_list = true;
> +			break;
> +		}
> +	}
> +
> +	/* If there is still some DCBs to be processed by the FDMA or if the
> +	 * pending list is empty, there is no need to restart the FDMA.
> +	 */

I don't understand why you restart the injection channel from the TX
confirmation interrupt. It raised the interrupt to tell you that it hit
a NULL LLP because there's nothing left to send. If you restart it now and
no other transmission has happened in the meantime, won't it stop again?

> +	if (!end_of_list || ocelot_fdma_ring_empty(&fdma->inj))

s/&fdma->inj/ring/

> +		goto out_unlock;
> +
> +	ret = ocelot_fdma_wait_chan_safe(fdma, MSCC_FDMA_INJ_CHAN);
> +	if (ret) {
> +		dev_warn(fdma->dev, "Failed to wait for TX channel to stop\n");
> +		goto out_unlock;
> +	}
> +
> +	/* Set NULL LLP */
> +	new_null_llp_idx = ocelot_fdma_idx_decr(ring->tail);
> +	dcb = &ring->dcbs[new_null_llp_idx];
> +	dcb->hw->llp = 0;
> +
> +	dcb = &ring->dcbs[ring->head];
> +	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);

	if (unlikely(netif_queue_stopped(dev)) &&
	    ocelot_fdma_tx_free_count(fdma))
		netif_wake_queue(dev);

This can then be tweaked for when you add support for scatter/gather xmit.

> +
> +out_unlock:
> +	spin_unlock_bh(&fdma->xmit_lock);
> +}
> +
> +static int ocelot_fdma_napi_poll(struct napi_struct *napi, int budget)
> +{
> +	struct ocelot_fdma *fdma = container_of(napi, struct ocelot_fdma, napi);
> +	int work_done = 0;
> +
> +	ocelot_fdma_tx_cleanup(fdma, budget);
> +
> +	while (work_done < budget) {
> +		if (!ocelot_fdma_rx_get(fdma, budget))
> +			break;
> +
> +		work_done++;
> +	}
> +
> +	ocelot_fdma_rx_restart(fdma);
> +
> +	if (work_done < budget) {
> +		napi_complete_done(&fdma->napi, work_done);
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
> +				   BIT(MSCC_FDMA_INJ_CHAN) |
> +				   BIT(MSCC_FDMA_XTR_CHAN));
> +	}
> +
> +	return work_done;
> +}
> +
> +static irqreturn_t ocelot_fdma_interrupt(int irq, void *dev_id)
> +{
> +	u32 ident, llp, frm, err, err_code;
> +	struct ocelot_fdma *fdma = dev_id;
> +
> +	ident = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_IDENT);
> +	frm = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_FRM);
> +	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_LLP);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, llp & ident);
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, frm & ident);
> +	if (frm || llp) {
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> +		napi_schedule(&fdma->napi);
> +	}
> +
> +	err = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR);
> +	if (unlikely(err)) {
> +		err_code = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR_CODE);
> +		dev_err_ratelimited(fdma->dev,
> +				    "Error ! chans mask: %#x, code: %#x\n",
> +				    err, err_code);
> +
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR, err);
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR_CODE, err_code);
> +	}
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void ocelot_fdma_send_skb(struct ocelot_fdma *fdma, struct sk_buff *skb)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->inj;
> +	struct ocelot_fdma_dcb *dcb, *next;
> +
> +	dcb = &ring->dcbs[ring->tail];
> +	if (!ocelot_fdma_tx_dcb_set_skb(fdma, dcb, skb)) {
> +		dev_kfree_skb_any(skb);
> +		return;
> +	}
> +
> +	if (ocelot_fdma_ring_empty(&fdma->inj)) {

s/&fdma->inj/ring/

> +		ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);
> +	} else {
> +		next = &ring->dcbs[ocelot_fdma_idx_incr(ring->tail)];
> +		dcb->hw->llp = next->hw_dma;
> +	}
> +
> +	ring->tail = ocelot_fdma_idx_incr(ring->tail);

You still have locking between TX and TX conf, that's too bad. Why is
that, I wonder? Because TX conf (ocelot_fdma_tx_cleanup) updates
ring->head and TX (ocelot_fdma_send_skb) updates ring->tail. Could it be
because you're updating the ring->tail _after_ you've activated the
injection channel, therefore exposing you to a race with the completion
interrupt which reads ring->tail?

> +
> +	skb_tx_timestamp(skb);
> +}
> +
> +static int ocelot_fdma_prepare_skb(struct ocelot_fdma *fdma, int port,
> +				   u32 rew_op, struct sk_buff *skb,
> +				   struct net_device *dev)
> +{
> +	int needed_headroom = max_t(int, OCELOT_TAG_LEN - skb_headroom(skb), 0);
> +	int needed_tailroom = max_t(int, ETH_FCS_LEN - skb_tailroom(skb), 0);
> +	struct ocelot_port *ocelot_port = fdma->ocelot->ports[port];
> +	void *ifh;
> +	int err;
> +
> +	if (unlikely(needed_headroom || needed_tailroom ||
> +		     skb_header_cloned(skb))) {
> +		err = pskb_expand_head(skb, needed_headroom, needed_tailroom,
> +				       GFP_ATOMIC);
> +		if (unlikely(err)) {
> +			dev_kfree_skb_any(skb);
> +			return 1;
> +		}
> +	}
> +
> +	err = skb_linearize(skb);
> +	if (err) {
> +		net_err_ratelimited("%s: skb_linearize error (%d)!\n",
> +				    dev->name, err);
> +		dev_kfree_skb_any(skb);
> +		return 1;
> +	}
> +
> +	ifh = skb_push(skb, OCELOT_TAG_LEN);
> +	skb_put(skb, ETH_FCS_LEN);
> +	ocelot_ifh_port_set(ifh, ocelot_port, rew_op, skb_vlan_tag_get(skb));
> +
> +	return 0;
> +}
> +
> +int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
> +			     struct sk_buff *skb, struct net_device *dev)
> +{
> +	int ret = NETDEV_TX_OK;
> +
> +	spin_lock(&fdma->xmit_lock);
> +
> +	if (ocelot_fdma_tx_free_count(fdma) == 0) {
> +		ret = NETDEV_TX_BUSY;

netif_stop_queue(dev);

> +		goto out;
> +	}
> +
> +	if (ocelot_fdma_prepare_skb(fdma, port, rew_op, skb, dev))
> +		goto out;
> +
> +	ocelot_fdma_send_skb(fdma, skb);
> +
> +out:
> +	spin_unlock(&fdma->xmit_lock);
> +
> +	return ret;
> +}
> +
> +static void ocelot_fdma_ring_free(struct ocelot_fdma *fdma,
> +				  struct ocelot_fdma_ring *ring)
> +{
> +	dmam_free_coherent(fdma->dev, OCELOT_DCBS_HW_ALLOC_SIZE, ring->hw_dcbs,
> +			   ring->hw_dcbs_dma);
> +}
> +
> +static int ocelot_fdma_ring_alloc(struct ocelot_fdma *fdma,
> +				  struct ocelot_fdma_ring *ring)
> +{
> +	struct ocelot_fdma_dcb_hw_v2 *hw_dcbs;
> +	struct ocelot_fdma_dcb *dcb;
> +	dma_addr_t hw_dcbs_dma;
> +	unsigned int adjust;
> +	int i;
> +
> +	/* Create a pool of consistent memory blocks for hardware descriptors */
> +	ring->hw_dcbs = dmam_alloc_coherent(fdma->dev,
> +					    OCELOT_DCBS_HW_ALLOC_SIZE,
> +					    &ring->hw_dcbs_dma, GFP_KERNEL);
> +	if (!ring->hw_dcbs)
> +		return -ENOMEM;
> +
> +	/* DCBs must be aligned on a 32bit boundary */
> +	hw_dcbs = ring->hw_dcbs;
> +	hw_dcbs_dma = ring->hw_dcbs_dma;
> +	if (!IS_ALIGNED(hw_dcbs_dma, 4)) {
> +		adjust = hw_dcbs_dma & 0x3;
> +		hw_dcbs_dma = ALIGN(hw_dcbs_dma, 4);
> +		hw_dcbs = (void *)hw_dcbs + adjust;
> +	}
> +
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		dcb->hw = &hw_dcbs[i];
> +		dcb->hw_dma = hw_dcbs_dma +
> +			     i * sizeof(struct ocelot_fdma_dcb_hw_v2);
> +	}
> +
> +	return 0;
> +}
> +
> +static int ocelot_fdma_rx_skb_alloc(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot_fdma_dcb *dcb, *prev_dcb = NULL;
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct sk_buff *skb;
> +	int i;
> +
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);

I have to tell you, skb allocation at probe time is something I haven't
seen before, I'll have to defer to somebody else for a second opinion.
I understand that you keep the sk_buff structure closely tied to the DCB
structure, whereas normally you'd see only DCB structures in the RX ring.

And napi_alloc_skb? This isn't NAPI context.

The whole idea (as far as I understand it!) with a receive ring is that
hardware produces data on one side of it, and software consumes from
that side too and in the same direction, just lags behind a little bit.
Software must also provide buffers at the opposite end, in which
hardware will put the produced data, so that the pipeline never runs
out. You'll notice that it is standard amongst ring based drivers to
name your ring->head as ring->next_to_clean, and your ring->tail as
ring->next_to_use (actually not quite: your hardware doesn't really
provide producer and consumer indices, and the way in which you update
the ring->tail past the NULL LLP pointer is a bit different from the way
in which drivers use ring->next_to_use and more like ring->next_to_alloc,
but more on that later). I think people reviewing your code will instantly
know what it's about if you name and structure things in this way,
perhaps the lack of consumer and producer indices is merely an
irrelevant detail.

As long as the ring size is large enough in order to give software some
time to react, you can reach a state of equilibrium where you don't need
to allocate or DMA map any new buffer, you can just recycle the ones you
already have. To use buffer recycling, you need to replace alloc_skb()
with functions such as build_skb(), which builds an sk_buff structure
around a pre-existing buffer.

One simple way to achieve this is the page flipping technique. There may
be others and you don't have to use this one. The principle is as
follows: you allocate and DMA map buffers double the size you need
(PAGE_SIZE is a common choice, divided by two you get about 2K of data
per buffer, for frames larger than that you should do scatter/gather)
and you populate an RX DCB with just one half of that page, the other
half is unused for now. You repeat this process until you've filled up
the RX ring with sufficient DCB entries, and all of this happens at
initialization time.

Then, when you process a frame from the RX DCB ring (in the NAPI poll
function), you construct a skb around that page half, and before giving
it to the network stack, you speculatively attempt to reuse the other
half of the page by putting it back into the RX DCB ring, at the
opposite end compared to where you're consuming the other half from.
Ring-based drivers use a separate ring->next_to_alloc variable for this.
DCB elements ranging from ring->next_to_use and up to ring->next_to_alloc
are simply halves of pages that have been processed by your NAPI, and
are ready to be committed to hardware when you need to refill (in your
case update the LLP pointer).

So each page could have up to two users (references), and you need to be
very careful how you deal with concurrency with the stack. As Claudiu
explained in great detail to me when I was first studying this
mechanism, the driver is the producer of these RX pages and the network
stack is the consumer. And since the network stack will eventually call
kfree_skb() after it's done, which ultimately results into a put_page(),
you need to counteract that action and ensure that if you want to recycle
the other half of the page, the page's refcount never reaches zero.

As a result, if you deem the other page half as good for recycling, you
need to bump the reference count of the page from 1 to 2. This will make
it safe for you to refill the DCB ring with that other half.

Of course, the other half of the page might not be available for
recycling, this is why the technique is speculative. If the stack hasn't
yet called kfree_skb() on the other half, the page reference count will
be 2 at the time you're cleaning the buffer, and bad luck, you can no
longer reuse this page, so the driver needs to take its hands off of it.

If you look at driver implementations of the half page flip heuristic,
you'll see that "taking your hands off the page" means simply to DMA
unmap the entire page. This might surprise you, after all, the reference
count of the page is 2, you might think that the page will leak if you
don't decrement one of the references on it, or something. But if you
think about it, the reference count is 2 because one half has an skb
built at an earlier time around it, which is still in the network stack
there somewhere pending a kfree_skb(), and the other half is the buffer
you're cleaning right now. This half is practically promised to the
network stack, you'll create an skb around this half too, and the stack
will kfree it too, and the refcount of the page will thus drop to zero.

Of course, when you need to refill the RX ring with, say, 32 buffers,
you might find that (ring->next_to_alloc - ring->next_to_use) mod ring size
is less than 32 (otherwise said, the buffer reuse technique couldn't
provide enough buffers). So you need to alloc and DMA map new pages from
ring->next_to_alloc up to 32 (but this is still in contrast with your
current implementation which always calls napi_alloc_skb 32 times).
Nonetheless, in my experience, the technique works very well in real
life situations and is a very good way to reduce the pressure on the
memory allocator and avoid costly DMA mapping and unmapping per packet.
Since you've already went through the trouble of making this a
ring-based driver, I believe you should try to implement a buffer reuse
technique too, especially for this particular case of a very slow
processor.

> +		if (!skb)
> +			goto skb_alloc_failed;
> +
> +		ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size);
> +
> +		if (prev_dcb)
> +			prev_dcb->hw->llp = dcb->hw_dma;
> +
> +		prev_dcb = dcb;
> +	}
> +
> +	ring->head = 0;
> +	ring->tail = OCELOT_FDMA_MAX_DCB - 1;
> +
> +	return 0;
> +
> +skb_alloc_failed:
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		if (!dcb->skb)
> +			break;
> +
> +		dev_kfree_skb_any(dcb->skb);
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +static int ocelot_fdma_rx_init(struct ocelot_fdma *fdma)
> +{
> +	int ret;
> +
> +	fdma->rx_buf_size = ocelot_fdma_rx_buf_size(OCELOT_FDMA_RX_MTU);
> +
> +	ret = ocelot_fdma_rx_skb_alloc(fdma);
> +	if (ret) {
> +		netif_napi_del(&fdma->napi);
> +		return ret;
> +	}
> +
> +	napi_enable(&fdma->napi);
> +
> +	ocelot_fdma_activate_chan(fdma, &fdma->xtr.dcbs[0],
> +				  MSCC_FDMA_XTR_CHAN);
> +
> +	return 0;
> +}
> +
> +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev)
> +{
> +	dev->needed_headroom = OCELOT_TAG_LEN;
> +	dev->needed_tailroom = ETH_FCS_LEN;

The needed_headroom is in no way specific to FDMA, right? Why aren't you
doing it for manual register-based injection too? (in a separate patch ofc)

> +
> +	if (fdma->ndev)
> +		return;
> +
> +	fdma->ndev = dev;
> +	netif_napi_add(dev, &fdma->napi, ocelot_fdma_napi_poll,
> +		       OCELOT_FDMA_WEIGHT);

I understand that NAPI is per netdev but you have a single interrupt so
you need to share the NAPI instance for all ports. That is fine.
But danger ahead, see this:

mscc_ocelot_init_ports() does:

		err = ocelot_probe_port(ocelot, port, target, portnp);
		if (err) {
			ocelot_port_devlink_teardown(ocelot, port);
			continue;
		}

aka it skips over ports that failed to probe.
And ocelot_probe_port does:

	if (ocelot->fdma)
		ocelot_fdma_netdev_init(ocelot->fdma, dev);

	err = register_netdev(dev);
	if (err) {
		dev_err(ocelot->dev, "register_netdev failed\n");
		goto out;
	}

So if register_netdev() fails, you will have a dangling, freed pointer
inside fdma->ndev. That is not good, as far as I can tell. Try to make
the probing of your first port fail at register_netdev() time, to see
what I mean.

> +}
> +
> +void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma, struct net_device *dev)
> +{
> +	if (dev == fdma->ndev)
> +		netif_napi_del(&fdma->napi);
> +}
> +
> +struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
> +				     struct ocelot *ocelot)
> +{
> +	struct ocelot_fdma *fdma;
> +	void __iomem *base;
> +	int ret;
> +
> +	base = devm_platform_ioremap_resource_byname(pdev, "fdma");
> +	if (IS_ERR_OR_NULL(base))
> +		return NULL;
> +
> +	fdma = devm_kzalloc(&pdev->dev, sizeof(*fdma), GFP_KERNEL);
> +	if (!fdma)
> +		goto err_release_resource;
> +
> +	fdma->ocelot = ocelot;
> +	fdma->base = base;
> +	fdma->dev = &pdev->dev;
> +	fdma->dev->coherent_dma_mask = DMA_BIT_MASK(32);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> +
> +	fdma->irq = platform_get_irq_byname(pdev, "fdma");
> +	ret = devm_request_irq(&pdev->dev, fdma->irq, ocelot_fdma_interrupt, 0,
> +			       dev_name(&pdev->dev), fdma);
> +	if (ret)
> +		goto err_free_fdma;
> +
> +	ret = ocelot_fdma_ring_alloc(fdma, &fdma->inj);
> +	if (ret)
> +		goto err_free_irq;
> +
> +	ret = ocelot_fdma_ring_alloc(fdma, &fdma->xtr);
> +	if (ret)
> +		goto free_inj_ring;
> +
> +	return fdma;
> +
> +free_inj_ring:
> +	ocelot_fdma_ring_free(fdma, &fdma->inj);
> +err_free_irq:
> +	devm_free_irq(&pdev->dev, fdma->irq, fdma);
> +err_free_fdma:
> +	devm_kfree(&pdev->dev, fdma);
> +err_release_resource:
> +	devm_iounmap(&pdev->dev, base);
> +
> +	return NULL;
> +}
> +
> +int ocelot_fdma_start(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot *ocelot = fdma->ocelot;
> +	int ret;
> +
> +	ret = ocelot_fdma_rx_init(fdma);
> +	if (ret)
> +		return -EINVAL;
> +
> +	/* Reconfigure for extraction and injection using DMA */
> +	ocelot_write_rix(ocelot, QS_INJ_GRP_CFG_MODE(2), QS_INJ_GRP_CFG, 0);
> +	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(0), QS_INJ_CTRL, 0);
> +
> +	ocelot_write_rix(ocelot, QS_XTR_GRP_CFG_MODE(2), QS_XTR_GRP_CFG, 0);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, 0xffffffff);
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, 0xffffffff);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP_ENA,
> +			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM_ENA, BIT(MSCC_FDMA_XTR_CHAN));
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
> +			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
> +
> +	return 0;
> +}
> +
> +int ocelot_fdma_stop(struct ocelot_fdma *fdma)

This should return void.

> +{
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct ocelot_fdma_dcb *dcb;
> +	int i;
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> +
> +	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
> +	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_INJ_CHAN);
> +
> +	/* Free the SKB hold in the extraction ring */
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		dev_kfree_skb_any(dcb->skb);
> +	}
> +
> +	napi_synchronize(&fdma->napi);
> +	napi_disable(&fdma->napi);
> +
> +	return 0;
> +}
> diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.h b/drivers/net/ethernet/mscc/ocelot_fdma.h
> new file mode 100644
> index 000000000000..b6f1dda0e0c7
> --- /dev/null
> +++ b/drivers/net/ethernet/mscc/ocelot_fdma.h
> @@ -0,0 +1,96 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
> +/*
> + * Microsemi SoCs FDMA driver
> + *
> + * Copyright (c) 2021 Microchip
> + */
> +#ifndef _MSCC_OCELOT_FDMA_H_
> +#define _MSCC_OCELOT_FDMA_H_
> +
> +#include "ocelot.h"
> +
> +#define OCELOT_FDMA_MAX_DCB		128
> +/* +4 allows for word alignment after allocation */
> +#define OCELOT_DCBS_HW_ALLOC_SIZE	(OCELOT_FDMA_MAX_DCB * \
> +					 sizeof(struct ocelot_fdma_dcb_hw_v2) + \
> +					 4)
> +
> +struct ocelot_fdma_dcb_hw_v2 {
> +	u32 llp;
> +	u32 datap;
> +	u32 datal;
> +	u32 stat;
> +};

Could you declare this using __attribute((packed)) to show that you're
mapping it over hardware?

> +
> +/**
> + * struct ocelot_fdma_dcb - Software DCBs description
> + *
> + * @hw: hardware DCB used by hardware(coherent memory)
> + * @hw_dma: DMA address of the DCB
> + * @skb: skb associated with the DCB
> + * @mapping: Address of the skb data mapping
> + * @mapped_size: Mapped size
> + */
> +struct ocelot_fdma_dcb {
> +	struct ocelot_fdma_dcb_hw_v2	*hw;
> +	dma_addr_t			hw_dma;
> +	struct sk_buff			*skb;
> +	dma_addr_t			mapping;
> +	size_t				mapped_size;
> +};
> +
> +/**
> + * struct ocelot_fdma_ring - "Ring" description of DCBs
> + *
> + * @hw_dcbs: Hardware DCBs allocated for the ring
> + * @hw_dcbs_dma: DMA address of the DCBs
> + * @dcbs: List of software DCBs
> + * @head: pointer to first available DCB
> + * @tail: pointer to last available DCB
> + */
> +struct ocelot_fdma_ring {
> +	struct ocelot_fdma_dcb_hw_v2	*hw_dcbs;
> +	dma_addr_t			hw_dcbs_dma;
> +	struct ocelot_fdma_dcb		dcbs[OCELOT_FDMA_MAX_DCB];
> +	unsigned int			head;
> +	unsigned int			tail;
> +};
> +
> +/**
> + * struct ocelot_fdma - FMDA struct

s/FMDA/FDMA/

> + *
> + * @ocelot: Pointer to ocelot struct
> + * @base: base address of FDMA registers
> + * @irq: FDMA interrupt
> + * @dev: Ocelot device
> + * @napi: napi handle
> + * @rx_buf_size: Size of RX buffer
> + * @inj: Injection ring
> + * @xtr: Extraction ring
> + * @xmit_lock: Xmit lock
> + *
> + */
> +struct ocelot_fdma {
> +	struct ocelot			*ocelot;

To me, this structure organization in which "struct ocelot_fdma *" is
passed as argument to all FDMA functions, instead of "struct ocelot *",
is strange, and leads to oddities such as this backpointer right here.
Do it in whichever way you want, I'm just pointing this out.

> +	void __iomem			*base;
> +	int				irq;
> +	struct device			*dev;
> +	struct napi_struct		napi;
> +	struct net_device		*ndev;
> +	size_t				rx_buf_size;
> +	struct ocelot_fdma_ring		inj;
> +	struct ocelot_fdma_ring		xtr;
> +	spinlock_t			xmit_lock;
> +};
> +
> +struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
> +				     struct ocelot *ocelot);
> +int ocelot_fdma_start(struct ocelot_fdma *fdma);
> +int ocelot_fdma_stop(struct ocelot_fdma *fdma);
> +int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
> +			     struct sk_buff *skb, struct net_device *dev);
> +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev);
> +void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma,
> +			       struct net_device *dev);
> +
> +#endif
> diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
> index b589ae95e29b..9dcaf421da12 100644
> --- a/drivers/net/ethernet/mscc/ocelot_net.c
> +++ b/drivers/net/ethernet/mscc/ocelot_net.c
> @@ -15,6 +15,7 @@
>  #include <net/pkt_cls.h>
>  #include "ocelot.h"
>  #include "ocelot_vcap.h"
> +#include "ocelot_fdma.h"
>  
>  #define OCELOT_MAC_QUIRKS	OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP
>  
> @@ -457,7 +458,7 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
>  	int port = priv->chip_port;
>  	u32 rew_op = 0;
>  
> -	if (!ocelot_can_inject(ocelot, 0))
> +	if (!ocelot->fdma && !ocelot_can_inject(ocelot, 0))
>  		return NETDEV_TX_BUSY;
>  
>  	/* Check if timestamping is needed */
> @@ -475,9 +476,13 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
>  		rew_op = ocelot_ptp_rew_op(skb);
>  	}
>  
> -	ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
> +	if (ocelot->fdma) {
> +		ocelot_fdma_inject_frame(ocelot->fdma, port, rew_op, skb, dev);
> +	} else {
> +		ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);

I can't help but think how painful it is that with a CPU as slow as
yours, insult over injury, you also need to check for each packet
whether the device tree had defined the "fdma" region or not, because
you practically keep two traffic I/O implementations due to that sole
reason. I think for the ocelot switchdev driver, which is strictly for
MIPS CPUs embedded within the device, it should be fine to introduce a
static key here (search for static_branch_likely in the kernel).

>  
> -	kfree_skb(skb);
> +		consume_skb(skb);
> +	}
>  
>  	return NETDEV_TX_OK;
>  }
> @@ -1717,6 +1722,9 @@ int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
>  	if (err)
>  		goto out;
>  
> +	if (ocelot->fdma)
> +		ocelot_fdma_netdev_init(ocelot->fdma, dev);
> +
>  	err = register_netdev(dev);
>  	if (err) {
>  		dev_err(ocelot->dev, "register_netdev failed\n");
> @@ -1737,9 +1745,13 @@ void ocelot_release_port(struct ocelot_port *ocelot_port)
>  	struct ocelot_port_private *priv = container_of(ocelot_port,
>  						struct ocelot_port_private,
>  						port);
> +	struct ocelot_fdma *fdma = ocelot_port->ocelot->fdma;
>  
>  	unregister_netdev(priv->dev);
>  
> +	if (fdma)
> +		ocelot_fdma_netdev_deinit(fdma, priv->dev);
> +
>  	if (priv->phylink) {
>  		rtnl_lock();
>  		phylink_disconnect_phy(priv->phylink);
> diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> index 38103b0255b0..fa68eb23a333 100644
> --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> @@ -18,6 +18,7 @@
>  
>  #include <soc/mscc/ocelot_vcap.h>
>  #include <soc/mscc/ocelot_hsio.h>
> +#include "ocelot_fdma.h"
>  #include "ocelot.h"
>  
>  static const u32 ocelot_ana_regmap[] = {
> @@ -1080,6 +1081,10 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
>  		ocelot->targets[io_target[i].id] = target;
>  	}
>  
> +	ocelot->fdma = ocelot_fdma_init(pdev, ocelot);
> +	if (IS_ERR(ocelot->fdma))
> +		ocelot->fdma = NULL;
> +
>  	hsio = syscon_regmap_lookup_by_compatible("mscc,ocelot-hsio");
>  	if (IS_ERR(hsio)) {
>  		dev_err(&pdev->dev, "missing hsio syscon\n");
> @@ -1139,6 +1144,12 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
>  	if (err)
>  		goto out_ocelot_devlink_unregister;
>  
> +	if (ocelot->fdma) {
> +		err = ocelot_fdma_start(ocelot->fdma);
> +		if (err)
> +			goto out_ocelot_release_ports;
> +	}
> +
>  	err = ocelot_devlink_sb_register(ocelot);
>  	if (err)
>  		goto out_ocelot_release_ports;
> @@ -1179,6 +1190,8 @@ static int mscc_ocelot_remove(struct platform_device *pdev)
>  {
>  	struct ocelot *ocelot = platform_get_drvdata(pdev);
>  
> +	if (ocelot->fdma)
> +		ocelot_fdma_stop(ocelot->fdma);
>  	devlink_unregister(ocelot->devlink);
>  	ocelot_deinit_timestamp(ocelot);
>  	ocelot_devlink_sb_unregister(ocelot);
> diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
> index b3381c90ff3e..351ab385ab98 100644
> --- a/include/soc/mscc/ocelot.h
> +++ b/include/soc/mscc/ocelot.h
> @@ -695,6 +695,8 @@ struct ocelot {
>  	/* Protects the PTP clock */
>  	spinlock_t			ptp_clock_lock;
>  	struct ptp_pin_desc		ptp_pins[OCELOT_PTP_PINS_NUM];
> +
> +	struct ocelot_fdma		*fdma;
>  };
>  
>  struct ocelot_policer {
> @@ -761,6 +763,8 @@ void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
>  			 u32 vlan_tag);
>  int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
>  void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
> +void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
> +			     u64 timestamp);
>  
>  /* Hardware initialization */
>  int ocelot_regfields_init(struct ocelot *ocelot,
> -- 
> 2.33.1
>
Clément Léger Nov. 29, 2021, 8:19 a.m. UTC | #3
Le Sat, 27 Nov 2021 14:58:06 +0000,
Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :

> On Fri, Nov 26, 2021 at 06:27:39PM +0100, Clément Léger wrote:
> > Ethernet frames can be extracted or injected autonomously to or from
> > the device’s DDR3/DDR3L memory and/or PCIe memory space. Linked list
> > data structures in memory are used for injecting or extracting Ethernet
> > frames. The FDMA generates interrupts when frame extraction or
> > injection is done and when the linked lists need updating.
> > 
> > The FDMA is shared between all the ethernet ports of the switch and
> > uses a linked list of descriptors (DCB) to inject and extract packets.
> > Before adding descriptors, the FDMA channels must be stopped. It would
> > be inefficient to do that each time a descriptor would be added so the
> > channels are restarted only once they stopped.
> > 
> > Both channels uses ring-like structure to feed the DCBs to the FDMA.
> > head and tail are never touched by hardware and are completely handled
> > by the driver.
> > 
> > Co-developed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> > Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> > Signed-off-by: Clément Léger <clement.leger@bootlin.com>
> > ---  
> 
> I need to ask, was there any change in performance, in one direction or
> the other, between the ring and list based implementations?

Unfortunately, nothing really noticeable... Last numbers I gave you
were made with this version.

> 
> >  drivers/net/ethernet/mscc/Makefile         |   1 +
> >  drivers/net/ethernet/mscc/ocelot.c         |  43 +-
> >  drivers/net/ethernet/mscc/ocelot.h         |   1 +
> >  drivers/net/ethernet/mscc/ocelot_fdma.c    | 713 +++++++++++++++++++++
> >  drivers/net/ethernet/mscc/ocelot_fdma.h    |  96 +++
> >  drivers/net/ethernet/mscc/ocelot_net.c     |  18 +-
> >  drivers/net/ethernet/mscc/ocelot_vsc7514.c |  13 +
> >  include/soc/mscc/ocelot.h                  |   4 +
> >  8 files changed, 869 insertions(+), 20 deletions(-)
> >  create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.c
> >  create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.h
> > 
> > diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile
> > index 722c27694b21..d76a9b78b6ca 100644
> > --- a/drivers/net/ethernet/mscc/Makefile
> > +++ b/drivers/net/ethernet/mscc/Makefile
> > @@ -11,5 +11,6 @@ mscc_ocelot_switch_lib-y := \
> >  mscc_ocelot_switch_lib-$(CONFIG_BRIDGE_MRP) += ocelot_mrp.o
> >  obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot.o
> >  mscc_ocelot-y := \
> > +	ocelot_fdma.o \
> >  	ocelot_vsc7514.o \
> >  	ocelot_net.o
> > diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
> > index 1f7c9ff18ac5..4b2460d232c2 100644
> > --- a/drivers/net/ethernet/mscc/ocelot.c
> > +++ b/drivers/net/ethernet/mscc/ocelot.c
> > @@ -966,14 +966,37 @@ static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh)
> >  	return 0;
> >  }
> >  
> > -int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
> > +void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
> > +			     u64 timestamp)
> >  {
> >  	struct skb_shared_hwtstamps *shhwtstamps;
> >  	u64 tod_in_ns, full_ts_in_ns;
> > +	struct timespec64 ts;
> > +
> > +	if (!ocelot->ptp)
> > +		return;
> > +
> > +	ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
> > +
> > +	tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
> > +	if ((tod_in_ns & 0xffffffff) < timestamp)
> > +		full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
> > +				timestamp;
> > +	else
> > +		full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
> > +				timestamp;
> > +
> > +	shhwtstamps = skb_hwtstamps(skb);
> > +	memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
> > +	shhwtstamps->hwtstamp = full_ts_in_ns;
> > +}
> > +EXPORT_SYMBOL(ocelot_ptp_rx_timestamp);  
> 
> This split can very well be a separate patch, it's distracting.

Acked.

> 
> > +
> > +int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
> > +{
> >  	u64 timestamp, src_port, len;
> >  	u32 xfh[OCELOT_TAG_LEN / 4];
> >  	struct net_device *dev;
> > -	struct timespec64 ts;
> >  	struct sk_buff *skb;
> >  	int sz, buf_len;
> >  	u32 val, *buf;
> > @@ -1029,21 +1052,7 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
> >  		*buf = val;
> >  	}
> >  
> > -	if (ocelot->ptp) {
> > -		ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
> > -
> > -		tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
> > -		if ((tod_in_ns & 0xffffffff) < timestamp)
> > -			full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
> > -					timestamp;
> > -		else
> > -			full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
> > -					timestamp;
> > -
> > -		shhwtstamps = skb_hwtstamps(skb);
> > -		memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
> > -		shhwtstamps->hwtstamp = full_ts_in_ns;
> > -	}
> > +	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);
> >  
> >  	/* Everything we see on an interface that is in the HW bridge
> >  	 * has already been forwarded.
> > diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
> > index e43da09b8f91..f1a7b403e221 100644
> > --- a/drivers/net/ethernet/mscc/ocelot.h
> > +++ b/drivers/net/ethernet/mscc/ocelot.h
> > @@ -9,6 +9,7 @@
> >  #define _MSCC_OCELOT_H_
> >  
> >  #include <linux/bitops.h>
> > +#include <linux/dsa/ocelot.h>
> >  #include <linux/etherdevice.h>
> >  #include <linux/if_vlan.h>
> >  #include <linux/net_tstamp.h>
> > diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c
> > new file mode 100644
> > index 000000000000..e42c2c3ad273
> > --- /dev/null
> > +++ b/drivers/net/ethernet/mscc/ocelot_fdma.c
> > @@ -0,0 +1,713 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
> > +/*
> > + * Microsemi SoCs FDMA driver
> > + *
> > + * Copyright (c) 2021 Microchip
> > + */
> > +
> > +#include <linux/bitops.h>
> > +#include <linux/dmapool.h>
> > +#include <linux/dsa/ocelot.h>
> > +#include <linux/netdevice.h>
> > +#include <linux/of_platform.h>
> > +#include <linux/skbuff.h>
> > +
> > +#include "ocelot_fdma.h"
> > +#include "ocelot_qs.h"
> > +
> > +#define MSCC_FDMA_DCB_LLP(x)			((x) * 4 + 0x0)
> > +#define MSCC_FDMA_DCB_LLP_PREV(x)		((x) * 4 + 0xA0)
> > +
> > +#define MSCC_FDMA_DCB_STAT_BLOCKO(x)		(((x) << 20) & GENMASK(31, 20))
> > +#define MSCC_FDMA_DCB_STAT_BLOCKO_M		GENMASK(31, 20)
> > +#define MSCC_FDMA_DCB_STAT_BLOCKO_X(x)		(((x) & GENMASK(31, 20)) >> 20)
> > +#define MSCC_FDMA_DCB_STAT_PD			BIT(19)
> > +#define MSCC_FDMA_DCB_STAT_ABORT		BIT(18)
> > +#define MSCC_FDMA_DCB_STAT_EOF			BIT(17)
> > +#define MSCC_FDMA_DCB_STAT_SOF			BIT(16)
> > +#define MSCC_FDMA_DCB_STAT_BLOCKL_M		GENMASK(15, 0)
> > +#define MSCC_FDMA_DCB_STAT_BLOCKL(x)		((x) & GENMASK(15, 0))
> > +
> > +#define MSCC_FDMA_CH_SAFE			0xcc
> > +
> > +#define MSCC_FDMA_CH_ACTIVATE			0xd0
> > +
> > +#define MSCC_FDMA_CH_DISABLE			0xd4
> > +
> > +#define MSCC_FDMA_EVT_ERR			0x164
> > +
> > +#define MSCC_FDMA_EVT_ERR_CODE			0x168
> > +
> > +#define MSCC_FDMA_INTR_LLP			0x16c
> > +
> > +#define MSCC_FDMA_INTR_LLP_ENA			0x170
> > +
> > +#define MSCC_FDMA_INTR_FRM			0x174
> > +
> > +#define MSCC_FDMA_INTR_FRM_ENA			0x178
> > +
> > +#define MSCC_FDMA_INTR_ENA			0x184
> > +
> > +#define MSCC_FDMA_INTR_IDENT			0x188
> > +
> > +#define MSCC_FDMA_INJ_CHAN			2
> > +#define MSCC_FDMA_XTR_CHAN			0
> > +
> > +#define OCELOT_FDMA_RX_MTU			ETH_DATA_LEN
> > +#define OCELOT_FDMA_WEIGHT			32  
> 
> I guess you've reduced to half of NAPI_POLL_WEIGHT because the NET_RX
> softirq is consuming too much CPU time with the default value? I don't
> know if this is the productive thing to do with a very slow CPU that is
> swamped with traffic, since you're practically leaving yourself exposed
> to more interrupts, can somebody please chime in?

Yes, indeed. 

> 
> > +#define OCELOT_FDMA_RX_REFILL_COUNT		(OCELOT_FDMA_MAX_DCB / 2)  
> 
> Unused. I suppose you wanted to refill more than once per NAPI poll
> cycle (as you currently do in ocelot_fdma_rx_restart) but you didn't get
> around to it? I think you should still do that, don't leave the RX ring
> running dry.

Exactly, that's what I wanted to do originally but to refill the RX,
the RX channel must be disabled... which means it will probably drop
packet in the meantime if FIFOs are full. Not sure it will bring any
improvement.

> 
> > +
> > +#define OCELOT_FDMA_CH_SAFE_TIMEOUT_MS		100
> > +
> > +#define OCELOT_FDMA_RX_EXTRA_SIZE \
> > +				(OCELOT_TAG_LEN + ETH_FCS_LEN + ETH_HLEN)  
> 
> Could all these macros belong to ocelot_fdma.h?

Yes.

> 
> > +
> > +static int ocelot_fdma_rx_buf_size(int mtu)
> > +{
> > +	return ALIGN(mtu + OCELOT_FDMA_RX_EXTRA_SIZE, 4);
> > +}
> > +
> > +static void ocelot_fdma_writel(struct ocelot_fdma *fdma, u32 reg, u32 data)
> > +{
> > +	writel(data, fdma->base + reg);
> > +}  
> 
> Regmap is too slow for you, you're using direct I/O accessors now?

This was in the original patch I had, and I don't think it was intended
for performances. I will switch to regmap and check if it is noticeable
at all but I really don't think so as there are a few register read
only in rx path.

> 
> > +
> > +static u32 ocelot_fdma_readl(struct ocelot_fdma *fdma, u32 reg)
> > +{
> > +	return readl(fdma->base + reg);
> > +}
> > +
> > +static unsigned int ocelot_fdma_idx_incr(unsigned int idx)  
> 
> Minor comment, but "inc" and "dec" are much more popular abbreviations.
> Although the way in which you use them is not quite the same way in
> which other drivers use them (something called "inc" would take a
> reference on the number and actually increment it). So maybe "next" and
> "prev"?

Agreed, next and prev makes more sense.

> 
> > +{
> > +	idx++;
> > +	if (idx == OCELOT_FDMA_MAX_DCB)
> > +		idx = 0;
> > +
> > +	return idx;
> > +}
> > +
> > +static unsigned int ocelot_fdma_idx_decr(unsigned int idx)
> > +{
> > +	if (idx == 0)
> > +		idx = OCELOT_FDMA_MAX_DCB - 1;
> > +	else
> > +		idx--;
> > +
> > +	return idx;
> > +}
> > +
> > +static int ocelot_fdma_tx_free_count(struct ocelot_fdma *fdma)
> > +{
> > +	struct ocelot_fdma_ring *ring = &fdma->inj;
> > +
> > +	if (ring->tail >= ring->head)
> > +		return OCELOT_FDMA_MAX_DCB - (ring->tail - ring->head) - 1;
> > +	else
> > +		return ring->head - ring->tail - 1;
> > +}
> > +
> > +static bool ocelot_fdma_ring_empty(struct ocelot_fdma_ring *ring)
> > +{
> > +	return ring->head == ring->tail;
> > +}
> > +
> > +static void ocelot_fdma_activate_chan(struct ocelot_fdma *fdma,
> > +				      struct ocelot_fdma_dcb *dcb, int chan)
> > +{
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_DCB_LLP(chan), dcb->hw_dma);
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_ACTIVATE, BIT(chan));
> > +}
> > +
> > +static int ocelot_fdma_wait_chan_safe(struct ocelot_fdma *fdma, int chan)
> > +{
> > +	unsigned long timeout;
> > +	u32 safe;
> > +
> > +	timeout = jiffies + msecs_to_jiffies(OCELOT_FDMA_CH_SAFE_TIMEOUT_MS);
> > +	do {
> > +		safe = ocelot_fdma_readl(fdma, MSCC_FDMA_CH_SAFE);
> > +		if (safe & BIT(chan))
> > +			return 0;  
> 
> Pretty busy loop, and your timeout is 100 ms. Kinda nasty stuff for the
> latency of your system.

Ok, I will try to relax that a bit.

> 
> > +	} while (time_after(jiffies, timeout));
> > +
> > +	return -ETIMEDOUT;
> > +}
> > +
> > +static int ocelot_fdma_stop_channel(struct ocelot_fdma *fdma, int chan)
> > +{
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_DISABLE, BIT(chan));
> > +
> > +	return ocelot_fdma_wait_chan_safe(fdma, chan);  
> 
> Maybe for the extraction channel it would make sense to have an async
> stop, meaning that you ask it to stop, then process the frames you've
> got so far, and wait until the channel has completely stopped only when
> you need to refill?

That's an idea I could dig up yes. I haven't actually measured how much
tilme is spent busy looping.

> 
> > +}
> > +
> > +static bool ocelot_fdma_dcb_set_data(struct ocelot_fdma *fdma,
> > +				     struct ocelot_fdma_dcb *dcb,
> > +				     struct sk_buff *skb,
> > +				     size_t size, enum dma_data_direction dir)
> > +{
> > +	struct ocelot_fdma_dcb_hw_v2 *hw = dcb->hw;
> > +	u32 offset;
> > +
> > +	dcb->skb = skb;
> > +	dcb->mapped_size = size;
> > +	dcb->mapping = dma_map_single(fdma->dev, skb->data, size, dir);
> > +	if (unlikely(dma_mapping_error(fdma->dev, dcb->mapping)))
> > +		return false;
> > +
> > +	offset = dcb->mapping & 0x3;
> > +
> > +	hw->llp = 0;
> > +	hw->datap = ALIGN_DOWN(dcb->mapping, 4);
> > +	hw->datal = ALIGN_DOWN(size, 4);
> > +	hw->stat = MSCC_FDMA_DCB_STAT_BLOCKO(offset);
> > +
> > +	return true;
> > +}
> > +
> > +static bool ocelot_fdma_rx_set_skb(struct ocelot_fdma *fdma,
> > +				   struct ocelot_fdma_dcb *dcb,
> > +				   struct sk_buff *skb, size_t size)
> > +{
> > +	return ocelot_fdma_dcb_set_data(fdma, dcb, skb, size,
> > +					DMA_FROM_DEVICE);
> > +}
> > +
> > +static bool ocelot_fdma_tx_dcb_set_skb(struct ocelot_fdma *fdma,
> > +				       struct ocelot_fdma_dcb *dcb,
> > +				       struct sk_buff *skb)
> > +{
> > +	if (!ocelot_fdma_dcb_set_data(fdma, dcb, skb, skb->len,
> > +				      DMA_TO_DEVICE))
> > +		return false;
> > +
> > +	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_BLOCKL(skb->len);
> > +	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_SOF | MSCC_FDMA_DCB_STAT_EOF;
> > +
> > +	return true;
> > +}
> > +
> > +static void ocelot_fdma_rx_restart(struct ocelot_fdma *fdma)
> > +{
> > +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> > +	struct ocelot_fdma_dcb *dcb, *last_dcb;
> > +	unsigned int idx;
> > +	int ret;
> > +	u32 llp;
> > +
> > +	/* Check if the FDMA hits the DCB with LLP == NULL */
> > +	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_DCB_LLP(MSCC_FDMA_XTR_CHAN));
> > +	if (llp)
> > +		return;  
> 
> I'm not sure why you're letting the hardware grind to a halt first,
> before refilling? I think since the CPU is the bottleneck anyway, you
> can stop the extraction channel at any time you want to refill.
> A constant stream of less data might be better than a bursty one.
> Or maybe I'm misunderstanding some of the details of the hardware.

Indeed, I can stop the extraction channel but that does not seems a
good idea to stop the channel in a steady state. At least that's what I
thought since it will make the receive "window" non predictable. Not
sure how well it will play with various protocol but I will try
implementing the refill we talked previously (ie when there an
available threshold is reached).

> 
> > +
> > +	ret = ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
> > +	if (ret) {
> > +		dev_warn(fdma->dev, "Unable to stop RX channel\n");  
> 
> Rate limit these prints maybe.

Ok.

> 
> > +		return;
> > +	}
> > +
> > +	/* Chain the tail with the next DCB */
> > +	dcb = &ring->dcbs[ring->tail];
> > +	idx = ocelot_fdma_idx_incr(ring->tail);
> > +	dcb->hw->llp = ring->dcbs[idx].hw_dma;
> > +	dcb = &ring->dcbs[idx];
> > +
> > +	/* Place a NULL terminator in last DCB added (head - 1) */
> > +	idx = ocelot_fdma_idx_decr(ring->head);
> > +	last_dcb = &ring->dcbs[idx];
> > +	last_dcb->hw->llp = 0;
> > +	ring->tail = idx;
> > +
> > +	/* Finally reactivate the channel */
> > +	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_XTR_CHAN);
> > +}
> > +
> > +static bool ocelot_fdma_rx_get(struct ocelot_fdma *fdma, int budget)
> > +{
> > +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> > +	struct ocelot_fdma_dcb *dcb, *next_dcb;
> > +	struct ocelot *ocelot = fdma->ocelot;
> > +	struct net_device *ndev;
> > +	struct sk_buff *skb;
> > +	bool valid = true;
> > +	u64 timestamp;
> > +	u64 src_port;
> > +	void *xfh;
> > +	u32 stat;
> > +
> > +	/* We should not go past the tail */
> > +	if (ring->head == ring->tail)
> > +		return false;
> > +
> > +	dcb = &ring->dcbs[ring->head];
> > +	stat = dcb->hw->stat;
> > +	if (MSCC_FDMA_DCB_STAT_BLOCKL(stat) == 0)
> > +		return false;
> > +
> > +	ring->head = ocelot_fdma_idx_incr(ring->head);
> > +
> > +	if (stat & MSCC_FDMA_DCB_STAT_ABORT || stat & MSCC_FDMA_DCB_STAT_PD)
> > +		valid = false;
> > +
> > +	if (!(stat & MSCC_FDMA_DCB_STAT_SOF) ||
> > +	    !(stat & MSCC_FDMA_DCB_STAT_EOF))
> > +		valid = false;
> > +
> > +	dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
> > +			 DMA_FROM_DEVICE);
> > +
> > +	skb = dcb->skb;
> > +
> > +	if (unlikely(!valid)) {
> > +		dev_warn(fdma->dev, "Invalid packet\n");  
> 
> Rate limit please, and try to show something which might be relevant to
> why it is not valid.

Ok.

> 
> > +		goto refill;
> > +	}
> > +
> > +	xfh = skb->data;
> > +	ocelot_xfh_get_src_port(xfh, &src_port);
> > +
> > +	if (WARN_ON(src_port >= ocelot->num_phys_ports))
> > +		goto refill;
> > +
> > +	ndev = ocelot_port_to_netdev(ocelot, src_port);
> > +	if (unlikely(!ndev))
> > +		goto refill;
> > +
> > +	skb_put(skb, MSCC_FDMA_DCB_STAT_BLOCKL(stat) - ETH_FCS_LEN);
> > +	skb_pull(skb, OCELOT_TAG_LEN);
> > +
> > +	skb->dev = ndev;
> > +	skb->protocol = eth_type_trans(skb, skb->dev);
> > +	skb->dev->stats.rx_bytes += skb->len;
> > +	skb->dev->stats.rx_packets++;
> > +
> > +	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);  
> 
> You forgot to extract the "timestamp" from the XFH, and are providing
> junk from the kernel stack memory. Please make sure to test PTP.
> 
> > +
> > +	if (!skb_defer_rx_timestamp(skb))
> > +		netif_receive_skb(skb);
> > +
> > +	skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);
> > +	if (!skb)
> > +		return false;  
> 
> See my comment below, on the ocelot_fdma_rx_skb_alloc() function, on why
> I think you are making sub-optimal use of the ring concept.
> 
> > +
> > +refill:
> > +	if (!ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size))
> > +		return false;
> > +
> > +	/* Chain the next DCB */
> > +	next_dcb = &ring->dcbs[ring->head];
> > +	dcb->hw->llp = next_dcb->hw_dma;
> > +
> > +	return true;
> > +}
> > +
> > +static void ocelot_fdma_tx_cleanup(struct ocelot_fdma *fdma, int budget)
> > +{
> > +	struct ocelot_fdma_ring *ring = &fdma->inj;
> > +	unsigned int tmp_head, new_null_llp_idx;
> > +	struct ocelot_fdma_dcb *dcb;
> > +	bool end_of_list = false;
> > +	int ret;
> > +
> > +	spin_lock_bh(&fdma->xmit_lock);
> > +
> > +	/* Purge the TX packets that have been sent up to the NULL llp or the
> > +	 * end of done list.
> > +	 */
> > +	while (!ocelot_fdma_ring_empty(&fdma->inj)) {  
> 
> s/&fdma->inj/ring/
> 
> > +		dcb = &ring->dcbs[ring->head];
> > +		if (!(dcb->hw->stat & MSCC_FDMA_DCB_STAT_PD))
> > +			break;
> > +
> > +		tmp_head = ring->head;  
> 
> Unused.
> 
> > +		ring->head = ocelot_fdma_idx_incr(ring->head);
> > +
> > +		dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
> > +				 DMA_TO_DEVICE);
> > +		napi_consume_skb(dcb->skb, budget);
> > +
> > +		/* If we hit the NULL LLP, stop, we might need to reload FDMA */
> > +		if (dcb->hw->llp == 0) {
> > +			end_of_list = true;
> > +			break;
> > +		}
> > +	}
> > +
> > +	/* If there is still some DCBs to be processed by the FDMA or if the
> > +	 * pending list is empty, there is no need to restart the FDMA.
> > +	 */  
> 
> I don't understand why you restart the injection channel from the TX
> confirmation interrupt. It raised the interrupt to tell you that it hit
> a NULL LLP because there's nothing left to send. If you restart it now and
> no other transmission has happened in the meantime, won't it stop again?

Actually, it is only restarted if there is some pending packets to
send. With this hardware, packets can't be added while the FDMA is
running and it must be stopped everytime we want to add a packet to the
list. To avoid that, in the TX path, if the FDMA is stopped, we set the
llp of the packet to NULL and start the chan. However, if the FDMA TX
channel is running, we don't stop it, we simply add the next packets to
the ring. However, the FDMA will stop on the previous NULL LLP. So when
we hit a LLP, we might not be at the end of the list. This is why the
next check verifies if we hit a NULL LLP and if there is still some
packet to send. 

> 
> > +	if (!end_of_list || ocelot_fdma_ring_empty(&fdma->inj))  
> 
> s/&fdma->inj/ring/
> 
> > +		goto out_unlock;
> > +
> > +	ret = ocelot_fdma_wait_chan_safe(fdma, MSCC_FDMA_INJ_CHAN);
> > +	if (ret) {
> > +		dev_warn(fdma->dev, "Failed to wait for TX channel to stop\n");
> > +		goto out_unlock;
> > +	}
> > +
> > +	/* Set NULL LLP */
> > +	new_null_llp_idx = ocelot_fdma_idx_decr(ring->tail);
> > +	dcb = &ring->dcbs[new_null_llp_idx];
> > +	dcb->hw->llp = 0;
> > +
> > +	dcb = &ring->dcbs[ring->head];
> > +	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);  
> 
> 	if (unlikely(netif_queue_stopped(dev)) &&
> 	    ocelot_fdma_tx_free_count(fdma))
> 		netif_wake_queue(dev);
> 
> This can then be tweaked for when you add support for scatter/gather xmit.

Acked. Note that this will loop an all netdev since the FDMA channels
are shared for all ports. BTW, I think this code should be called
before restarting the channel (ie after cleaning up the TX ring above).

> 
> > +
> > +out_unlock:
> > +	spin_unlock_bh(&fdma->xmit_lock);
> > +}
> > +
> > +static int ocelot_fdma_napi_poll(struct napi_struct *napi, int budget)
> > +{
> > +	struct ocelot_fdma *fdma = container_of(napi, struct ocelot_fdma, napi);
> > +	int work_done = 0;
> > +
> > +	ocelot_fdma_tx_cleanup(fdma, budget);
> > +
> > +	while (work_done < budget) {
> > +		if (!ocelot_fdma_rx_get(fdma, budget))
> > +			break;
> > +
> > +		work_done++;
> > +	}
> > +
> > +	ocelot_fdma_rx_restart(fdma);
> > +
> > +	if (work_done < budget) {
> > +		napi_complete_done(&fdma->napi, work_done);
> > +		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
> > +				   BIT(MSCC_FDMA_INJ_CHAN) |
> > +				   BIT(MSCC_FDMA_XTR_CHAN));
> > +	}
> > +
> > +	return work_done;
> > +}
> > +
> > +static irqreturn_t ocelot_fdma_interrupt(int irq, void *dev_id)
> > +{
> > +	u32 ident, llp, frm, err, err_code;
> > +	struct ocelot_fdma *fdma = dev_id;
> > +
> > +	ident = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_IDENT);
> > +	frm = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_FRM);
> > +	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_LLP);
> > +
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, llp & ident);
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, frm & ident);
> > +	if (frm || llp) {
> > +		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> > +		napi_schedule(&fdma->napi);
> > +	}
> > +
> > +	err = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR);
> > +	if (unlikely(err)) {
> > +		err_code = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR_CODE);
> > +		dev_err_ratelimited(fdma->dev,
> > +				    "Error ! chans mask: %#x, code: %#x\n",
> > +				    err, err_code);
> > +
> > +		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR, err);
> > +		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR_CODE, err_code);
> > +	}
> > +
> > +	return IRQ_HANDLED;
> > +}
> > +
> > +static void ocelot_fdma_send_skb(struct ocelot_fdma *fdma, struct sk_buff *skb)
> > +{
> > +	struct ocelot_fdma_ring *ring = &fdma->inj;
> > +	struct ocelot_fdma_dcb *dcb, *next;
> > +
> > +	dcb = &ring->dcbs[ring->tail];
> > +	if (!ocelot_fdma_tx_dcb_set_skb(fdma, dcb, skb)) {
> > +		dev_kfree_skb_any(skb);
> > +		return;
> > +	}
> > +
> > +	if (ocelot_fdma_ring_empty(&fdma->inj)) {  
> 
> s/&fdma->inj/ring/
> 
> > +		ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);
> > +	} else {
> > +		next = &ring->dcbs[ocelot_fdma_idx_incr(ring->tail)];
> > +		dcb->hw->llp = next->hw_dma;
> > +	}
> > +
> > +	ring->tail = ocelot_fdma_idx_incr(ring->tail);  
> 
> You still have locking between TX and TX conf, that's too bad. Why is
> that, I wonder? Because TX conf (ocelot_fdma_tx_cleanup) updates
> ring->head and TX (ocelot_fdma_send_skb) updates ring->tail. Could it be
> because you're updating the ring->tail _after_ you've activated the
> injection channel, therefore exposing you to a race with the completion
> interrupt which reads ring->tail?

Since the FDMA is shared between multiple netdevs, I think the TX
locking might be needed only in xmit when checking if there is space
to send a packet.
I think it can indeed be left out in the TX cleanup path.

> 
> > +
> > +	skb_tx_timestamp(skb);
> > +}
> > +
> > +static int ocelot_fdma_prepare_skb(struct ocelot_fdma *fdma, int port,
> > +				   u32 rew_op, struct sk_buff *skb,
> > +				   struct net_device *dev)
> > +{
> > +	int needed_headroom = max_t(int, OCELOT_TAG_LEN - skb_headroom(skb), 0);
> > +	int needed_tailroom = max_t(int, ETH_FCS_LEN - skb_tailroom(skb), 0);
> > +	struct ocelot_port *ocelot_port = fdma->ocelot->ports[port];
> > +	void *ifh;
> > +	int err;
> > +
> > +	if (unlikely(needed_headroom || needed_tailroom ||
> > +		     skb_header_cloned(skb))) {
> > +		err = pskb_expand_head(skb, needed_headroom, needed_tailroom,
> > +				       GFP_ATOMIC);
> > +		if (unlikely(err)) {
> > +			dev_kfree_skb_any(skb);
> > +			return 1;
> > +		}
> > +	}
> > +
> > +	err = skb_linearize(skb);
> > +	if (err) {
> > +		net_err_ratelimited("%s: skb_linearize error (%d)!\n",
> > +				    dev->name, err);
> > +		dev_kfree_skb_any(skb);
> > +		return 1;
> > +	}
> > +
> > +	ifh = skb_push(skb, OCELOT_TAG_LEN);
> > +	skb_put(skb, ETH_FCS_LEN);
> > +	ocelot_ifh_port_set(ifh, ocelot_port, rew_op, skb_vlan_tag_get(skb));
> > +
> > +	return 0;
> > +}
> > +
> > +int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
> > +			     struct sk_buff *skb, struct net_device *dev)
> > +{
> > +	int ret = NETDEV_TX_OK;
> > +
> > +	spin_lock(&fdma->xmit_lock);
> > +
> > +	if (ocelot_fdma_tx_free_count(fdma) == 0) {
> > +		ret = NETDEV_TX_BUSY;  
> 
> netif_stop_queue(dev);
> 
> > +		goto out;
> > +	}
> > +
> > +	if (ocelot_fdma_prepare_skb(fdma, port, rew_op, skb, dev))
> > +		goto out;
> > +
> > +	ocelot_fdma_send_skb(fdma, skb);
> > +
> > +out:
> > +	spin_unlock(&fdma->xmit_lock);
> > +
> > +	return ret;
> > +}
> > +
> > +static void ocelot_fdma_ring_free(struct ocelot_fdma *fdma,
> > +				  struct ocelot_fdma_ring *ring)
> > +{
> > +	dmam_free_coherent(fdma->dev, OCELOT_DCBS_HW_ALLOC_SIZE, ring->hw_dcbs,
> > +			   ring->hw_dcbs_dma);
> > +}
> > +
> > +static int ocelot_fdma_ring_alloc(struct ocelot_fdma *fdma,
> > +				  struct ocelot_fdma_ring *ring)
> > +{
> > +	struct ocelot_fdma_dcb_hw_v2 *hw_dcbs;
> > +	struct ocelot_fdma_dcb *dcb;
> > +	dma_addr_t hw_dcbs_dma;
> > +	unsigned int adjust;
> > +	int i;
> > +
> > +	/* Create a pool of consistent memory blocks for hardware descriptors */
> > +	ring->hw_dcbs = dmam_alloc_coherent(fdma->dev,
> > +					    OCELOT_DCBS_HW_ALLOC_SIZE,
> > +					    &ring->hw_dcbs_dma, GFP_KERNEL);
> > +	if (!ring->hw_dcbs)
> > +		return -ENOMEM;
> > +
> > +	/* DCBs must be aligned on a 32bit boundary */
> > +	hw_dcbs = ring->hw_dcbs;
> > +	hw_dcbs_dma = ring->hw_dcbs_dma;
> > +	if (!IS_ALIGNED(hw_dcbs_dma, 4)) {
> > +		adjust = hw_dcbs_dma & 0x3;
> > +		hw_dcbs_dma = ALIGN(hw_dcbs_dma, 4);
> > +		hw_dcbs = (void *)hw_dcbs + adjust;
> > +	}
> > +
> > +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> > +		dcb = &ring->dcbs[i];
> > +		dcb->hw = &hw_dcbs[i];
> > +		dcb->hw_dma = hw_dcbs_dma +
> > +			     i * sizeof(struct ocelot_fdma_dcb_hw_v2);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int ocelot_fdma_rx_skb_alloc(struct ocelot_fdma *fdma)
> > +{
> > +	struct ocelot_fdma_dcb *dcb, *prev_dcb = NULL;
> > +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> > +	struct sk_buff *skb;
> > +	int i;
> > +
> > +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> > +		dcb = &ring->dcbs[i];
> > +		skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);  
> 
> I have to tell you, skb allocation at probe time is something I haven't
> seen before, I'll have to defer to somebody else for a second opinion.
> I understand that you keep the sk_buff structure closely tied to the DCB
> structure, whereas normally you'd see only DCB structures in the RX ring.
> 
> And napi_alloc_skb? This isn't NAPI context.
> 
> The whole idea (as far as I understand it!) with a receive ring is that
> hardware produces data on one side of it, and software consumes from
> that side too and in the same direction, just lags behind a little bit.
> Software must also provide buffers at the opposite end, in which
> hardware will put the produced data, so that the pipeline never runs
> out. You'll notice that it is standard amongst ring based drivers to
> name your ring->head as ring->next_to_clean, and your ring->tail as
> ring->next_to_use (actually not quite: your hardware doesn't really
> provide producer and consumer indices, and the way in which you update
> the ring->tail past the NULL LLP pointer is a bit different from the way
> in which drivers use ring->next_to_use and more like ring->next_to_alloc,
> but more on that later). I think people reviewing your code will instantly
> know what it's about if you name and structure things in this way,
> perhaps the lack of consumer and producer indices is merely an
> irrelevant detail.
> 
> As long as the ring size is large enough in order to give software some
> time to react, you can reach a state of equilibrium where you don't need
> to allocate or DMA map any new buffer, you can just recycle the ones you
> already have. To use buffer recycling, you need to replace alloc_skb()
> with functions such as build_skb(), which builds an sk_buff structure
> around a pre-existing buffer.
> 
> One simple way to achieve this is the page flipping technique. There may
> be others and you don't have to use this one. The principle is as
> follows: you allocate and DMA map buffers double the size you need
> (PAGE_SIZE is a common choice, divided by two you get about 2K of data
> per buffer, for frames larger than that you should do scatter/gather)
> and you populate an RX DCB with just one half of that page, the other
> half is unused for now. You repeat this process until you've filled up
> the RX ring with sufficient DCB entries, and all of this happens at
> initialization time.
> 
> Then, when you process a frame from the RX DCB ring (in the NAPI poll
> function), you construct a skb around that page half, and before giving
> it to the network stack, you speculatively attempt to reuse the other
> half of the page by putting it back into the RX DCB ring, at the
> opposite end compared to where you're consuming the other half from.
> Ring-based drivers use a separate ring->next_to_alloc variable for this.
> DCB elements ranging from ring->next_to_use and up to ring->next_to_alloc
> are simply halves of pages that have been processed by your NAPI, and
> are ready to be committed to hardware when you need to refill (in your
> case update the LLP pointer).
> 
> So each page could have up to two users (references), and you need to be
> very careful how you deal with concurrency with the stack. As Claudiu
> explained in great detail to me when I was first studying this
> mechanism, the driver is the producer of these RX pages and the network
> stack is the consumer. And since the network stack will eventually call
> kfree_skb() after it's done, which ultimately results into a put_page(),
> you need to counteract that action and ensure that if you want to recycle
> the other half of the page, the page's refcount never reaches zero.
> 
> As a result, if you deem the other page half as good for recycling, you
> need to bump the reference count of the page from 1 to 2. This will make
> it safe for you to refill the DCB ring with that other half.
> 
> Of course, the other half of the page might not be available for
> recycling, this is why the technique is speculative. If the stack hasn't
> yet called kfree_skb() on the other half, the page reference count will
> be 2 at the time you're cleaning the buffer, and bad luck, you can no
> longer reuse this page, so the driver needs to take its hands off of it.
> 
> If you look at driver implementations of the half page flip heuristic,
> you'll see that "taking your hands off the page" means simply to DMA
> unmap the entire page. This might surprise you, after all, the reference
> count of the page is 2, you might think that the page will leak if you
> don't decrement one of the references on it, or something. But if you
> think about it, the reference count is 2 because one half has an skb
> built at an earlier time around it, which is still in the network stack
> there somewhere pending a kfree_skb(), and the other half is the buffer
> you're cleaning right now. This half is practically promised to the
> network stack, you'll create an skb around this half too, and the stack
> will kfree it too, and the refcount of the page will thus drop to zero.
> 
> Of course, when you need to refill the RX ring with, say, 32 buffers,
> you might find that (ring->next_to_alloc - ring->next_to_use) mod ring size
> is less than 32 (otherwise said, the buffer reuse technique couldn't
> provide enough buffers). So you need to alloc and DMA map new pages from
> ring->next_to_alloc up to 32 (but this is still in contrast with your
> current implementation which always calls napi_alloc_skb 32 times).
> Nonetheless, in my experience, the technique works very well in real
> life situations and is a very good way to reduce the pressure on the
> memory allocator and avoid costly DMA mapping and unmapping per packet.
> Since you've already went through the trouble of making this a
> ring-based driver, I believe you should try to implement a buffer reuse
> technique too, especially for this particular case of a very slow
> processor.

Ok, I see the idea, thanks for the in-depth explanation.

> 
> > +		if (!skb)
> > +			goto skb_alloc_failed;
> > +
> > +		ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size);
> > +
> > +		if (prev_dcb)
> > +			prev_dcb->hw->llp = dcb->hw_dma;
> > +
> > +		prev_dcb = dcb;
> > +	}
> > +
> > +	ring->head = 0;
> > +	ring->tail = OCELOT_FDMA_MAX_DCB - 1;
> > +
> > +	return 0;
> > +
> > +skb_alloc_failed:
> > +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> > +		dcb = &ring->dcbs[i];
> > +		if (!dcb->skb)
> > +			break;
> > +
> > +		dev_kfree_skb_any(dcb->skb);
> > +	}
> > +
> > +	return -ENOMEM;
> > +}
> > +
> > +static int ocelot_fdma_rx_init(struct ocelot_fdma *fdma)
> > +{
> > +	int ret;
> > +
> > +	fdma->rx_buf_size = ocelot_fdma_rx_buf_size(OCELOT_FDMA_RX_MTU);
> > +
> > +	ret = ocelot_fdma_rx_skb_alloc(fdma);
> > +	if (ret) {
> > +		netif_napi_del(&fdma->napi);
> > +		return ret;
> > +	}
> > +
> > +	napi_enable(&fdma->napi);
> > +
> > +	ocelot_fdma_activate_chan(fdma, &fdma->xtr.dcbs[0],
> > +				  MSCC_FDMA_XTR_CHAN);
> > +
> > +	return 0;
> > +}
> > +
> > +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev)
> > +{
> > +	dev->needed_headroom = OCELOT_TAG_LEN;
> > +	dev->needed_tailroom = ETH_FCS_LEN;  
> 
> The needed_headroom is in no way specific to FDMA, right? Why aren't you
> doing it for manual register-based injection too? (in a separate patch ofc)

Actually, If I switch to page based ring, This won't be useful anymore
because the header will be written directly in the page and not anymore
directly in the skb header.

> 
> > +
> > +	if (fdma->ndev)
> > +		return;
> > +
> > +	fdma->ndev = dev;
> > +	netif_napi_add(dev, &fdma->napi, ocelot_fdma_napi_poll,
> > +		       OCELOT_FDMA_WEIGHT);  
> 
> I understand that NAPI is per netdev but you have a single interrupt so
> you need to share the NAPI instance for all ports. That is fine.
> But danger ahead, see this:
> 
> mscc_ocelot_init_ports() does:
> 
> 		err = ocelot_probe_port(ocelot, port, target, portnp);
> 		if (err) {
> 			ocelot_port_devlink_teardown(ocelot, port);
> 			continue;
> 		}
> 
> aka it skips over ports that failed to probe.
> And ocelot_probe_port does:
> 
> 	if (ocelot->fdma)
> 		ocelot_fdma_netdev_init(ocelot->fdma, dev);
> 
> 	err = register_netdev(dev);
> 	if (err) {
> 		dev_err(ocelot->dev, "register_netdev failed\n");
> 		goto out;
> 	}
> 
> So if register_netdev() fails, you will have a dangling, freed pointer
> inside fdma->ndev. That is not good, as far as I can tell. Try to make
> the probing of your first port fail at register_netdev() time, to see
> what I mean.

Ok, I will look at that.

> 
> > +}
> > +
> > +void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma, struct net_device *dev)
> > +{
> > +	if (dev == fdma->ndev)
> > +		netif_napi_del(&fdma->napi);
> > +}
> > +
> > +struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
> > +				     struct ocelot *ocelot)
> > +{
> > +	struct ocelot_fdma *fdma;
> > +	void __iomem *base;
> > +	int ret;
> > +
> > +	base = devm_platform_ioremap_resource_byname(pdev, "fdma");
> > +	if (IS_ERR_OR_NULL(base))
> > +		return NULL;
> > +
> > +	fdma = devm_kzalloc(&pdev->dev, sizeof(*fdma), GFP_KERNEL);
> > +	if (!fdma)
> > +		goto err_release_resource;
> > +
> > +	fdma->ocelot = ocelot;
> > +	fdma->base = base;
> > +	fdma->dev = &pdev->dev;
> > +	fdma->dev->coherent_dma_mask = DMA_BIT_MASK(32);
> > +
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> > +
> > +	fdma->irq = platform_get_irq_byname(pdev, "fdma");
> > +	ret = devm_request_irq(&pdev->dev, fdma->irq, ocelot_fdma_interrupt, 0,
> > +			       dev_name(&pdev->dev), fdma);
> > +	if (ret)
> > +		goto err_free_fdma;
> > +
> > +	ret = ocelot_fdma_ring_alloc(fdma, &fdma->inj);
> > +	if (ret)
> > +		goto err_free_irq;
> > +
> > +	ret = ocelot_fdma_ring_alloc(fdma, &fdma->xtr);
> > +	if (ret)
> > +		goto free_inj_ring;
> > +
> > +	return fdma;
> > +
> > +free_inj_ring:
> > +	ocelot_fdma_ring_free(fdma, &fdma->inj);
> > +err_free_irq:
> > +	devm_free_irq(&pdev->dev, fdma->irq, fdma);
> > +err_free_fdma:
> > +	devm_kfree(&pdev->dev, fdma);
> > +err_release_resource:
> > +	devm_iounmap(&pdev->dev, base);
> > +
> > +	return NULL;
> > +}
> > +
> > +int ocelot_fdma_start(struct ocelot_fdma *fdma)
> > +{
> > +	struct ocelot *ocelot = fdma->ocelot;
> > +	int ret;
> > +
> > +	ret = ocelot_fdma_rx_init(fdma);
> > +	if (ret)
> > +		return -EINVAL;
> > +
> > +	/* Reconfigure for extraction and injection using DMA */
> > +	ocelot_write_rix(ocelot, QS_INJ_GRP_CFG_MODE(2), QS_INJ_GRP_CFG, 0);
> > +	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(0), QS_INJ_CTRL, 0);
> > +
> > +	ocelot_write_rix(ocelot, QS_XTR_GRP_CFG_MODE(2), QS_XTR_GRP_CFG, 0);
> > +
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, 0xffffffff);
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, 0xffffffff);
> > +
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP_ENA,
> > +			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM_ENA, BIT(MSCC_FDMA_XTR_CHAN));
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
> > +			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
> > +
> > +	return 0;
> > +}
> > +
> > +int ocelot_fdma_stop(struct ocelot_fdma *fdma)  
> 
> This should return void.
> 
> > +{
> > +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> > +	struct ocelot_fdma_dcb *dcb;
> > +	int i;
> > +
> > +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> > +
> > +	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
> > +	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_INJ_CHAN);
> > +
> > +	/* Free the SKB hold in the extraction ring */
> > +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> > +		dcb = &ring->dcbs[i];
> > +		dev_kfree_skb_any(dcb->skb);
> > +	}
> > +
> > +	napi_synchronize(&fdma->napi);
> > +	napi_disable(&fdma->napi);
> > +
> > +	return 0;
> > +}
> > diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.h b/drivers/net/ethernet/mscc/ocelot_fdma.h
> > new file mode 100644
> > index 000000000000..b6f1dda0e0c7
> > --- /dev/null
> > +++ b/drivers/net/ethernet/mscc/ocelot_fdma.h
> > @@ -0,0 +1,96 @@
> > +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
> > +/*
> > + * Microsemi SoCs FDMA driver
> > + *
> > + * Copyright (c) 2021 Microchip
> > + */
> > +#ifndef _MSCC_OCELOT_FDMA_H_
> > +#define _MSCC_OCELOT_FDMA_H_
> > +
> > +#include "ocelot.h"
> > +
> > +#define OCELOT_FDMA_MAX_DCB		128
> > +/* +4 allows for word alignment after allocation */
> > +#define OCELOT_DCBS_HW_ALLOC_SIZE	(OCELOT_FDMA_MAX_DCB * \
> > +					 sizeof(struct ocelot_fdma_dcb_hw_v2) + \
> > +					 4)
> > +
> > +struct ocelot_fdma_dcb_hw_v2 {
> > +	u32 llp;
> > +	u32 datap;
> > +	u32 datal;
> > +	u32 stat;
> > +};  
> 
> Could you declare this using __attribute((packed)) to show that you're
> mapping it over hardware?

Ok.

> 
> > +
> > +/**
> > + * struct ocelot_fdma_dcb - Software DCBs description
> > + *
> > + * @hw: hardware DCB used by hardware(coherent memory)
> > + * @hw_dma: DMA address of the DCB
> > + * @skb: skb associated with the DCB
> > + * @mapping: Address of the skb data mapping
> > + * @mapped_size: Mapped size
> > + */
> > +struct ocelot_fdma_dcb {
> > +	struct ocelot_fdma_dcb_hw_v2	*hw;
> > +	dma_addr_t			hw_dma;
> > +	struct sk_buff			*skb;
> > +	dma_addr_t			mapping;
> > +	size_t				mapped_size;
> > +};
> > +
> > +/**
> > + * struct ocelot_fdma_ring - "Ring" description of DCBs
> > + *
> > + * @hw_dcbs: Hardware DCBs allocated for the ring
> > + * @hw_dcbs_dma: DMA address of the DCBs
> > + * @dcbs: List of software DCBs
> > + * @head: pointer to first available DCB
> > + * @tail: pointer to last available DCB
> > + */
> > +struct ocelot_fdma_ring {
> > +	struct ocelot_fdma_dcb_hw_v2	*hw_dcbs;
> > +	dma_addr_t			hw_dcbs_dma;
> > +	struct ocelot_fdma_dcb		dcbs[OCELOT_FDMA_MAX_DCB];
> > +	unsigned int			head;
> > +	unsigned int			tail;
> > +};
> > +
> > +/**
> > + * struct ocelot_fdma - FMDA struct  
> 
> s/FMDA/FDMA/
> 
> > + *
> > + * @ocelot: Pointer to ocelot struct
> > + * @base: base address of FDMA registers
> > + * @irq: FDMA interrupt
> > + * @dev: Ocelot device
> > + * @napi: napi handle
> > + * @rx_buf_size: Size of RX buffer
> > + * @inj: Injection ring
> > + * @xtr: Extraction ring
> > + * @xmit_lock: Xmit lock
> > + *
> > + */
> > +struct ocelot_fdma {
> > +	struct ocelot			*ocelot;  
> 
> To me, this structure organization in which "struct ocelot_fdma *" is
> passed as argument to all FDMA functions, instead of "struct ocelot *",
> is strange, and leads to oddities such as this backpointer right here.
> Do it in whichever way you want, I'm just pointing this out.

Ok, I'll probably pass a ocelot pointer for all of them to be more
coherent.

> 
> > +	void __iomem			*base;
> > +	int				irq;
> > +	struct device			*dev;
> > +	struct napi_struct		napi;
> > +	struct net_device		*ndev;
> > +	size_t				rx_buf_size;
> > +	struct ocelot_fdma_ring		inj;
> > +	struct ocelot_fdma_ring		xtr;
> > +	spinlock_t			xmit_lock;
> > +};
> > +
> > +struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
> > +				     struct ocelot *ocelot);
> > +int ocelot_fdma_start(struct ocelot_fdma *fdma);
> > +int ocelot_fdma_stop(struct ocelot_fdma *fdma);
> > +int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
> > +			     struct sk_buff *skb, struct net_device *dev);
> > +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev);
> > +void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma,
> > +			       struct net_device *dev);
> > +
> > +#endif
> > diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
> > index b589ae95e29b..9dcaf421da12 100644
> > --- a/drivers/net/ethernet/mscc/ocelot_net.c
> > +++ b/drivers/net/ethernet/mscc/ocelot_net.c
> > @@ -15,6 +15,7 @@
> >  #include <net/pkt_cls.h>
> >  #include "ocelot.h"
> >  #include "ocelot_vcap.h"
> > +#include "ocelot_fdma.h"
> >  
> >  #define OCELOT_MAC_QUIRKS	OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP
> >  
> > @@ -457,7 +458,7 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	int port = priv->chip_port;
> >  	u32 rew_op = 0;
> >  
> > -	if (!ocelot_can_inject(ocelot, 0))
> > +	if (!ocelot->fdma && !ocelot_can_inject(ocelot, 0))
> >  		return NETDEV_TX_BUSY;
> >  
> >  	/* Check if timestamping is needed */
> > @@ -475,9 +476,13 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
> >  		rew_op = ocelot_ptp_rew_op(skb);
> >  	}
> >  
> > -	ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
> > +	if (ocelot->fdma) {
> > +		ocelot_fdma_inject_frame(ocelot->fdma, port, rew_op, skb, dev);
> > +	} else {
> > +		ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);  
> 
> I can't help but think how painful it is that with a CPU as slow as
> yours, insult over injury, you also need to check for each packet
> whether the device tree had defined the "fdma" region or not, because
> you practically keep two traffic I/O implementations due to that sole
> reason. I think for the ocelot switchdev driver, which is strictly for
> MIPS CPUs embedded within the device, it should be fine to introduce a
> static key here (search for static_branch_likely in the kernel).

I thinked about it *but* did not wanted to add a key since it would be
global. However, we could consider that there is always only one
instance of the driver and indeed a static key is an option.
Unfortunately, I'm not sure this will yield any noticeable performance
improvement.

> 
> >  
> > -	kfree_skb(skb);
> > +		consume_skb(skb);
> > +	}
> >  
> >  	return NETDEV_TX_OK;
> >  }
> > @@ -1717,6 +1722,9 @@ int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
> >  	if (err)
> >  		goto out;
> >  
> > +	if (ocelot->fdma)
> > +		ocelot_fdma_netdev_init(ocelot->fdma, dev);
> > +
> >  	err = register_netdev(dev);
> >  	if (err) {
> >  		dev_err(ocelot->dev, "register_netdev failed\n");
> > @@ -1737,9 +1745,13 @@ void ocelot_release_port(struct ocelot_port *ocelot_port)
> >  	struct ocelot_port_private *priv = container_of(ocelot_port,
> >  						struct ocelot_port_private,
> >  						port);
> > +	struct ocelot_fdma *fdma = ocelot_port->ocelot->fdma;
> >  
> >  	unregister_netdev(priv->dev);
> >  
> > +	if (fdma)
> > +		ocelot_fdma_netdev_deinit(fdma, priv->dev);
> > +
> >  	if (priv->phylink) {
> >  		rtnl_lock();
> >  		phylink_disconnect_phy(priv->phylink);
> > diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> > index 38103b0255b0..fa68eb23a333 100644
> > --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> > +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> > @@ -18,6 +18,7 @@
> >  
> >  #include <soc/mscc/ocelot_vcap.h>
> >  #include <soc/mscc/ocelot_hsio.h>
> > +#include "ocelot_fdma.h"
> >  #include "ocelot.h"
> >  
> >  static const u32 ocelot_ana_regmap[] = {
> > @@ -1080,6 +1081,10 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
> >  		ocelot->targets[io_target[i].id] = target;
> >  	}
> >  
> > +	ocelot->fdma = ocelot_fdma_init(pdev, ocelot);
> > +	if (IS_ERR(ocelot->fdma))
> > +		ocelot->fdma = NULL;
> > +
> >  	hsio = syscon_regmap_lookup_by_compatible("mscc,ocelot-hsio");
> >  	if (IS_ERR(hsio)) {
> >  		dev_err(&pdev->dev, "missing hsio syscon\n");
> > @@ -1139,6 +1144,12 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
> >  	if (err)
> >  		goto out_ocelot_devlink_unregister;
> >  
> > +	if (ocelot->fdma) {
> > +		err = ocelot_fdma_start(ocelot->fdma);
> > +		if (err)
> > +			goto out_ocelot_release_ports;
> > +	}
> > +
> >  	err = ocelot_devlink_sb_register(ocelot);
> >  	if (err)
> >  		goto out_ocelot_release_ports;
> > @@ -1179,6 +1190,8 @@ static int mscc_ocelot_remove(struct platform_device *pdev)
> >  {
> >  	struct ocelot *ocelot = platform_get_drvdata(pdev);
> >  
> > +	if (ocelot->fdma)
> > +		ocelot_fdma_stop(ocelot->fdma);
> >  	devlink_unregister(ocelot->devlink);
> >  	ocelot_deinit_timestamp(ocelot);
> >  	ocelot_devlink_sb_unregister(ocelot);
> > diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
> > index b3381c90ff3e..351ab385ab98 100644
> > --- a/include/soc/mscc/ocelot.h
> > +++ b/include/soc/mscc/ocelot.h
> > @@ -695,6 +695,8 @@ struct ocelot {
> >  	/* Protects the PTP clock */
> >  	spinlock_t			ptp_clock_lock;
> >  	struct ptp_pin_desc		ptp_pins[OCELOT_PTP_PINS_NUM];
> > +
> > +	struct ocelot_fdma		*fdma;
> >  };
> >  
> >  struct ocelot_policer {
> > @@ -761,6 +763,8 @@ void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
> >  			 u32 vlan_tag);
> >  int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
> >  void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
> > +void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
> > +			     u64 timestamp);
> >  
> >  /* Hardware initialization */
> >  int ocelot_regfields_init(struct ocelot *ocelot,
> > -- 
> > 2.33.1
>   

Thanks for the review.
Vladimir Oltean Nov. 29, 2021, 5:40 p.m. UTC | #4
On Mon, Nov 29, 2021 at 09:19:02AM +0100, Clément Léger wrote:
> > I'm not sure why you're letting the hardware grind to a halt first,
> > before refilling? I think since the CPU is the bottleneck anyway, you
> > can stop the extraction channel at any time you want to refill.
> > A constant stream of less data might be better than a bursty one.
> > Or maybe I'm misunderstanding some of the details of the hardware.
> 
> Indeed, I can stop the extraction channel but that does not seems a
> good idea to stop the channel in a steady state. At least that's what I
> thought since it will make the receive "window" non predictable. Not
> sure how well it will play with various protocol but I will try
> implementing the refill we talked previously (ie when there an
> available threshold is reached).
(...)
> > I don't understand why you restart the injection channel from the TX
> > confirmation interrupt. It raised the interrupt to tell you that it hit
> > a NULL LLP because there's nothing left to send. If you restart it now and
> > no other transmission has happened in the meantime, won't it stop again?
> 
> Actually, it is only restarted if there is some pending packets to
> send. With this hardware, packets can't be added while the FDMA is
> running and it must be stopped everytime we want to add a packet to the
> list. To avoid that, in the TX path, if the FDMA is stopped, we set the
> llp of the packet to NULL and start the chan. However, if the FDMA TX
> channel is running, we don't stop it, we simply add the next packets to
> the ring. However, the FDMA will stop on the previous NULL LLP. So when
> we hit a LLP, we might not be at the end of the list. This is why the
> next check verifies if we hit a NULL LLP and if there is still some
> packet to send. 

Oh, is that so? That would be pretty odd if the hardware is so dumb that
it doesn't detect changes made to an LLP on the go.

The manual has this to say, and I'm not sure how to interpret it:

| It is possible to update an active channels LLP pointer and pointers in
| the DCB chains. Before changing pointers software must schedule the
| channel for disabling (by writing FDMA_CH_DISABLE.CH_DISABLE[ch]) and
| then wait for the channel to set FDMA_CH_SAFE.CH_SAFE[ch]. When the
| pointer update is complete, soft must re-activate the channel by setting
| FDMA_CH_ACTIVATE.CH_ACTIVATE[ch]. Setting activate will cancel the
| deactivate-request, or if the channel has disabled itself in the
| meantime, it will re activate the channel.

So it is possible to update an active channel's LLP pointer, but not
while it's active? Thank you very much!

If true, this will severely limit the termination performance one is
able to obtain with this switch, even with a faster CPU and PCIe.

> > > +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev)
> > > +{
> > > +	dev->needed_headroom = OCELOT_TAG_LEN;
> > > +	dev->needed_tailroom = ETH_FCS_LEN;  
> > 
> > The needed_headroom is in no way specific to FDMA, right? Why aren't you
> > doing it for manual register-based injection too? (in a separate patch ofc)
> 
> Actually, If I switch to page based ring, This won't be useful anymore
> because the header will be written directly in the page and not anymore
> directly in the skb header.

I don't understand this comment. You set up the needed headroom and
tailroom netdev variables to avoid reallocation on TX, not for RX.
And you use half page buffers for RX, not for TX.

> > I can't help but think how painful it is that with a CPU as slow as
> > yours, insult over injury, you also need to check for each packet
> > whether the device tree had defined the "fdma" region or not, because
> > you practically keep two traffic I/O implementations due to that sole
> > reason. I think for the ocelot switchdev driver, which is strictly for
> > MIPS CPUs embedded within the device, it should be fine to introduce a
> > static key here (search for static_branch_likely in the kernel).
> 
> I thinked about it *but* did not wanted to add a key since it would be
> global. However, we could consider that there is always only one
> instance of the driver and indeed a static key is an option.
> Unfortunately, I'm not sure this will yield any noticeable performance
> improvement.

What is the concern with a static key in this driver, exactly?
Clément Léger Dec. 1, 2021, 9:29 a.m. UTC | #5
Le Mon, 29 Nov 2021 17:40:39 +0000,
Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :

> On Mon, Nov 29, 2021 at 09:19:02AM +0100, Clément Léger wrote:
> > > I'm not sure why you're letting the hardware grind to a halt first,
> > > before refilling? I think since the CPU is the bottleneck anyway, you
> > > can stop the extraction channel at any time you want to refill.
> > > A constant stream of less data might be better than a bursty one.
> > > Or maybe I'm misunderstanding some of the details of the hardware.  
> > 
> > Indeed, I can stop the extraction channel but that does not seems a
> > good idea to stop the channel in a steady state. At least that's what I
> > thought since it will make the receive "window" non predictable. Not
> > sure how well it will play with various protocol but I will try
> > implementing the refill we talked previously (ie when there an
> > available threshold is reached).  
> (...)
> > > I don't understand why you restart the injection channel from the TX
> > > confirmation interrupt. It raised the interrupt to tell you that it hit
> > > a NULL LLP because there's nothing left to send. If you restart it now and
> > > no other transmission has happened in the meantime, won't it stop again?  
> > 
> > Actually, it is only restarted if there is some pending packets to
> > send. With this hardware, packets can't be added while the FDMA is
> > running and it must be stopped everytime we want to add a packet to the
> > list. To avoid that, in the TX path, if the FDMA is stopped, we set the
> > llp of the packet to NULL and start the chan. However, if the FDMA TX
> > channel is running, we don't stop it, we simply add the next packets to
> > the ring. However, the FDMA will stop on the previous NULL LLP. So when
> > we hit a LLP, we might not be at the end of the list. This is why the
> > next check verifies if we hit a NULL LLP and if there is still some
> > packet to send.   
> 
> Oh, is that so? That would be pretty odd if the hardware is so dumb that
> it doesn't detect changes made to an LLP on the go.
> 
> The manual has this to say, and I'm not sure how to interpret it:
> 
> | It is possible to update an active channels LLP pointer and pointers in
> | the DCB chains. Before changing pointers software must schedule the
> | channel for disabling (by writing FDMA_CH_DISABLE.CH_DISABLE[ch]) and
> | then wait for the channel to set FDMA_CH_SAFE.CH_SAFE[ch]. When the
> | pointer update is complete, soft must re-activate the channel by setting
> | FDMA_CH_ACTIVATE.CH_ACTIVATE[ch]. Setting activate will cancel the
> | deactivate-request, or if the channel has disabled itself in the
> | meantime, it will re activate the channel.
> 
> So it is possible to update an active channel's LLP pointer, but not
> while it's active? Thank you very much!

In the manual, this is also stated that:

| The FDMA does not reload the current DCB when re- activated,
| so if the LLP-field of the current DCB is modified, then software must
| also modify FDMA_DCB_LLP[ch].

The FDMA present on the next generation (sparx5) is *almost* the same
but a new RELOAD register has been added and allows adding a DCB at the
end of the linked list without stopping the FDMA, and then simply hit
the RELOAD register to restart it if needed. Unfortunately, this is not
the case for the ocelot one. 

> 
> If true, this will severely limit the termination performance one is
> able to obtain with this switch, even with a faster CPU and PCIe.
> 
> > > > +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev)
> > > > +{
> > > > +	dev->needed_headroom = OCELOT_TAG_LEN;
> > > > +	dev->needed_tailroom = ETH_FCS_LEN;    
> > > 
> > > The needed_headroom is in no way specific to FDMA, right? Why aren't you
> > > doing it for manual register-based injection too? (in a separate patch ofc)  
> > 
> > Actually, If I switch to page based ring, This won't be useful anymore
> > because the header will be written directly in the page and not anymore
> > directly in the skb header.  
> 
> I don't understand this comment. You set up the needed headroom and
> tailroom netdev variables to avoid reallocation on TX, not for RX.
> And you use half page buffers for RX, not for TX.

Ok, so indeed, I don't think it is needed for the register-based
injection since the IFH is computed on the stack and pushed word by
word into the fifo separately from the skb data. In the case of the
FDMA, it is read from the start of the DCB DATAL adress so this is why
this is needed. I could also put the IFH in a separate DCB and then
split the data in a next DCB using SOF/EOF flags but I'm not sure it
will be beneficial from a performance point of view. I could try that
since the CPU is slow, it might be better in some case to let the FDMA
handle this instead of usign the CPU to increase the SKB size and
linearize it.

> 
> > > I can't help but think how painful it is that with a CPU as slow as
> > > yours, insult over injury, you also need to check for each packet
> > > whether the device tree had defined the "fdma" region or not, because
> > > you practically keep two traffic I/O implementations due to that sole
> > > reason. I think for the ocelot switchdev driver, which is strictly for
> > > MIPS CPUs embedded within the device, it should be fine to introduce a
> > > static key here (search for static_branch_likely in the kernel).  
> > 
> > I thinked about it *but* did not wanted to add a key since it would be
> > global. However, we could consider that there is always only one
> > instance of the driver and indeed a static key is an option.
> > Unfortunately, I'm not sure this will yield any noticeable performance
> > improvement.  
> 
> What is the concern with a static key in this driver, exactly?

Only that the static key will be global but this driver does not have
anything global. If you have no concern about that, I'm ok to add one.
Vladimir Oltean Dec. 3, 2021, 11:23 a.m. UTC | #6
On Wed, Dec 01, 2021 at 10:29:26AM +0100, Clément Léger wrote:
> Le Mon, 29 Nov 2021 17:40:39 +0000,
> Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :
>
> > On Mon, Nov 29, 2021 at 09:19:02AM +0100, Clément Léger wrote:
> > > > I'm not sure why you're letting the hardware grind to a halt first,
> > > > before refilling? I think since the CPU is the bottleneck anyway, you
> > > > can stop the extraction channel at any time you want to refill.
> > > > A constant stream of less data might be better than a bursty one.
> > > > Or maybe I'm misunderstanding some of the details of the hardware.
> > >
> > > Indeed, I can stop the extraction channel but that does not seems a
> > > good idea to stop the channel in a steady state. At least that's what I
> > > thought since it will make the receive "window" non predictable. Not
> > > sure how well it will play with various protocol but I will try
> > > implementing the refill we talked previously (ie when there an
> > > available threshold is reached).
> > (...)
> > > > I don't understand why you restart the injection channel from the TX
> > > > confirmation interrupt. It raised the interrupt to tell you that it hit
> > > > a NULL LLP because there's nothing left to send. If you restart it now and
> > > > no other transmission has happened in the meantime, won't it stop again?
> > >
> > > Actually, it is only restarted if there is some pending packets to
> > > send. With this hardware, packets can't be added while the FDMA is
> > > running and it must be stopped everytime we want to add a packet to the
> > > list. To avoid that, in the TX path, if the FDMA is stopped, we set the
> > > llp of the packet to NULL and start the chan. However, if the FDMA TX
> > > channel is running, we don't stop it, we simply add the next packets to
> > > the ring. However, the FDMA will stop on the previous NULL LLP. So when
> > > we hit a LLP, we might not be at the end of the list. This is why the
> > > next check verifies if we hit a NULL LLP and if there is still some
> > > packet to send.
> >
> > Oh, is that so? That would be pretty odd if the hardware is so dumb that
> > it doesn't detect changes made to an LLP on the go.
> >
> > The manual has this to say, and I'm not sure how to interpret it:
> >
> > | It is possible to update an active channels LLP pointer and pointers in
> > | the DCB chains. Before changing pointers software must schedule the
> > | channel for disabling (by writing FDMA_CH_DISABLE.CH_DISABLE[ch]) and
> > | then wait for the channel to set FDMA_CH_SAFE.CH_SAFE[ch]. When the
> > | pointer update is complete, soft must re-activate the channel by setting
> > | FDMA_CH_ACTIVATE.CH_ACTIVATE[ch]. Setting activate will cancel the
> > | deactivate-request, or if the channel has disabled itself in the
> > | meantime, it will re activate the channel.
> >
> > So it is possible to update an active channel's LLP pointer, but not
> > while it's active? Thank you very much!
>
> In the manual, this is also stated that:
>
> | The FDMA does not reload the current DCB when re- activated,
> | so if the LLP-field of the current DCB is modified, then software must
> | also modify FDMA_DCB_LLP[ch].
>
> The FDMA present on the next generation (sparx5) is *almost* the same
> but a new RELOAD register has been added and allows adding a DCB at the
> end of the linked list without stopping the FDMA, and then simply hit
> the RELOAD register to restart it if needed. Unfortunately, this is not
> the case for the ocelot one.
>
> >
> > If true, this will severely limit the termination performance one is
> > able to obtain with this switch, even with a faster CPU and PCIe.

Sadly I don't have the time or hardware to dig deeper into this, so I'll
have to trust you, even if it sounds like a severe limitation.

> > > > > +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev)
> > > > > +{
> > > > > +	dev->needed_headroom = OCELOT_TAG_LEN;
> > > > > +	dev->needed_tailroom = ETH_FCS_LEN;
> > > >
> > > > The needed_headroom is in no way specific to FDMA, right? Why aren't you
> > > > doing it for manual register-based injection too? (in a separate patch ofc)
> > >
> > > Actually, If I switch to page based ring, This won't be useful anymore
> > > because the header will be written directly in the page and not anymore
> > > directly in the skb header.
> >
> > I don't understand this comment. You set up the needed headroom and
> > tailroom netdev variables to avoid reallocation on TX, not for RX.
> > And you use half page buffers for RX, not for TX.
>
> Ok, so indeed, I don't think it is needed for the register-based
> injection since the IFH is computed on the stack and pushed word by
> word into the fifo separately from the skb data. In the case of the
> FDMA, it is read from the start of the DCB DATAL adress so this is why
> this is needed. I could also put the IFH in a separate DCB and then
> split the data in a next DCB using SOF/EOF flags but I'm not sure it
> will be beneficial from a performance point of view. I could try that
> since the CPU is slow, it might be better in some case to let the FDMA
> handle this instead of usign the CPU to increase the SKB size and
> linearize it.
>
> >
> > > > I can't help but think how painful it is that with a CPU as slow as
> > > > yours, insult over injury, you also need to check for each packet
> > > > whether the device tree had defined the "fdma" region or not, because
> > > > you practically keep two traffic I/O implementations due to that sole
> > > > reason. I think for the ocelot switchdev driver, which is strictly for
> > > > MIPS CPUs embedded within the device, it should be fine to introduce a
> > > > static key here (search for static_branch_likely in the kernel).
> > >
> > > I thinked about it *but* did not wanted to add a key since it would be
> > > global. However, we could consider that there is always only one
> > > instance of the driver and indeed a static key is an option.
> > > Unfortunately, I'm not sure this will yield any noticeable performance
> > > improvement.
> >
> > What is the concern with a static key in this driver, exactly?
>
> Only that the static key will be global but this driver does not have
> anything global. If you have no concern about that, I'm ok to add one.

I don't see a downside to this, do you? Even if we get support for a
PCIe ocelot driver later on, we don't know how that is going to look, if
it's going to reuse the exact same xmit function, etc.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile
index 722c27694b21..d76a9b78b6ca 100644
--- a/drivers/net/ethernet/mscc/Makefile
+++ b/drivers/net/ethernet/mscc/Makefile
@@ -11,5 +11,6 @@  mscc_ocelot_switch_lib-y := \
 mscc_ocelot_switch_lib-$(CONFIG_BRIDGE_MRP) += ocelot_mrp.o
 obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot.o
 mscc_ocelot-y := \
+	ocelot_fdma.o \
 	ocelot_vsc7514.o \
 	ocelot_net.o
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 1f7c9ff18ac5..4b2460d232c2 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -966,14 +966,37 @@  static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh)
 	return 0;
 }
 
-int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
+void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
+			     u64 timestamp)
 {
 	struct skb_shared_hwtstamps *shhwtstamps;
 	u64 tod_in_ns, full_ts_in_ns;
+	struct timespec64 ts;
+
+	if (!ocelot->ptp)
+		return;
+
+	ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
+
+	tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
+	if ((tod_in_ns & 0xffffffff) < timestamp)
+		full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
+				timestamp;
+	else
+		full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
+				timestamp;
+
+	shhwtstamps = skb_hwtstamps(skb);
+	memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
+	shhwtstamps->hwtstamp = full_ts_in_ns;
+}
+EXPORT_SYMBOL(ocelot_ptp_rx_timestamp);
+
+int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
+{
 	u64 timestamp, src_port, len;
 	u32 xfh[OCELOT_TAG_LEN / 4];
 	struct net_device *dev;
-	struct timespec64 ts;
 	struct sk_buff *skb;
 	int sz, buf_len;
 	u32 val, *buf;
@@ -1029,21 +1052,7 @@  int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
 		*buf = val;
 	}
 
-	if (ocelot->ptp) {
-		ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
-
-		tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
-		if ((tod_in_ns & 0xffffffff) < timestamp)
-			full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
-					timestamp;
-		else
-			full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
-					timestamp;
-
-		shhwtstamps = skb_hwtstamps(skb);
-		memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
-		shhwtstamps->hwtstamp = full_ts_in_ns;
-	}
+	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);
 
 	/* Everything we see on an interface that is in the HW bridge
 	 * has already been forwarded.
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index e43da09b8f91..f1a7b403e221 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -9,6 +9,7 @@ 
 #define _MSCC_OCELOT_H_
 
 #include <linux/bitops.h>
+#include <linux/dsa/ocelot.h>
 #include <linux/etherdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/net_tstamp.h>
diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c
new file mode 100644
index 000000000000..e42c2c3ad273
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_fdma.c
@@ -0,0 +1,713 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Microsemi SoCs FDMA driver
+ *
+ * Copyright (c) 2021 Microchip
+ */
+
+#include <linux/bitops.h>
+#include <linux/dmapool.h>
+#include <linux/dsa/ocelot.h>
+#include <linux/netdevice.h>
+#include <linux/of_platform.h>
+#include <linux/skbuff.h>
+
+#include "ocelot_fdma.h"
+#include "ocelot_qs.h"
+
+#define MSCC_FDMA_DCB_LLP(x)			((x) * 4 + 0x0)
+#define MSCC_FDMA_DCB_LLP_PREV(x)		((x) * 4 + 0xA0)
+
+#define MSCC_FDMA_DCB_STAT_BLOCKO(x)		(((x) << 20) & GENMASK(31, 20))
+#define MSCC_FDMA_DCB_STAT_BLOCKO_M		GENMASK(31, 20)
+#define MSCC_FDMA_DCB_STAT_BLOCKO_X(x)		(((x) & GENMASK(31, 20)) >> 20)
+#define MSCC_FDMA_DCB_STAT_PD			BIT(19)
+#define MSCC_FDMA_DCB_STAT_ABORT		BIT(18)
+#define MSCC_FDMA_DCB_STAT_EOF			BIT(17)
+#define MSCC_FDMA_DCB_STAT_SOF			BIT(16)
+#define MSCC_FDMA_DCB_STAT_BLOCKL_M		GENMASK(15, 0)
+#define MSCC_FDMA_DCB_STAT_BLOCKL(x)		((x) & GENMASK(15, 0))
+
+#define MSCC_FDMA_CH_SAFE			0xcc
+
+#define MSCC_FDMA_CH_ACTIVATE			0xd0
+
+#define MSCC_FDMA_CH_DISABLE			0xd4
+
+#define MSCC_FDMA_EVT_ERR			0x164
+
+#define MSCC_FDMA_EVT_ERR_CODE			0x168
+
+#define MSCC_FDMA_INTR_LLP			0x16c
+
+#define MSCC_FDMA_INTR_LLP_ENA			0x170
+
+#define MSCC_FDMA_INTR_FRM			0x174
+
+#define MSCC_FDMA_INTR_FRM_ENA			0x178
+
+#define MSCC_FDMA_INTR_ENA			0x184
+
+#define MSCC_FDMA_INTR_IDENT			0x188
+
+#define MSCC_FDMA_INJ_CHAN			2
+#define MSCC_FDMA_XTR_CHAN			0
+
+#define OCELOT_FDMA_RX_MTU			ETH_DATA_LEN
+#define OCELOT_FDMA_WEIGHT			32
+#define OCELOT_FDMA_RX_REFILL_COUNT		(OCELOT_FDMA_MAX_DCB / 2)
+
+#define OCELOT_FDMA_CH_SAFE_TIMEOUT_MS		100
+
+#define OCELOT_FDMA_RX_EXTRA_SIZE \
+				(OCELOT_TAG_LEN + ETH_FCS_LEN + ETH_HLEN)
+
+static int ocelot_fdma_rx_buf_size(int mtu)
+{
+	return ALIGN(mtu + OCELOT_FDMA_RX_EXTRA_SIZE, 4);
+}
+
+static void ocelot_fdma_writel(struct ocelot_fdma *fdma, u32 reg, u32 data)
+{
+	writel(data, fdma->base + reg);
+}
+
+static u32 ocelot_fdma_readl(struct ocelot_fdma *fdma, u32 reg)
+{
+	return readl(fdma->base + reg);
+}
+
+static unsigned int ocelot_fdma_idx_incr(unsigned int idx)
+{
+	idx++;
+	if (idx == OCELOT_FDMA_MAX_DCB)
+		idx = 0;
+
+	return idx;
+}
+
+static unsigned int ocelot_fdma_idx_decr(unsigned int idx)
+{
+	if (idx == 0)
+		idx = OCELOT_FDMA_MAX_DCB - 1;
+	else
+		idx--;
+
+	return idx;
+}
+
+static int ocelot_fdma_tx_free_count(struct ocelot_fdma *fdma)
+{
+	struct ocelot_fdma_ring *ring = &fdma->inj;
+
+	if (ring->tail >= ring->head)
+		return OCELOT_FDMA_MAX_DCB - (ring->tail - ring->head) - 1;
+	else
+		return ring->head - ring->tail - 1;
+}
+
+static bool ocelot_fdma_ring_empty(struct ocelot_fdma_ring *ring)
+{
+	return ring->head == ring->tail;
+}
+
+static void ocelot_fdma_activate_chan(struct ocelot_fdma *fdma,
+				      struct ocelot_fdma_dcb *dcb, int chan)
+{
+	ocelot_fdma_writel(fdma, MSCC_FDMA_DCB_LLP(chan), dcb->hw_dma);
+	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_ACTIVATE, BIT(chan));
+}
+
+static int ocelot_fdma_wait_chan_safe(struct ocelot_fdma *fdma, int chan)
+{
+	unsigned long timeout;
+	u32 safe;
+
+	timeout = jiffies + msecs_to_jiffies(OCELOT_FDMA_CH_SAFE_TIMEOUT_MS);
+	do {
+		safe = ocelot_fdma_readl(fdma, MSCC_FDMA_CH_SAFE);
+		if (safe & BIT(chan))
+			return 0;
+	} while (time_after(jiffies, timeout));
+
+	return -ETIMEDOUT;
+}
+
+static int ocelot_fdma_stop_channel(struct ocelot_fdma *fdma, int chan)
+{
+	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_DISABLE, BIT(chan));
+
+	return ocelot_fdma_wait_chan_safe(fdma, chan);
+}
+
+static bool ocelot_fdma_dcb_set_data(struct ocelot_fdma *fdma,
+				     struct ocelot_fdma_dcb *dcb,
+				     struct sk_buff *skb,
+				     size_t size, enum dma_data_direction dir)
+{
+	struct ocelot_fdma_dcb_hw_v2 *hw = dcb->hw;
+	u32 offset;
+
+	dcb->skb = skb;
+	dcb->mapped_size = size;
+	dcb->mapping = dma_map_single(fdma->dev, skb->data, size, dir);
+	if (unlikely(dma_mapping_error(fdma->dev, dcb->mapping)))
+		return false;
+
+	offset = dcb->mapping & 0x3;
+
+	hw->llp = 0;
+	hw->datap = ALIGN_DOWN(dcb->mapping, 4);
+	hw->datal = ALIGN_DOWN(size, 4);
+	hw->stat = MSCC_FDMA_DCB_STAT_BLOCKO(offset);
+
+	return true;
+}
+
+static bool ocelot_fdma_rx_set_skb(struct ocelot_fdma *fdma,
+				   struct ocelot_fdma_dcb *dcb,
+				   struct sk_buff *skb, size_t size)
+{
+	return ocelot_fdma_dcb_set_data(fdma, dcb, skb, size,
+					DMA_FROM_DEVICE);
+}
+
+static bool ocelot_fdma_tx_dcb_set_skb(struct ocelot_fdma *fdma,
+				       struct ocelot_fdma_dcb *dcb,
+				       struct sk_buff *skb)
+{
+	if (!ocelot_fdma_dcb_set_data(fdma, dcb, skb, skb->len,
+				      DMA_TO_DEVICE))
+		return false;
+
+	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_BLOCKL(skb->len);
+	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_SOF | MSCC_FDMA_DCB_STAT_EOF;
+
+	return true;
+}
+
+static void ocelot_fdma_rx_restart(struct ocelot_fdma *fdma)
+{
+	struct ocelot_fdma_ring *ring = &fdma->xtr;
+	struct ocelot_fdma_dcb *dcb, *last_dcb;
+	unsigned int idx;
+	int ret;
+	u32 llp;
+
+	/* Check if the FDMA hits the DCB with LLP == NULL */
+	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_DCB_LLP(MSCC_FDMA_XTR_CHAN));
+	if (llp)
+		return;
+
+	ret = ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
+	if (ret) {
+		dev_warn(fdma->dev, "Unable to stop RX channel\n");
+		return;
+	}
+
+	/* Chain the tail with the next DCB */
+	dcb = &ring->dcbs[ring->tail];
+	idx = ocelot_fdma_idx_incr(ring->tail);
+	dcb->hw->llp = ring->dcbs[idx].hw_dma;
+	dcb = &ring->dcbs[idx];
+
+	/* Place a NULL terminator in last DCB added (head - 1) */
+	idx = ocelot_fdma_idx_decr(ring->head);
+	last_dcb = &ring->dcbs[idx];
+	last_dcb->hw->llp = 0;
+	ring->tail = idx;
+
+	/* Finally reactivate the channel */
+	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_XTR_CHAN);
+}
+
+static bool ocelot_fdma_rx_get(struct ocelot_fdma *fdma, int budget)
+{
+	struct ocelot_fdma_ring *ring = &fdma->xtr;
+	struct ocelot_fdma_dcb *dcb, *next_dcb;
+	struct ocelot *ocelot = fdma->ocelot;
+	struct net_device *ndev;
+	struct sk_buff *skb;
+	bool valid = true;
+	u64 timestamp;
+	u64 src_port;
+	void *xfh;
+	u32 stat;
+
+	/* We should not go past the tail */
+	if (ring->head == ring->tail)
+		return false;
+
+	dcb = &ring->dcbs[ring->head];
+	stat = dcb->hw->stat;
+	if (MSCC_FDMA_DCB_STAT_BLOCKL(stat) == 0)
+		return false;
+
+	ring->head = ocelot_fdma_idx_incr(ring->head);
+
+	if (stat & MSCC_FDMA_DCB_STAT_ABORT || stat & MSCC_FDMA_DCB_STAT_PD)
+		valid = false;
+
+	if (!(stat & MSCC_FDMA_DCB_STAT_SOF) ||
+	    !(stat & MSCC_FDMA_DCB_STAT_EOF))
+		valid = false;
+
+	dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
+			 DMA_FROM_DEVICE);
+
+	skb = dcb->skb;
+
+	if (unlikely(!valid)) {
+		dev_warn(fdma->dev, "Invalid packet\n");
+		goto refill;
+	}
+
+	xfh = skb->data;
+	ocelot_xfh_get_src_port(xfh, &src_port);
+
+	if (WARN_ON(src_port >= ocelot->num_phys_ports))
+		goto refill;
+
+	ndev = ocelot_port_to_netdev(ocelot, src_port);
+	if (unlikely(!ndev))
+		goto refill;
+
+	skb_put(skb, MSCC_FDMA_DCB_STAT_BLOCKL(stat) - ETH_FCS_LEN);
+	skb_pull(skb, OCELOT_TAG_LEN);
+
+	skb->dev = ndev;
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	skb->dev->stats.rx_bytes += skb->len;
+	skb->dev->stats.rx_packets++;
+
+	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);
+
+	if (!skb_defer_rx_timestamp(skb))
+		netif_receive_skb(skb);
+
+	skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);
+	if (!skb)
+		return false;
+
+refill:
+	if (!ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size))
+		return false;
+
+	/* Chain the next DCB */
+	next_dcb = &ring->dcbs[ring->head];
+	dcb->hw->llp = next_dcb->hw_dma;
+
+	return true;
+}
+
+static void ocelot_fdma_tx_cleanup(struct ocelot_fdma *fdma, int budget)
+{
+	struct ocelot_fdma_ring *ring = &fdma->inj;
+	unsigned int tmp_head, new_null_llp_idx;
+	struct ocelot_fdma_dcb *dcb;
+	bool end_of_list = false;
+	int ret;
+
+	spin_lock_bh(&fdma->xmit_lock);
+
+	/* Purge the TX packets that have been sent up to the NULL llp or the
+	 * end of done list.
+	 */
+	while (!ocelot_fdma_ring_empty(&fdma->inj)) {
+		dcb = &ring->dcbs[ring->head];
+		if (!(dcb->hw->stat & MSCC_FDMA_DCB_STAT_PD))
+			break;
+
+		tmp_head = ring->head;
+		ring->head = ocelot_fdma_idx_incr(ring->head);
+
+		dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
+				 DMA_TO_DEVICE);
+		napi_consume_skb(dcb->skb, budget);
+
+		/* If we hit the NULL LLP, stop, we might need to reload FDMA */
+		if (dcb->hw->llp == 0) {
+			end_of_list = true;
+			break;
+		}
+	}
+
+	/* If there is still some DCBs to be processed by the FDMA or if the
+	 * pending list is empty, there is no need to restart the FDMA.
+	 */
+	if (!end_of_list || ocelot_fdma_ring_empty(&fdma->inj))
+		goto out_unlock;
+
+	ret = ocelot_fdma_wait_chan_safe(fdma, MSCC_FDMA_INJ_CHAN);
+	if (ret) {
+		dev_warn(fdma->dev, "Failed to wait for TX channel to stop\n");
+		goto out_unlock;
+	}
+
+	/* Set NULL LLP */
+	new_null_llp_idx = ocelot_fdma_idx_decr(ring->tail);
+	dcb = &ring->dcbs[new_null_llp_idx];
+	dcb->hw->llp = 0;
+
+	dcb = &ring->dcbs[ring->head];
+	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);
+
+out_unlock:
+	spin_unlock_bh(&fdma->xmit_lock);
+}
+
+static int ocelot_fdma_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct ocelot_fdma *fdma = container_of(napi, struct ocelot_fdma, napi);
+	int work_done = 0;
+
+	ocelot_fdma_tx_cleanup(fdma, budget);
+
+	while (work_done < budget) {
+		if (!ocelot_fdma_rx_get(fdma, budget))
+			break;
+
+		work_done++;
+	}
+
+	ocelot_fdma_rx_restart(fdma);
+
+	if (work_done < budget) {
+		napi_complete_done(&fdma->napi, work_done);
+		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
+				   BIT(MSCC_FDMA_INJ_CHAN) |
+				   BIT(MSCC_FDMA_XTR_CHAN));
+	}
+
+	return work_done;
+}
+
+static irqreturn_t ocelot_fdma_interrupt(int irq, void *dev_id)
+{
+	u32 ident, llp, frm, err, err_code;
+	struct ocelot_fdma *fdma = dev_id;
+
+	ident = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_IDENT);
+	frm = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_FRM);
+	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_LLP);
+
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, llp & ident);
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, frm & ident);
+	if (frm || llp) {
+		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
+		napi_schedule(&fdma->napi);
+	}
+
+	err = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR);
+	if (unlikely(err)) {
+		err_code = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR_CODE);
+		dev_err_ratelimited(fdma->dev,
+				    "Error ! chans mask: %#x, code: %#x\n",
+				    err, err_code);
+
+		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR, err);
+		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR_CODE, err_code);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void ocelot_fdma_send_skb(struct ocelot_fdma *fdma, struct sk_buff *skb)
+{
+	struct ocelot_fdma_ring *ring = &fdma->inj;
+	struct ocelot_fdma_dcb *dcb, *next;
+
+	dcb = &ring->dcbs[ring->tail];
+	if (!ocelot_fdma_tx_dcb_set_skb(fdma, dcb, skb)) {
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	if (ocelot_fdma_ring_empty(&fdma->inj)) {
+		ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);
+	} else {
+		next = &ring->dcbs[ocelot_fdma_idx_incr(ring->tail)];
+		dcb->hw->llp = next->hw_dma;
+	}
+
+	ring->tail = ocelot_fdma_idx_incr(ring->tail);
+
+	skb_tx_timestamp(skb);
+}
+
+static int ocelot_fdma_prepare_skb(struct ocelot_fdma *fdma, int port,
+				   u32 rew_op, struct sk_buff *skb,
+				   struct net_device *dev)
+{
+	int needed_headroom = max_t(int, OCELOT_TAG_LEN - skb_headroom(skb), 0);
+	int needed_tailroom = max_t(int, ETH_FCS_LEN - skb_tailroom(skb), 0);
+	struct ocelot_port *ocelot_port = fdma->ocelot->ports[port];
+	void *ifh;
+	int err;
+
+	if (unlikely(needed_headroom || needed_tailroom ||
+		     skb_header_cloned(skb))) {
+		err = pskb_expand_head(skb, needed_headroom, needed_tailroom,
+				       GFP_ATOMIC);
+		if (unlikely(err)) {
+			dev_kfree_skb_any(skb);
+			return 1;
+		}
+	}
+
+	err = skb_linearize(skb);
+	if (err) {
+		net_err_ratelimited("%s: skb_linearize error (%d)!\n",
+				    dev->name, err);
+		dev_kfree_skb_any(skb);
+		return 1;
+	}
+
+	ifh = skb_push(skb, OCELOT_TAG_LEN);
+	skb_put(skb, ETH_FCS_LEN);
+	ocelot_ifh_port_set(ifh, ocelot_port, rew_op, skb_vlan_tag_get(skb));
+
+	return 0;
+}
+
+int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
+			     struct sk_buff *skb, struct net_device *dev)
+{
+	int ret = NETDEV_TX_OK;
+
+	spin_lock(&fdma->xmit_lock);
+
+	if (ocelot_fdma_tx_free_count(fdma) == 0) {
+		ret = NETDEV_TX_BUSY;
+		goto out;
+	}
+
+	if (ocelot_fdma_prepare_skb(fdma, port, rew_op, skb, dev))
+		goto out;
+
+	ocelot_fdma_send_skb(fdma, skb);
+
+out:
+	spin_unlock(&fdma->xmit_lock);
+
+	return ret;
+}
+
+static void ocelot_fdma_ring_free(struct ocelot_fdma *fdma,
+				  struct ocelot_fdma_ring *ring)
+{
+	dmam_free_coherent(fdma->dev, OCELOT_DCBS_HW_ALLOC_SIZE, ring->hw_dcbs,
+			   ring->hw_dcbs_dma);
+}
+
+static int ocelot_fdma_ring_alloc(struct ocelot_fdma *fdma,
+				  struct ocelot_fdma_ring *ring)
+{
+	struct ocelot_fdma_dcb_hw_v2 *hw_dcbs;
+	struct ocelot_fdma_dcb *dcb;
+	dma_addr_t hw_dcbs_dma;
+	unsigned int adjust;
+	int i;
+
+	/* Create a pool of consistent memory blocks for hardware descriptors */
+	ring->hw_dcbs = dmam_alloc_coherent(fdma->dev,
+					    OCELOT_DCBS_HW_ALLOC_SIZE,
+					    &ring->hw_dcbs_dma, GFP_KERNEL);
+	if (!ring->hw_dcbs)
+		return -ENOMEM;
+
+	/* DCBs must be aligned on a 32bit boundary */
+	hw_dcbs = ring->hw_dcbs;
+	hw_dcbs_dma = ring->hw_dcbs_dma;
+	if (!IS_ALIGNED(hw_dcbs_dma, 4)) {
+		adjust = hw_dcbs_dma & 0x3;
+		hw_dcbs_dma = ALIGN(hw_dcbs_dma, 4);
+		hw_dcbs = (void *)hw_dcbs + adjust;
+	}
+
+	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
+		dcb = &ring->dcbs[i];
+		dcb->hw = &hw_dcbs[i];
+		dcb->hw_dma = hw_dcbs_dma +
+			     i * sizeof(struct ocelot_fdma_dcb_hw_v2);
+	}
+
+	return 0;
+}
+
+static int ocelot_fdma_rx_skb_alloc(struct ocelot_fdma *fdma)
+{
+	struct ocelot_fdma_dcb *dcb, *prev_dcb = NULL;
+	struct ocelot_fdma_ring *ring = &fdma->xtr;
+	struct sk_buff *skb;
+	int i;
+
+	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
+		dcb = &ring->dcbs[i];
+		skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);
+		if (!skb)
+			goto skb_alloc_failed;
+
+		ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size);
+
+		if (prev_dcb)
+			prev_dcb->hw->llp = dcb->hw_dma;
+
+		prev_dcb = dcb;
+	}
+
+	ring->head = 0;
+	ring->tail = OCELOT_FDMA_MAX_DCB - 1;
+
+	return 0;
+
+skb_alloc_failed:
+	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
+		dcb = &ring->dcbs[i];
+		if (!dcb->skb)
+			break;
+
+		dev_kfree_skb_any(dcb->skb);
+	}
+
+	return -ENOMEM;
+}
+
+static int ocelot_fdma_rx_init(struct ocelot_fdma *fdma)
+{
+	int ret;
+
+	fdma->rx_buf_size = ocelot_fdma_rx_buf_size(OCELOT_FDMA_RX_MTU);
+
+	ret = ocelot_fdma_rx_skb_alloc(fdma);
+	if (ret) {
+		netif_napi_del(&fdma->napi);
+		return ret;
+	}
+
+	napi_enable(&fdma->napi);
+
+	ocelot_fdma_activate_chan(fdma, &fdma->xtr.dcbs[0],
+				  MSCC_FDMA_XTR_CHAN);
+
+	return 0;
+}
+
+void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev)
+{
+	dev->needed_headroom = OCELOT_TAG_LEN;
+	dev->needed_tailroom = ETH_FCS_LEN;
+
+	if (fdma->ndev)
+		return;
+
+	fdma->ndev = dev;
+	netif_napi_add(dev, &fdma->napi, ocelot_fdma_napi_poll,
+		       OCELOT_FDMA_WEIGHT);
+}
+
+void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma, struct net_device *dev)
+{
+	if (dev == fdma->ndev)
+		netif_napi_del(&fdma->napi);
+}
+
+struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
+				     struct ocelot *ocelot)
+{
+	struct ocelot_fdma *fdma;
+	void __iomem *base;
+	int ret;
+
+	base = devm_platform_ioremap_resource_byname(pdev, "fdma");
+	if (IS_ERR_OR_NULL(base))
+		return NULL;
+
+	fdma = devm_kzalloc(&pdev->dev, sizeof(*fdma), GFP_KERNEL);
+	if (!fdma)
+		goto err_release_resource;
+
+	fdma->ocelot = ocelot;
+	fdma->base = base;
+	fdma->dev = &pdev->dev;
+	fdma->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
+
+	fdma->irq = platform_get_irq_byname(pdev, "fdma");
+	ret = devm_request_irq(&pdev->dev, fdma->irq, ocelot_fdma_interrupt, 0,
+			       dev_name(&pdev->dev), fdma);
+	if (ret)
+		goto err_free_fdma;
+
+	ret = ocelot_fdma_ring_alloc(fdma, &fdma->inj);
+	if (ret)
+		goto err_free_irq;
+
+	ret = ocelot_fdma_ring_alloc(fdma, &fdma->xtr);
+	if (ret)
+		goto free_inj_ring;
+
+	return fdma;
+
+free_inj_ring:
+	ocelot_fdma_ring_free(fdma, &fdma->inj);
+err_free_irq:
+	devm_free_irq(&pdev->dev, fdma->irq, fdma);
+err_free_fdma:
+	devm_kfree(&pdev->dev, fdma);
+err_release_resource:
+	devm_iounmap(&pdev->dev, base);
+
+	return NULL;
+}
+
+int ocelot_fdma_start(struct ocelot_fdma *fdma)
+{
+	struct ocelot *ocelot = fdma->ocelot;
+	int ret;
+
+	ret = ocelot_fdma_rx_init(fdma);
+	if (ret)
+		return -EINVAL;
+
+	/* Reconfigure for extraction and injection using DMA */
+	ocelot_write_rix(ocelot, QS_INJ_GRP_CFG_MODE(2), QS_INJ_GRP_CFG, 0);
+	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(0), QS_INJ_CTRL, 0);
+
+	ocelot_write_rix(ocelot, QS_XTR_GRP_CFG_MODE(2), QS_XTR_GRP_CFG, 0);
+
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, 0xffffffff);
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, 0xffffffff);
+
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP_ENA,
+			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM_ENA, BIT(MSCC_FDMA_XTR_CHAN));
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
+			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
+
+	return 0;
+}
+
+int ocelot_fdma_stop(struct ocelot_fdma *fdma)
+{
+	struct ocelot_fdma_ring *ring = &fdma->xtr;
+	struct ocelot_fdma_dcb *dcb;
+	int i;
+
+	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
+
+	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
+	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_INJ_CHAN);
+
+	/* Free the SKB hold in the extraction ring */
+	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
+		dcb = &ring->dcbs[i];
+		dev_kfree_skb_any(dcb->skb);
+	}
+
+	napi_synchronize(&fdma->napi);
+	napi_disable(&fdma->napi);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.h b/drivers/net/ethernet/mscc/ocelot_fdma.h
new file mode 100644
index 000000000000..b6f1dda0e0c7
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_fdma.h
@@ -0,0 +1,96 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi SoCs FDMA driver
+ *
+ * Copyright (c) 2021 Microchip
+ */
+#ifndef _MSCC_OCELOT_FDMA_H_
+#define _MSCC_OCELOT_FDMA_H_
+
+#include "ocelot.h"
+
+#define OCELOT_FDMA_MAX_DCB		128
+/* +4 allows for word alignment after allocation */
+#define OCELOT_DCBS_HW_ALLOC_SIZE	(OCELOT_FDMA_MAX_DCB * \
+					 sizeof(struct ocelot_fdma_dcb_hw_v2) + \
+					 4)
+
+struct ocelot_fdma_dcb_hw_v2 {
+	u32 llp;
+	u32 datap;
+	u32 datal;
+	u32 stat;
+};
+
+/**
+ * struct ocelot_fdma_dcb - Software DCBs description
+ *
+ * @hw: hardware DCB used by hardware(coherent memory)
+ * @hw_dma: DMA address of the DCB
+ * @skb: skb associated with the DCB
+ * @mapping: Address of the skb data mapping
+ * @mapped_size: Mapped size
+ */
+struct ocelot_fdma_dcb {
+	struct ocelot_fdma_dcb_hw_v2	*hw;
+	dma_addr_t			hw_dma;
+	struct sk_buff			*skb;
+	dma_addr_t			mapping;
+	size_t				mapped_size;
+};
+
+/**
+ * struct ocelot_fdma_ring - "Ring" description of DCBs
+ *
+ * @hw_dcbs: Hardware DCBs allocated for the ring
+ * @hw_dcbs_dma: DMA address of the DCBs
+ * @dcbs: List of software DCBs
+ * @head: pointer to first available DCB
+ * @tail: pointer to last available DCB
+ */
+struct ocelot_fdma_ring {
+	struct ocelot_fdma_dcb_hw_v2	*hw_dcbs;
+	dma_addr_t			hw_dcbs_dma;
+	struct ocelot_fdma_dcb		dcbs[OCELOT_FDMA_MAX_DCB];
+	unsigned int			head;
+	unsigned int			tail;
+};
+
+/**
+ * struct ocelot_fdma - FMDA struct
+ *
+ * @ocelot: Pointer to ocelot struct
+ * @base: base address of FDMA registers
+ * @irq: FDMA interrupt
+ * @dev: Ocelot device
+ * @napi: napi handle
+ * @rx_buf_size: Size of RX buffer
+ * @inj: Injection ring
+ * @xtr: Extraction ring
+ * @xmit_lock: Xmit lock
+ *
+ */
+struct ocelot_fdma {
+	struct ocelot			*ocelot;
+	void __iomem			*base;
+	int				irq;
+	struct device			*dev;
+	struct napi_struct		napi;
+	struct net_device		*ndev;
+	size_t				rx_buf_size;
+	struct ocelot_fdma_ring		inj;
+	struct ocelot_fdma_ring		xtr;
+	spinlock_t			xmit_lock;
+};
+
+struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
+				     struct ocelot *ocelot);
+int ocelot_fdma_start(struct ocelot_fdma *fdma);
+int ocelot_fdma_stop(struct ocelot_fdma *fdma);
+int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
+			     struct sk_buff *skb, struct net_device *dev);
+void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev);
+void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma,
+			       struct net_device *dev);
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index b589ae95e29b..9dcaf421da12 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -15,6 +15,7 @@ 
 #include <net/pkt_cls.h>
 #include "ocelot.h"
 #include "ocelot_vcap.h"
+#include "ocelot_fdma.h"
 
 #define OCELOT_MAC_QUIRKS	OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP
 
@@ -457,7 +458,7 @@  static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 	int port = priv->chip_port;
 	u32 rew_op = 0;
 
-	if (!ocelot_can_inject(ocelot, 0))
+	if (!ocelot->fdma && !ocelot_can_inject(ocelot, 0))
 		return NETDEV_TX_BUSY;
 
 	/* Check if timestamping is needed */
@@ -475,9 +476,13 @@  static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 		rew_op = ocelot_ptp_rew_op(skb);
 	}
 
-	ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
+	if (ocelot->fdma) {
+		ocelot_fdma_inject_frame(ocelot->fdma, port, rew_op, skb, dev);
+	} else {
+		ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
 
-	kfree_skb(skb);
+		consume_skb(skb);
+	}
 
 	return NETDEV_TX_OK;
 }
@@ -1717,6 +1722,9 @@  int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
 	if (err)
 		goto out;
 
+	if (ocelot->fdma)
+		ocelot_fdma_netdev_init(ocelot->fdma, dev);
+
 	err = register_netdev(dev);
 	if (err) {
 		dev_err(ocelot->dev, "register_netdev failed\n");
@@ -1737,9 +1745,13 @@  void ocelot_release_port(struct ocelot_port *ocelot_port)
 	struct ocelot_port_private *priv = container_of(ocelot_port,
 						struct ocelot_port_private,
 						port);
+	struct ocelot_fdma *fdma = ocelot_port->ocelot->fdma;
 
 	unregister_netdev(priv->dev);
 
+	if (fdma)
+		ocelot_fdma_netdev_deinit(fdma, priv->dev);
+
 	if (priv->phylink) {
 		rtnl_lock();
 		phylink_disconnect_phy(priv->phylink);
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index 38103b0255b0..fa68eb23a333 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -18,6 +18,7 @@ 
 
 #include <soc/mscc/ocelot_vcap.h>
 #include <soc/mscc/ocelot_hsio.h>
+#include "ocelot_fdma.h"
 #include "ocelot.h"
 
 static const u32 ocelot_ana_regmap[] = {
@@ -1080,6 +1081,10 @@  static int mscc_ocelot_probe(struct platform_device *pdev)
 		ocelot->targets[io_target[i].id] = target;
 	}
 
+	ocelot->fdma = ocelot_fdma_init(pdev, ocelot);
+	if (IS_ERR(ocelot->fdma))
+		ocelot->fdma = NULL;
+
 	hsio = syscon_regmap_lookup_by_compatible("mscc,ocelot-hsio");
 	if (IS_ERR(hsio)) {
 		dev_err(&pdev->dev, "missing hsio syscon\n");
@@ -1139,6 +1144,12 @@  static int mscc_ocelot_probe(struct platform_device *pdev)
 	if (err)
 		goto out_ocelot_devlink_unregister;
 
+	if (ocelot->fdma) {
+		err = ocelot_fdma_start(ocelot->fdma);
+		if (err)
+			goto out_ocelot_release_ports;
+	}
+
 	err = ocelot_devlink_sb_register(ocelot);
 	if (err)
 		goto out_ocelot_release_ports;
@@ -1179,6 +1190,8 @@  static int mscc_ocelot_remove(struct platform_device *pdev)
 {
 	struct ocelot *ocelot = platform_get_drvdata(pdev);
 
+	if (ocelot->fdma)
+		ocelot_fdma_stop(ocelot->fdma);
 	devlink_unregister(ocelot->devlink);
 	ocelot_deinit_timestamp(ocelot);
 	ocelot_devlink_sb_unregister(ocelot);
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index b3381c90ff3e..351ab385ab98 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -695,6 +695,8 @@  struct ocelot {
 	/* Protects the PTP clock */
 	spinlock_t			ptp_clock_lock;
 	struct ptp_pin_desc		ptp_pins[OCELOT_PTP_PINS_NUM];
+
+	struct ocelot_fdma		*fdma;
 };
 
 struct ocelot_policer {
@@ -761,6 +763,8 @@  void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
 			 u32 vlan_tag);
 int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
 void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
+void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
+			     u64 timestamp);
 
 /* Hardware initialization */
 int ocelot_regfields_init(struct ocelot *ocelot,