diff mbox series

[v2,3/6] net: ocelot: pre-compute injection frame header content

Message ID 20211103091943.3878621-4-clement.leger@bootlin.com (mailing list archive)
State Deferred
Delegated to: Netdev Maintainers
Headers show
Series Add FDMA support on ocelot switch driver | expand

Checks

Context Check Description
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/module_param success Was 0 now: 0
netdev/signed warning No signature found. Please sign patches: https://github.com/mricon/patatt
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 67 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/tree_selection success Guessing tree name failed - patch did not apply, async

Commit Message

Clément Léger Nov. 3, 2021, 9:19 a.m. UTC
IFH preparation can take quite some time on slow processors (up to 5% in
a iperf3 test for instance). In order to reduce the cost of this
preparation, pre-compute IFH since most of the parameters are fixed per
port. Only rew_op and vlan tag will be set when sending if different
than 0. This allows to remove entirely the calls to packing() with basic
usage. In the same time, export this function that will be used by FDMA.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 23 ++++++++++++++++++-----
 include/soc/mscc/ocelot.h          |  5 +++++
 2 files changed, 23 insertions(+), 5 deletions(-)

Comments

Vladimir Oltean Nov. 3, 2021, 12:38 p.m. UTC | #1
On Wed, Nov 03, 2021 at 10:19:40AM +0100, Clément Léger wrote:
> IFH preparation can take quite some time on slow processors (up to 5% in
> a iperf3 test for instance). In order to reduce the cost of this
> preparation, pre-compute IFH since most of the parameters are fixed per
> port. Only rew_op and vlan tag will be set when sending if different
> than 0. This allows to remove entirely the calls to packing() with basic
> usage. In the same time, export this function that will be used by FDMA.
> 
> Signed-off-by: Clément Léger <clement.leger@bootlin.com>
> ---

Honestly, this feels a bit cheap/gimmicky, and not really the
fundamental thing to address. In my testing of a similar idea (see
commits 67c2404922c2 ("net: dsa: felix: create a template for the DSA
tags on xmit") and then 7c4bb540e917 ("net: dsa: tag_ocelot: create
separate tagger for Seville"), the net difference is not that stark,
considering that now you need to access one more memory region which you
did not need before, do a memcpy, and then patch the IFH anyway for the
non-constant stuff.

Certainly, for the calls to ocelot_port_inject_frame() from DSA, I would
prefer not having this pre-computed IFH.

Could you provide some before/after performance numbers and perf counters?

>  drivers/net/ethernet/mscc/ocelot.c | 23 ++++++++++++++++++-----
>  include/soc/mscc/ocelot.h          |  5 +++++
>  2 files changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
> index e6c18b598d5c..97693772595b 100644
> --- a/drivers/net/ethernet/mscc/ocelot.c
> +++ b/drivers/net/ethernet/mscc/ocelot.c
> @@ -1076,20 +1076,29 @@ bool ocelot_can_inject(struct ocelot *ocelot, int grp)
>  }
>  EXPORT_SYMBOL(ocelot_can_inject);
>  
> +void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
> +			 u32 vlan_tag)
> +{
> +	memcpy(ifh, port->ifh, OCELOT_TAG_LEN);
> +
> +	if (vlan_tag)
> +		ocelot_ifh_set_vlan_tci(ifh, vlan_tag);
> +	if (rew_op)
> +		ocelot_ifh_set_rew_op(ifh, rew_op);
> +}
> +EXPORT_SYMBOL(ocelot_ifh_port_set);
> +
>  void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
>  			      u32 rew_op, struct sk_buff *skb)
>  {
> +	struct ocelot_port *port_s = ocelot->ports[port];
>  	u32 ifh[OCELOT_TAG_LEN / 4] = {0};
>  	unsigned int i, count, last;
>  
>  	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
>  			 QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
>  
> -	ocelot_ifh_set_bypass(ifh, 1);
> -	ocelot_ifh_set_dest(ifh, BIT_ULL(port));
> -	ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C);
> -	ocelot_ifh_set_vlan_tci(ifh, skb_vlan_tag_get(skb));
> -	ocelot_ifh_set_rew_op(ifh, rew_op);
> +	ocelot_ifh_port_set(ifh, port_s, rew_op, skb_vlan_tag_get(skb));
>  
>  	for (i = 0; i < OCELOT_TAG_LEN / 4; i++)
>  		ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
> @@ -2128,6 +2137,10 @@ void ocelot_init_port(struct ocelot *ocelot, int port)
>  
>  	skb_queue_head_init(&ocelot_port->tx_skbs);
>  
> +	ocelot_ifh_set_bypass(ocelot_port->ifh, 1);
> +	ocelot_ifh_set_dest(ocelot_port->ifh, BIT_ULL(port));
> +	ocelot_ifh_set_tag_type(ocelot_port->ifh, IFH_TAG_TYPE_C);
> +
>  	/* Basic L2 initialization */
>  
>  	/* Set MAC IFG Gaps
> diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
> index fef3a36b0210..b3381c90ff3e 100644
> --- a/include/soc/mscc/ocelot.h
> +++ b/include/soc/mscc/ocelot.h
> @@ -6,6 +6,7 @@
>  #define _SOC_MSCC_OCELOT_H
>  
>  #include <linux/ptp_clock_kernel.h>
> +#include <linux/dsa/ocelot.h>
>  #include <linux/net_tstamp.h>
>  #include <linux/if_vlan.h>
>  #include <linux/regmap.h>
> @@ -623,6 +624,8 @@ struct ocelot_port {
>  
>  	struct net_device		*bridge;
>  	u8				stp_state;
> +
> +	u8				ifh[OCELOT_TAG_LEN];
>  };
>  
>  struct ocelot {
> @@ -754,6 +757,8 @@ void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
>  bool ocelot_can_inject(struct ocelot *ocelot, int grp);
>  void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
>  			      u32 rew_op, struct sk_buff *skb);
> +void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
> +			 u32 vlan_tag);
>  int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
>  void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
>  
> -- 
> 2.33.0
>
Clément Léger Nov. 3, 2021, 1:53 p.m. UTC | #2
Le Wed, 3 Nov 2021 12:38:12 +0000,
Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :

> On Wed, Nov 03, 2021 at 10:19:40AM +0100, Clément Léger wrote:
> > IFH preparation can take quite some time on slow processors (up to
> > 5% in a iperf3 test for instance). In order to reduce the cost of
> > this preparation, pre-compute IFH since most of the parameters are
> > fixed per port. Only rew_op and vlan tag will be set when sending
> > if different than 0. This allows to remove entirely the calls to
> > packing() with basic usage. In the same time, export this function
> > that will be used by FDMA.
> > 
> > Signed-off-by: Clément Léger <clement.leger@bootlin.com>
> > ---  
> 
> Honestly, this feels a bit cheap/gimmicky, and not really the
> fundamental thing to address. In my testing of a similar idea (see
> commits 67c2404922c2 ("net: dsa: felix: create a template for the DSA
> tags on xmit") and then 7c4bb540e917 ("net: dsa: tag_ocelot: create
> separate tagger for Seville"), the net difference is not that stark,
> considering that now you need to access one more memory region which
> you did not need before, do a memcpy, and then patch the IFH anyway
> for the non-constant stuff.

The memcpy is neglectable and the patching happens only in a few
cases (at least vs the packing function call). The VSC7514 CPU is really
slow and lead to 2.5% up to 5% time spent in packing() when using iperf3
and depending on the use case (according to ftrace).

> 
> Certainly, for the calls to ocelot_port_inject_frame() from DSA, I
> would prefer not having this pre-computed IFH.
> 
> Could you provide some before/after performance numbers and perf
> counters?

I will make another round of measure to confirm my previous number and
check the impact on the injection rate on ocelot.

> 
> >  drivers/net/ethernet/mscc/ocelot.c | 23 ++++++++++++++++++-----
> >  include/soc/mscc/ocelot.h          |  5 +++++
> >  2 files changed, 23 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/mscc/ocelot.c
> > b/drivers/net/ethernet/mscc/ocelot.c index
> > e6c18b598d5c..97693772595b 100644 ---
> > a/drivers/net/ethernet/mscc/ocelot.c +++
> > b/drivers/net/ethernet/mscc/ocelot.c @@ -1076,20 +1076,29 @@ bool
> > ocelot_can_inject(struct ocelot *ocelot, int grp) }
> >  EXPORT_SYMBOL(ocelot_can_inject);
> >  
> > +void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32
> > rew_op,
> > +			 u32 vlan_tag)
> > +{
> > +	memcpy(ifh, port->ifh, OCELOT_TAG_LEN);
> > +
> > +	if (vlan_tag)
> > +		ocelot_ifh_set_vlan_tci(ifh, vlan_tag);
> > +	if (rew_op)
> > +		ocelot_ifh_set_rew_op(ifh, rew_op);
> > +}
> > +EXPORT_SYMBOL(ocelot_ifh_port_set);
> > +
> >  void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int
> > grp, u32 rew_op, struct sk_buff *skb)
> >  {
> > +	struct ocelot_port *port_s = ocelot->ports[port];
> >  	u32 ifh[OCELOT_TAG_LEN / 4] = {0};
> >  	unsigned int i, count, last;
> >  
> >  	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
> >  			 QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
> >  
> > -	ocelot_ifh_set_bypass(ifh, 1);
> > -	ocelot_ifh_set_dest(ifh, BIT_ULL(port));
> > -	ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C);
> > -	ocelot_ifh_set_vlan_tci(ifh, skb_vlan_tag_get(skb));
> > -	ocelot_ifh_set_rew_op(ifh, rew_op);
> > +	ocelot_ifh_port_set(ifh, port_s, rew_op,
> > skb_vlan_tag_get(skb)); 
> >  	for (i = 0; i < OCELOT_TAG_LEN / 4; i++)
> >  		ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
> > @@ -2128,6 +2137,10 @@ void ocelot_init_port(struct ocelot *ocelot,
> > int port) 
> >  	skb_queue_head_init(&ocelot_port->tx_skbs);
> >  
> > +	ocelot_ifh_set_bypass(ocelot_port->ifh, 1);
> > +	ocelot_ifh_set_dest(ocelot_port->ifh, BIT_ULL(port));
> > +	ocelot_ifh_set_tag_type(ocelot_port->ifh, IFH_TAG_TYPE_C);
> > +
> >  	/* Basic L2 initialization */
> >  
> >  	/* Set MAC IFG Gaps
> > diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
> > index fef3a36b0210..b3381c90ff3e 100644
> > --- a/include/soc/mscc/ocelot.h
> > +++ b/include/soc/mscc/ocelot.h
> > @@ -6,6 +6,7 @@
> >  #define _SOC_MSCC_OCELOT_H
> >  
> >  #include <linux/ptp_clock_kernel.h>
> > +#include <linux/dsa/ocelot.h>
> >  #include <linux/net_tstamp.h>
> >  #include <linux/if_vlan.h>
> >  #include <linux/regmap.h>
> > @@ -623,6 +624,8 @@ struct ocelot_port {
> >  
> >  	struct net_device		*bridge;
> >  	u8				stp_state;
> > +
> > +	u8				ifh[OCELOT_TAG_LEN];
> >  };
> >  
> >  struct ocelot {
> > @@ -754,6 +757,8 @@ void __ocelot_target_write_ix(struct ocelot
> > *ocelot, enum ocelot_target target, bool ocelot_can_inject(struct
> > ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct
> > ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb);
> > +void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32
> > rew_op,
> > +			 u32 vlan_tag);
> >  int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct
> > sk_buff **skb); void ocelot_drain_cpu_queue(struct ocelot *ocelot,
> > int grp); 
> > -- 
> > 2.33.0
>
Clément Léger Nov. 15, 2021, 10:13 a.m. UTC | #3
Le Wed, 3 Nov 2021 14:53:51 +0100,
Clément Léger <clement.leger@bootlin.com> a écrit :

> Le Wed, 3 Nov 2021 12:38:12 +0000,
> Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :
> 
> > On Wed, Nov 03, 2021 at 10:19:40AM +0100, Clément Léger wrote:  
> > > IFH preparation can take quite some time on slow processors (up to
> > > 5% in a iperf3 test for instance). In order to reduce the cost of
> > > this preparation, pre-compute IFH since most of the parameters are
> > > fixed per port. Only rew_op and vlan tag will be set when sending
> > > if different than 0. This allows to remove entirely the calls to
> > > packing() with basic usage. In the same time, export this function
> > > that will be used by FDMA.
> > > 
> > > Signed-off-by: Clément Léger <clement.leger@bootlin.com>
> > > ---    
> > 
> > Honestly, this feels a bit cheap/gimmicky, and not really the
> > fundamental thing to address. In my testing of a similar idea (see
> > commits 67c2404922c2 ("net: dsa: felix: create a template for the DSA
> > tags on xmit") and then 7c4bb540e917 ("net: dsa: tag_ocelot: create
> > separate tagger for Seville"), the net difference is not that stark,
> > considering that now you need to access one more memory region which
> > you did not need before, do a memcpy, and then patch the IFH anyway
> > for the non-constant stuff.  
> 
> The memcpy is neglectable and the patching happens only in a few
> cases (at least vs the packing function call). The VSC7514 CPU is really
> slow and lead to 2.5% up to 5% time spent in packing() when using iperf3
> and depending on the use case (according to ftrace).
> 
> > 
> > Certainly, for the calls to ocelot_port_inject_frame() from DSA, I
> > would prefer not having this pre-computed IFH.
> > 
> > Could you provide some before/after performance numbers and perf
> > counters?  
> 
> I will make another round of measure to confirm my previous number and
> check the impact on the injection rate on ocelot.

I checked again my bandwith numbers (obtained with iperf3) with and
without the pre-computed header:

Test on standard packets with UDP (iperf3 -t 100 -l 1460 -u -b 0 -c *)
- With pre-computed header: UDP TX: 	33Mbit/s
- Without UDP TX: 			31Mbit/s
-> 6.5% improvement

Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
- With pre-computed header: UDP TX: 	15.8Mbit/s
- Without UDP TX: 			16.4Mbit/s
-> 4.3% improvement

The improvement might not be huge but also not negligible at all.
Please tell me if you want me to drop it or not based on those numbers.

> 
> >   
> > >  drivers/net/ethernet/mscc/ocelot.c | 23 ++++++++++++++++++-----
> > >  include/soc/mscc/ocelot.h          |  5 +++++
> > >  2 files changed, 23 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/net/ethernet/mscc/ocelot.c
> > > b/drivers/net/ethernet/mscc/ocelot.c index
> > > e6c18b598d5c..97693772595b 100644 ---
> > > a/drivers/net/ethernet/mscc/ocelot.c +++
> > > b/drivers/net/ethernet/mscc/ocelot.c @@ -1076,20 +1076,29 @@ bool
> > > ocelot_can_inject(struct ocelot *ocelot, int grp) }
> > >  EXPORT_SYMBOL(ocelot_can_inject);
> > >  
> > > +void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32
> > > rew_op,
> > > +			 u32 vlan_tag)
> > > +{
> > > +	memcpy(ifh, port->ifh, OCELOT_TAG_LEN);
> > > +
> > > +	if (vlan_tag)
> > > +		ocelot_ifh_set_vlan_tci(ifh, vlan_tag);
> > > +	if (rew_op)
> > > +		ocelot_ifh_set_rew_op(ifh, rew_op);
> > > +}
> > > +EXPORT_SYMBOL(ocelot_ifh_port_set);
> > > +
> > >  void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int
> > > grp, u32 rew_op, struct sk_buff *skb)
> > >  {
> > > +	struct ocelot_port *port_s = ocelot->ports[port];
> > >  	u32 ifh[OCELOT_TAG_LEN / 4] = {0};
> > >  	unsigned int i, count, last;
> > >  
> > >  	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
> > >  			 QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
> > >  
> > > -	ocelot_ifh_set_bypass(ifh, 1);
> > > -	ocelot_ifh_set_dest(ifh, BIT_ULL(port));
> > > -	ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C);
> > > -	ocelot_ifh_set_vlan_tci(ifh, skb_vlan_tag_get(skb));
> > > -	ocelot_ifh_set_rew_op(ifh, rew_op);
> > > +	ocelot_ifh_port_set(ifh, port_s, rew_op,
> > > skb_vlan_tag_get(skb)); 
> > >  	for (i = 0; i < OCELOT_TAG_LEN / 4; i++)
> > >  		ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
> > > @@ -2128,6 +2137,10 @@ void ocelot_init_port(struct ocelot *ocelot,
> > > int port) 
> > >  	skb_queue_head_init(&ocelot_port->tx_skbs);
> > >  
> > > +	ocelot_ifh_set_bypass(ocelot_port->ifh, 1);
> > > +	ocelot_ifh_set_dest(ocelot_port->ifh, BIT_ULL(port));
> > > +	ocelot_ifh_set_tag_type(ocelot_port->ifh, IFH_TAG_TYPE_C);
> > > +
> > >  	/* Basic L2 initialization */
> > >  
> > >  	/* Set MAC IFG Gaps
> > > diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
> > > index fef3a36b0210..b3381c90ff3e 100644
> > > --- a/include/soc/mscc/ocelot.h
> > > +++ b/include/soc/mscc/ocelot.h
> > > @@ -6,6 +6,7 @@
> > >  #define _SOC_MSCC_OCELOT_H
> > >  
> > >  #include <linux/ptp_clock_kernel.h>
> > > +#include <linux/dsa/ocelot.h>
> > >  #include <linux/net_tstamp.h>
> > >  #include <linux/if_vlan.h>
> > >  #include <linux/regmap.h>
> > > @@ -623,6 +624,8 @@ struct ocelot_port {
> > >  
> > >  	struct net_device		*bridge;
> > >  	u8				stp_state;
> > > +
> > > +	u8				ifh[OCELOT_TAG_LEN];
> > >  };
> > >  
> > >  struct ocelot {
> > > @@ -754,6 +757,8 @@ void __ocelot_target_write_ix(struct ocelot
> > > *ocelot, enum ocelot_target target, bool ocelot_can_inject(struct
> > > ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct
> > > ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb);
> > > +void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32
> > > rew_op,
> > > +			 u32 vlan_tag);
> > >  int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct
> > > sk_buff **skb); void ocelot_drain_cpu_queue(struct ocelot *ocelot,
> > > int grp); 
> > > -- 
> > > 2.33.0  
> >     
> 
> 
>
Vladimir Oltean Nov. 15, 2021, 10:51 a.m. UTC | #4
On Mon, Nov 15, 2021 at 11:13:44AM +0100, Clément Léger wrote:
> Le Wed, 3 Nov 2021 14:53:51 +0100,
> Clément Léger <clement.leger@bootlin.com> a écrit :
> 
> > Le Wed, 3 Nov 2021 12:38:12 +0000,
> > Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :
> > 
> > > On Wed, Nov 03, 2021 at 10:19:40AM +0100, Clément Léger wrote:  
> > > > IFH preparation can take quite some time on slow processors (up to
> > > > 5% in a iperf3 test for instance). In order to reduce the cost of
> > > > this preparation, pre-compute IFH since most of the parameters are
> > > > fixed per port. Only rew_op and vlan tag will be set when sending
> > > > if different than 0. This allows to remove entirely the calls to
> > > > packing() with basic usage. In the same time, export this function
> > > > that will be used by FDMA.
> > > > 
> > > > Signed-off-by: Clément Léger <clement.leger@bootlin.com>
> > > > ---    
> > > 
> > > Honestly, this feels a bit cheap/gimmicky, and not really the
> > > fundamental thing to address. In my testing of a similar idea (see
> > > commits 67c2404922c2 ("net: dsa: felix: create a template for the DSA
> > > tags on xmit") and then 7c4bb540e917 ("net: dsa: tag_ocelot: create
> > > separate tagger for Seville"), the net difference is not that stark,
> > > considering that now you need to access one more memory region which
> > > you did not need before, do a memcpy, and then patch the IFH anyway
> > > for the non-constant stuff.  
> > 
> > The memcpy is neglectable and the patching happens only in a few
> > cases (at least vs the packing function call). The VSC7514 CPU is really
> > slow and lead to 2.5% up to 5% time spent in packing() when using iperf3
> > and depending on the use case (according to ftrace).
> > 
> > > 
> > > Certainly, for the calls to ocelot_port_inject_frame() from DSA, I
> > > would prefer not having this pre-computed IFH.
> > > 
> > > Could you provide some before/after performance numbers and perf
> > > counters?  
> > 
> > I will make another round of measure to confirm my previous number and
> > check the impact on the injection rate on ocelot.
> 
> I checked again my bandwith numbers (obtained with iperf3) with and
> without the pre-computed header:
> 
> Test on standard packets with UDP (iperf3 -t 100 -l 1460 -u -b 0 -c *)
> - With pre-computed header: UDP TX: 	33Mbit/s
> - Without UDP TX: 			31Mbit/s
> -> 6.5% improvement
> 
> Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
> - With pre-computed header: UDP TX: 	15.8Mbit/s
> - Without UDP TX: 			16.4Mbit/s
> -> 4.3% improvement
> 
> The improvement might not be huge but also not negligible at all.
> Please tell me if you want me to drop it or not based on those numbers.

Is this with manual injection or with FDMA? Do you have before/after
numbers with FDMA as well? At 31 vs 33 Mbps, this isn't going to compete
for any races anyway :)
Clément Léger Nov. 15, 2021, 10:58 a.m. UTC | #5
Le Mon, 15 Nov 2021 10:51:45 +0000,
Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :

> > I checked again my bandwith numbers (obtained with iperf3) with and
> > without the pre-computed header:
> > 
> > Test on standard packets with UDP (iperf3 -t 100 -l 1460 -u -b 0 -c *)
> > - With pre-computed header: UDP TX: 	33Mbit/s
> > - Without UDP TX: 			31Mbit/s  
> > -> 6.5% improvement  
> > 
> > Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
> > - With pre-computed header: UDP TX: 	15.8Mbit/s
> > - Without UDP TX: 			16.4Mbit/s  
> > -> 4.3% improvement  
> > 
> > The improvement might not be huge but also not negligible at all.
> > Please tell me if you want me to drop it or not based on those numbers.  
> 
> Is this with manual injection or with FDMA? Do you have before/after
> numbers with FDMA as well? At 31 vs 33 Mbps, this isn't going to compete
> for any races anyway :)

These numbers were for the FDMA, with the CPU, its even much lower
because more time is spent to push bytes through registers...
But agreed with that, this isn't going to beat any records !
Clément Léger Nov. 15, 2021, 2:06 p.m. UTC | #6
Le Mon, 15 Nov 2021 06:08:00 -0800,
Jakub Kicinski <kuba@kernel.org> a écrit :

> On Mon, 15 Nov 2021 11:13:44 +0100 Clément Léger wrote:
> > Test on standard packets with UDP (iperf3 -t 100 -l 1460 -u -b 0 -c *)
> > - With pre-computed header: UDP TX: 	33Mbit/s
> > - Without UDP TX: 			31Mbit/s  
> > -> 6.5% improvement    
> > 
> > Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
> > - With pre-computed header: UDP TX: 	15.8Mbit/s
> > - Without UDP TX: 			16.4Mbit/s  
> > -> 4.3% improvement    
> 
> Something's wrong with these numbers or I'm missing context.
> You say improvement in both cases yet in the latter case the 
> new number is lower?

You are right Jakub, I swapped the last two results, 

Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
 - With pre-computed header: UDP TX: 	16.4Mbit/s 
 - Without UDP TX: 			15.8Mbit/s
 -> 4.3% improvement
Jakub Kicinski Nov. 15, 2021, 2:08 p.m. UTC | #7
On Mon, 15 Nov 2021 11:13:44 +0100 Clément Léger wrote:
> Test on standard packets with UDP (iperf3 -t 100 -l 1460 -u -b 0 -c *)
> - With pre-computed header: UDP TX: 	33Mbit/s
> - Without UDP TX: 			31Mbit/s
> -> 6.5% improvement  
> 
> Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
> - With pre-computed header: UDP TX: 	15.8Mbit/s
> - Without UDP TX: 			16.4Mbit/s
> -> 4.3% improvement  

Something's wrong with these numbers or I'm missing context.
You say improvement in both cases yet in the latter case the 
new number is lower?
Vladimir Oltean Nov. 15, 2021, 2:31 p.m. UTC | #8
On Mon, Nov 15, 2021 at 03:06:20PM +0100, Clément Léger wrote:
> Le Mon, 15 Nov 2021 06:08:00 -0800,
> Jakub Kicinski <kuba@kernel.org> a écrit :
>
> > On Mon, 15 Nov 2021 11:13:44 +0100 Clément Léger wrote:
> > > Test on standard packets with UDP (iperf3 -t 100 -l 1460 -u -b 0 -c *)
> > > - With pre-computed header: UDP TX: 	33Mbit/s
> > > - Without UDP TX: 			31Mbit/s
> > > -> 6.5% improvement
> > >
> > > Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
> > > - With pre-computed header: UDP TX: 	15.8Mbit/s
> > > - Without UDP TX: 			16.4Mbit/s
> > > -> 4.3% improvement
> >
> > Something's wrong with these numbers or I'm missing context.
> > You say improvement in both cases yet in the latter case the
> > new number is lower?
>
> You are right Jakub, I swapped the last two results,
>
> Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
>  - With pre-computed header: UDP TX: 	16.4Mbit/s
>  - Without UDP TX: 			15.8Mbit/s
>  -> 4.3% improvement

Even in reverse, something still seems wrong with the numbers.
My DSPI controller can transfer at a higher data rate than that.
Where is the rest of the time spent? Computing checksums?
Clément Léger Nov. 15, 2021, 4:03 p.m. UTC | #9
Le Mon, 15 Nov 2021 14:31:06 +0000,
Vladimir Oltean <vladimir.oltean@nxp.com> a écrit :

> On Mon, Nov 15, 2021 at 03:06:20PM +0100, Clément Léger wrote:
> > Le Mon, 15 Nov 2021 06:08:00 -0800,
> > Jakub Kicinski <kuba@kernel.org> a écrit :
> >  
> > > On Mon, 15 Nov 2021 11:13:44 +0100 Clément Léger wrote:  
> > > > Test on standard packets with UDP (iperf3 -t 100 -l 1460 -u -b 0 -c *)
> > > > - With pre-computed header: UDP TX: 	33Mbit/s
> > > > - Without UDP TX: 			31Mbit/s  
> > > > -> 6.5% improvement  
> > > >
> > > > Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
> > > > - With pre-computed header: UDP TX: 	15.8Mbit/s
> > > > - Without UDP TX: 			16.4Mbit/s  
> > > > -> 4.3% improvement  
> > >
> > > Something's wrong with these numbers or I'm missing context.
> > > You say improvement in both cases yet in the latter case the
> > > new number is lower?  
> >
> > You are right Jakub, I swapped the last two results,
> >
> > Test on small packets with UDP (iperf3 -t 100 -l 700 -u -b 0 -c *)
> >  - With pre-computed header: UDP TX: 	16.4Mbit/s
> >  - Without UDP TX: 			15.8Mbit/s  
> >  -> 4.3% improvement  
> 
> Even in reverse, something still seems wrong with the numbers.
> My DSPI controller can transfer at a higher data rate than that.
> Where is the rest of the time spent? Computing checksums?

While adding FDMA support, I was surprised by the low performances I
encountered so I spent some times trying to understand and find where
the time was spent. First, I ran a iperf in loopback (using lo) and it
yielded the following results (of course RX/TX runs on the same CPU in
this case):

TCP (iperf3 -c localhost):
 - RX/TX: 84.0Mbit/s

UDP (iperf3 -u -b 0 -c localhost):
 - RX/TX: 65.0Mbit/s

So even in localhost mode, the CPU is already really slow and can only
sustain a really small "throughput". I then tried to check the
performances using the CPU based injection/extraction, and I obtained
the following results:

TCP: (iperf3 -u -b 0 -c)
 - TX: 11.8MBit/s
 - RX: 21.6Mbit/s

UDP (iperf3 -u -b 0 -c)
 - TX: 13.4Mbit/s
 - RX: Not even possible, CPU never succeed to extract a single packet

I then tried to find where was the time spent with ftrace (I kept only
the relevant functions that consume most of the time), the following
results were recorded when using iperf3 with CPU based
injection/extraction.

In TCP TX, a lot of time is spent doing copy from user:

41.71%  iperf3   [kernel.kallsyms]  [k] __raw_copy_to_user
 6.65%  iperf3   [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
 3.23%  iperf3   [kernel.kallsyms]  [k] do_ade
 2.10%  iperf3   [kernel.kallsyms]  [k] __ocelot_write_ix
 2.10%  iperf3   [kernel.kallsyms]  [k] handle_adel_int
 ...

In TCP RX, numbers are even worse for the time spent in
__raw_copy_to_user:

62.95% iperf3   [kernel.kallsyms]  [k] __raw_copy_to_user
 1.97% iperf3   [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
 1.15% iperf3   [kernel.kallsyms]  [k] __copy_page_start
 1.07% iperf3   [kernel.kallsyms]  [k] __skb_datagram_iter
 ...


In UDP TX, some time is spent handling locking and unaligned copies
as well as pushing packets. Unaligned copies are due to the driver
accessing all directly the bytes of the packets as word whhich might be
bad when there is misalignement.

17.97%  iperf3   [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
11.94%  iperf3   [kernel.kallsyms]  [k] do_ade
 9.07%  iperf3   [kernel.kallsyms]  [k] __ocelot_write_ix
 7.74%  iperf3   [kernel.kallsyms]  [k] handle_adel_int
 5.78%  iperf3   [kernel.kallsyms]  [k] copy_from_kernel_nofault
 4.71%  iperf3   [kernel.kallsyms]  [k] __compute_return_epc_for_insn
 2.51%  iperf3   [kernel.kallsyms]  [k] regmap_write
 2.31%  iperf3   [kernel.kallsyms]  [k] __compute_return_epc
 ...

In UDP RX (iperf3 with -b 5M to ensure packets are received), time is
spent in floating point emulation and other various function.

7.26% iperf3   [kernel.kallsyms]  [k] cop1Emulate
2.84% iperf3   [kernel.kallsyms]  [k] do_select
2.08% iperf3   [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
2.06% iperf3   [kernel.kallsyms]  [k] fpu_emulator_cop1Handler
2.01% iperf3   [kernel.kallsyms]  [k] tcp_poll
2.00% iperf3   [kernel.kallsyms]  [k] __raw_copy_to_user


When using the FDMA, the results are the following:

In TCP TX, copy from user is still present and checksuming takes quite
some time. 

31.31% iperf3   [kernel.kallsyms]  [k] __raw_copy_to_user
10.48% iperf3   [kernel.kallsyms]  [k] __csum_partial_copy_to_user
 3.73% iperf3   [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
 2.08% iperf3   [kernel.kallsyms]  [k] tcp_ack
 1.68% iperf3   [kernel.kallsyms]  [k] ocelot_fdma_napi_poll
 1.63% iperf3   [kernel.kallsyms]  [k] tcp_write_xmit
 1.05% iperf3   [kernel.kallsyms]  [k] finish_task_switch

In TCP RX, the majority of time is still taken by __raw_copy_to_user.

63.95%[[m  iperf3   [kernel.kallsyms]  [k] __raw_copy_to_user
 1.29%[[m  iperf3   [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
 1.23%[[m  iperf3   [kernel.kallsyms]  [k] tcp_recvmsg_locked
 1.23%[[m  iperf3   [kernel.kallsyms]  [k] __skb_datagram_iter
 1.07%[[m  iperf3   [kernel.kallsyms]  [k] vfs_read

In UDP TX, time is spent in softirq entry and in checksuming.

9.01% iperf3   [kernel.kallsyms]  [k] __softirqentry_text_start
7.07% iperf3   [kernel.kallsyms]  [k] __csum_partial_copy_to_user
2.28% iperf3   [kernel.kallsyms]  [k] __ip_append_data.isra.0
2.10% iperf3   [kernel.kallsyms]  [k] __dev_queue_xmit
2.08% iperf3   [kernel.kallsyms]  [k] siphash_3u32
2.06% iperf3   [kernel.kallsyms]  [k] udp_sendmsg

And in UDP RX, again, time is spent in floating point emulation and
cheksuming.

10.33% iperf3   [kernel.kallsyms]  [k] cop1Emulate
 7.62% iperf3   [kernel.kallsyms]  [k] csum_partial
 3.32% iperf3   [kernel.kallsyms]  [k] do_select
 2.69% iperf3   [kernel.kallsyms]  [k] ieee754dp_sub
 2.68% iperf3   [kernel.kallsyms]  [k] fpu_emulator_cop1Handler
 2.56% iperf3   [kernel.kallsyms]  [k] ieee754dp_add
 2.33% iperf3   [kernel.kallsyms]  [k] ieee754dp_div

After all these measurements, the CPU appears to be the bottleneck and
simply spend a lot of time in various functions. I did not went further
using perf events since there was no real reason to dig up more in that
way.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index e6c18b598d5c..97693772595b 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1076,20 +1076,29 @@  bool ocelot_can_inject(struct ocelot *ocelot, int grp)
 }
 EXPORT_SYMBOL(ocelot_can_inject);
 
+void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
+			 u32 vlan_tag)
+{
+	memcpy(ifh, port->ifh, OCELOT_TAG_LEN);
+
+	if (vlan_tag)
+		ocelot_ifh_set_vlan_tci(ifh, vlan_tag);
+	if (rew_op)
+		ocelot_ifh_set_rew_op(ifh, rew_op);
+}
+EXPORT_SYMBOL(ocelot_ifh_port_set);
+
 void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
 			      u32 rew_op, struct sk_buff *skb)
 {
+	struct ocelot_port *port_s = ocelot->ports[port];
 	u32 ifh[OCELOT_TAG_LEN / 4] = {0};
 	unsigned int i, count, last;
 
 	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
 			 QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
 
-	ocelot_ifh_set_bypass(ifh, 1);
-	ocelot_ifh_set_dest(ifh, BIT_ULL(port));
-	ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C);
-	ocelot_ifh_set_vlan_tci(ifh, skb_vlan_tag_get(skb));
-	ocelot_ifh_set_rew_op(ifh, rew_op);
+	ocelot_ifh_port_set(ifh, port_s, rew_op, skb_vlan_tag_get(skb));
 
 	for (i = 0; i < OCELOT_TAG_LEN / 4; i++)
 		ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
@@ -2128,6 +2137,10 @@  void ocelot_init_port(struct ocelot *ocelot, int port)
 
 	skb_queue_head_init(&ocelot_port->tx_skbs);
 
+	ocelot_ifh_set_bypass(ocelot_port->ifh, 1);
+	ocelot_ifh_set_dest(ocelot_port->ifh, BIT_ULL(port));
+	ocelot_ifh_set_tag_type(ocelot_port->ifh, IFH_TAG_TYPE_C);
+
 	/* Basic L2 initialization */
 
 	/* Set MAC IFG Gaps
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index fef3a36b0210..b3381c90ff3e 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -6,6 +6,7 @@ 
 #define _SOC_MSCC_OCELOT_H
 
 #include <linux/ptp_clock_kernel.h>
+#include <linux/dsa/ocelot.h>
 #include <linux/net_tstamp.h>
 #include <linux/if_vlan.h>
 #include <linux/regmap.h>
@@ -623,6 +624,8 @@  struct ocelot_port {
 
 	struct net_device		*bridge;
 	u8				stp_state;
+
+	u8				ifh[OCELOT_TAG_LEN];
 };
 
 struct ocelot {
@@ -754,6 +757,8 @@  void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
 bool ocelot_can_inject(struct ocelot *ocelot, int grp);
 void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
 			      u32 rew_op, struct sk_buff *skb);
+void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
+			 u32 vlan_tag);
 int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
 void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);