diff mbox series

[net-next,v4,11/11] net: dsa: realtek: rtl8365mb: multiple cpu ports, non cpu extint

Message ID 20220105031515.29276-12-luizluca@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series net: dsa: realtek: MDIO interface and RTL8367S | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1 this patch: 0
netdev/cc_maintainers warning 2 maintainers not CCed: davem@davemloft.net kuba@kernel.org
netdev/build_clang success Errors and warnings before: 2 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1 this patch: 0
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Luiz Angelo Daros de Luca Jan. 5, 2022, 3:15 a.m. UTC
Now CPU port is not limited to a single port. Also, extint can be used
as non-cpu ports, as long as it defines realtek,ext-int. The last cpu
port will be used as trap_port.

The CPU information was dropped from chip data as it was not used
outside setup. The only other place it was used is when it wrongly
checks for CPU port when it should check for extint.

realtek_priv->cpu_port is now only used by rtl8366rb.c

Signed-off-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
---
 drivers/net/dsa/realtek/rtl8365mb.c | 53 +++++++++++++++--------------
 1 file changed, 27 insertions(+), 26 deletions(-)

Comments

Alvin Šipraga Jan. 10, 2022, 1:39 p.m. UTC | #1
Luiz Angelo Daros de Luca <luizluca@gmail.com> writes:

> Now CPU port is not limited to a single port. Also, extint can be used
> as non-cpu ports, as long as it defines realtek,ext-int. The last cpu
> port will be used as trap_port.
>
> The CPU information was dropped from chip data as it was not used
> outside setup. The only other place it was used is when it wrongly
> checks for CPU port when it should check for extint.
>
> realtek_priv->cpu_port is now only used by rtl8366rb.c

Great work with this series! If I understood correctly from your last
emails, you weren't actually able to test this due to hardware
constraints. While I think this change is not going to introduce any
surprises, I think you should still mention that it is not tested.

Some more comments below but in general the change makes sense to me.

>
> Signed-off-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
> ---
>  drivers/net/dsa/realtek/rtl8365mb.c | 53 +++++++++++++++--------------
>  1 file changed, 27 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/net/dsa/realtek/rtl8365mb.c b/drivers/net/dsa/realtek/rtl8365mb.c
> index 59e08b192c06..6a00a162b2ac 100644
> --- a/drivers/net/dsa/realtek/rtl8365mb.c
> +++ b/drivers/net/dsa/realtek/rtl8365mb.c
> @@ -556,7 +556,6 @@ struct rtl8365mb_port {
>   * @chip_ver: chip silicon revision
>   * @port_mask: mask of all ports
>   * @learn_limit_max: maximum number of L2 addresses the chip can learn
> - * @cpu: CPU tagging and CPU port configuration for this chip
>   * @mib_lock: prevent concurrent reads of MIB counters
>   * @ports: per-port data
>   * @jam_table: chip-specific initialization jam table
> @@ -571,7 +570,6 @@ struct rtl8365mb {
>  	u32 chip_ver;
>  	u32 port_mask;
>  	u32 learn_limit_max;
> -	struct rtl8365mb_cpu cpu;
>  	struct mutex mib_lock;
>  	struct rtl8365mb_port ports[RTL8365MB_MAX_NUM_PORTS];
>  	const struct rtl8365mb_jam_tbl_entry *jam_table;
> @@ -769,17 +767,20 @@ static int rtl8365mb_ext_config_rgmii(struct realtek_priv *priv, int port,
>  	u32 val;
>  	int ret;
>  
> -	if (port != priv->cpu_port) {
> -		dev_err(priv->dev, "only one EXT interface is currently supported\n");
> +	mb = priv->chip_data;
> +	p = &mb->ports[port];
> +	ext_int = p->ext_int;
> +
> +	if (ext_int == RTL8365MB_NOT_EXT) {
> +		dev_err(priv->dev,
> +			"Port %d is not identified as extenal interface.\n",

Maybe just a warning?
also: s/as extenal/as an external/

> +			port);
>  		return -EINVAL;
>  	}
>  
>  	dp = dsa_to_port(priv->ds, port);
>  	dn = dp->dn;
>  
> -	mb = priv->chip_data;
> -	p = &mb->ports[port];
> -	ext_int = p->ext_int;
>  
>  	/* Set the RGMII TX/RX delay
>  	 *
> @@ -859,15 +860,17 @@ static int rtl8365mb_ext_config_forcemode(struct realtek_priv *priv, int port,
>  	int val;
>  	int ret;
>  
> -	if (port != priv->cpu_port) {
> -		dev_err(priv->dev, "only one EXT interface is currently supported\n");
> -		return -EINVAL;
> -	}
> -
>  	mb = priv->chip_data;
>  	p = &mb->ports[port];
>  	ext_int = p->ext_int;
>  
> +	if (ext_int == RTL8365MB_NOT_EXT) {
> +		dev_err(priv->dev,
> +			"Port %d is not identified as extenal interface.\n",

ditto

> +			port);
> +		return -EINVAL;
> +	}
> +
>  	if (link) {
>  		/* Force the link up with the desired configuration */
>  		r_link = 1;
> @@ -1734,10 +1737,8 @@ static void rtl8365mb_irq_teardown(struct realtek_priv *priv)
>  	}
>  }
>  
> -static int rtl8365mb_cpu_config(struct realtek_priv *priv)
> +static int rtl8365mb_cpu_config(struct realtek_priv *priv, struct rtl8365mb_cpu *cpu)

const struct rtl8365mb_cpu?

>  {
> -	struct rtl8365mb *mb = priv->chip_data;
> -	struct rtl8365mb_cpu *cpu = &mb->cpu;
>  	u32 val;
>  	int ret;
>  
> @@ -1839,11 +1840,17 @@ static int rtl8365mb_setup(struct dsa_switch *ds)
>  		dev_info(priv->dev, "no interrupt support\n");
>  
>  	/* Configure CPU tagging */
> +	cpu.mask = 0;

I guess the unused cpu variable in the earlier patch belongs in this
one, in which case you can just initialize it = { 0 } so that you don't
need to explicitly set cpu.mask = 0.

>  	dsa_switch_for_each_cpu_port(cpu_dp, priv->ds) {
> -		priv->cpu_port = cpu_dp->index;
> -		mb->cpu.mask = BIT(priv->cpu_port);
> -		mb->cpu.trap_port = priv->cpu_port;
> -		ret = rtl8365mb_cpu_config(priv);
> +		cpu.enable = 1;
> +		cpu.insert = RTL8365MB_CPU_INSERT_TO_ALL;
> +		cpu.position = RTL8365MB_CPU_POS_AFTER_SA;
> +		cpu.rx_length = RTL8365MB_CPU_RXLEN_64BYTES;
> +		cpu.format = RTL8365MB_CPU_FORMAT_8BYTES;
> +		cpu.trap_port = cpu_dp->index;

If you are going to do this, perhaps it's better specified as a device
tree property like the external interface index? Making the "last" CPU
port the trap port is not incorrect, but it seems quite arbitrary.

> +		cpu.mask |= BIT(cpu_dp->index);
> +
> +		ret = rtl8365mb_cpu_config(priv, &cpu);

Shouldn't this go outside the loop to avoid potentially calling it twice
in a row?

>  		if (ret)
>  			goto out_teardown_irq;
>  
> @@ -1862,7 +1869,7 @@ static int rtl8365mb_setup(struct dsa_switch *ds)
>  		dn = dsa_to_port(priv->ds, i)->dn;
>  
>  		/* Forward only to the CPU */
> -		ret = rtl8365mb_port_set_isolation(priv, i, BIT(priv->cpu_port));
> +		ret = rtl8365mb_port_set_isolation(priv, i, cpu.mask);
>  		if (ret)
>  			goto out_teardown_irq;
>  
> @@ -2003,12 +2010,6 @@ static int rtl8365mb_detect(struct realtek_priv *priv)
>  		mb->jam_table = rtl8365mb_init_jam_8365mb_vc;
>  		mb->jam_size = ARRAY_SIZE(rtl8365mb_init_jam_8365mb_vc);
>  
> -		mb->cpu.enable = 1;
> -		mb->cpu.insert = RTL8365MB_CPU_INSERT_TO_ALL;
> -		mb->cpu.position = RTL8365MB_CPU_POS_AFTER_SA;
> -		mb->cpu.rx_length = RTL8365MB_CPU_RXLEN_64BYTES;
> -		mb->cpu.format = RTL8365MB_CPU_FORMAT_8BYTES;
> -
>  		break;
>  	default:
>  		dev_err(priv->dev,
Frank Wunderlich Jan. 10, 2022, 1:53 p.m. UTC | #2
Hi,

i have 2 devices here i currently try this series.

1x Bananapi R64 v0.1 (mt7622 SOC) with rtl8367s (sgmii+rgmii) - configured to use extport 2 in rgmii mode
1x Bananapi R2 Pro v0 (rk3568 SOC) with rtl8367RB (rgmii+rgmii) - configured to use extport 1 in rgmii mode

on both devices i get mdio running after additional reset in probe and ports are
recognizing link up (got the real port-reg-mapping)

on r64 i get pings working but tcp (ssh, http) seems not working.
on r2pro i cannot get even ping working (but rk3568 gmac seems to come up).

but i'm not deep enough in driver coding to find out whats wrong not having technical documents for checking registers to values needed.

so i need support from anyone to test it further, but devices are here ;)

regards Frank

> Gesendet: Montag, 10. Januar 2022 um 14:39 Uhr
> Von: "Alvin Šipraga" <ALSI@bang-olufsen.dk>
> Great work with this series! If I understood correctly from your last
> emails, you weren't actually able to test this due to hardware
> constraints. While I think this change is not going to introduce any
> surprises, I think you should still mention that it is not tested.
Alvin Šipraga Jan. 11, 2022, 6:17 p.m. UTC | #3
Frank Wunderlich <frank-w@public-files.de> writes:

> Hi,
>
> i have 2 devices here i currently try this series.
>
> 1x Bananapi R64 v0.1 (mt7622 SOC) with rtl8367s (sgmii+rgmii) - configured to use extport 2 in rgmii mode
> 1x Bananapi R2 Pro v0 (rk3568 SOC) with rtl8367RB (rgmii+rgmii) - configured to use extport 1 in rgmii mode
>
> on both devices i get mdio running after additional reset in probe and ports are
> recognizing link up (got the real port-reg-mapping)
>
> on r64 i get pings working but tcp (ssh, http) seems not working.
> on r2pro i cannot get even ping working (but rk3568 gmac seems to come
> up).

Luiz, any comments regarding this? I suppose if the chip ID/revision is
the same for both 67S and 67RB, they should work pretty much the same,
right?

>
> but i'm not deep enough in driver coding to find out whats wrong not having technical documents for checking registers to values needed.

Ping working but TCP not working is a bit strange. You could check the
output of ethtool -S and see if that meets your expectations. If you
have a relatively modern ethtool you can also append --all-groups to the
command to get a more standard output.

You can also try adjusting the RGMII TX/RX delay and pause settings -
that might help for the R2 where you aren't getting any packets
through.
Frank Wunderlich Jan. 11, 2022, 6:45 p.m. UTC | #4
Hi,

> Gesendet: Dienstag, 11. Januar 2022 um 19:17 Uhr
> Von: "Alvin Šipraga" <ALSI@bang-olufsen.dk>

> Luiz, any comments regarding this? I suppose if the chip ID/revision is
> the same for both 67S and 67RB, they should work pretty much the same,
> right?

my phy driver is same for both devices and afaik only do different RX/TX delays. With the chip-rev-patch 0x0020 i can init the switch, but have no technical documentation except the phy driver code.

> Ping working but TCP not working is a bit strange. You could check the
> output of ethtool -S and see if that meets your expectations. If you
> have a relatively modern ethtool you can also append --all-groups to the
> command to get a more standard output.

as far as i see in tcpdump (suggested by luiz) on target it is a checksum error where checksum is always 0x8382 (maybe some kind of fixed tag).

16:39:07.994825 IP (tos 0x10, ttl 64, id 54002, offset 0, flags [DF], proto TCP (6), length 60)
    192.168.1.2.43284 > 192.168.1.1.22: Flags [S], cksum 0x8382 (incorrect -> 0xa6f6), seq 3231275121, win 64240, options [mss 1460,sackOK,TS val 1615921214 ecr 0,nop,wscale 7], length 0
16:39:12.154790 IP (tos 0x10, ttl 64, id 54003, offset 0, flags [DF], proto TCP (6), length 60)
    192.168.1.2.43284 > 192.168.1.1.22: Flags [S], cksum 0x8382 (incorrect -> 0x96b6), seq 3231275121, win 64240, options [mss 1460,sackOK,TS val 1615925374 ecr 0,nop,wscale 7], length 0

> You can also try adjusting the RGMII TX/RX delay and pause settings -
> that might help for the R2 where you aren't getting any packets
> through.

r2pro i got working by setting both delays to 0 as phy-driver does the same (after some calculation).

on r64 this is a bit more tricky, because the phy driver uses  tx=1 and rx=3 with this calculation for reg-value

regData = (regData & 0xFFF0) | ((txDelay << 3) & 0x0008) | (rxDelay & 0x0007);

but in dts i need the values in picoseconds (?) and here i do not know how to calculate them

regards Frank
Alvin Šipraga Jan. 13, 2022, 12:37 p.m. UTC | #5
Frank Wunderlich <frank-w@public-files.de> writes:

> Hi,
>
>> Gesendet: Dienstag, 11. Januar 2022 um 19:17 Uhr
>> Von: "Alvin Šipraga" <ALSI@bang-olufsen.dk>
>
>> Luiz, any comments regarding this? I suppose if the chip ID/revision is
>> the same for both 67S and 67RB, they should work pretty much the same,
>> right?
>
> my phy driver is same for both devices and afaik only do different
> RX/TX delays. With the chip-rev-patch 0x0020 i can init the switch,
> but have no technical documentation except the phy driver code.
>
>> Ping working but TCP not working is a bit strange. You could check the
>> output of ethtool -S and see if that meets your expectations. If you
>> have a relatively modern ethtool you can also append --all-groups to the
>> command to get a more standard output.
>
> as far as i see in tcpdump (suggested by luiz) on target it is a checksum error where checksum is always 0x8382 (maybe some kind of fixed tag).
>
> 16:39:07.994825 IP (tos 0x10, ttl 64, id 54002, offset 0, flags [DF], proto TCP (6), length 60)
>     192.168.1.2.43284 > 192.168.1.1.22: Flags [S], cksum 0x8382
> (incorrect -> 0xa6f6), seq 3231275121, win 64240, options [mss
> 1460,sackOK,TS val 1615921214 ecr 0,nop,wscale 7], length 0
> 16:39:12.154790 IP (tos 0x10, ttl 64, id 54003, offset 0, flags [DF], proto TCP (6), length 60)
>     192.168.1.2.43284 > 192.168.1.1.22: Flags [S], cksum 0x8382
> (incorrect -> 0x96b6), seq 3231275121, win 64240, options [mss
> 1460,sackOK,TS val 1615925374 ecr 0,nop,wscale 7], length 0

That's weird, I must admit I do not recognize this issue at all. Try
dumping the whole packet with -x and maybe you can see what kind of data
you are getting.

>
>> You can also try adjusting the RGMII TX/RX delay and pause settings -
>> that might help for the R2 where you aren't getting any packets
>> through.
>
> r2pro i got working by setting both delays to 0 as phy-driver does the same (after some calculation).
>
> on r64 this is a bit more tricky, because the phy driver uses  tx=1 and rx=3 with this calculation for reg-value
>
> regData = (regData & 0xFFF0) | ((txDelay << 3) & 0x0008) | (rxDelay & 0x0007);
>
> but in dts i need the values in picoseconds (?) and here i do not know
> how to calculate them

Try:

    tx-internal-delay-ps = <2000>;
    rx-internal-delay-ps = <1000>;

This should correspond to internal values tx=1 and rx=3.

Kind regards,
Alvin
Frank Wunderlich Jan. 13, 2022, 3:56 p.m. UTC | #6
Hi,

the problem is checksum offloading on the gmac (soc-side)

root@bpi-r64:~# ethtool -k eth1 | grep checksum                                                                                                  
rx-checksumming: on                                                                                                                              
tx-checksumming: on                                                                                                                              
        tx-checksum-ipv4: on    #<<<<<<<<<<<<<                           
        tx-checksum-ip-generic: off [fixed]                                                                                                      
        tx-checksum-ipv6: on    #<<<<<<<<<<<<<                          
        tx-checksum-fcoe-crc: off [fixed]                                                                                                        
        tx-checksum-sctp: off [fixed]

in my case i tried ipv4....and after disabling the offload i get a connection

root@bpi-r64:~# ethtool -K eth1 rx off tx off                                                                                                    
Actual changes:                                                                                                                                  
tx-checksum-ipv4: off                                                                                                                            
tx-checksum-ipv6: off                                                                                                                            
tx-tcp-segmentation: off [not requested]                                                                                                         
tx-tcp6-segmentation: off [not requested]                                                                                                        
rx-checksum: off                                                                                                                                 
root@bpi-r64:~# telnet 192.168.1.1 22                                                                                                            
Trying 192.168.1.1...                                                                                                                            
Connected to 192.168.1.1.                                                                                                                        
Escape character is '^]'.                                                                                                                        
SSH-2.0-OpenSSH_8.2p1 Ubuntu-4ubuntu0.3                                                                                                          
^C

regards Frank


> Gesendet: Donnerstag, 13. Januar 2022 um 13:37 Uhr
> Von: "Alvin Šipraga" <ALSI@bang-olufsen.dk>
> An: "Frank Wunderlich" <frank-w@public-files.de>
> Cc: "Luiz Angelo Daros de Luca" <luizluca@gmail.com>, "netdev@vger.kernel.org" <netdev@vger.kernel.org>, "linus.walleij@linaro.org" <linus.walleij@linaro.org>, "andrew@lunn.ch" <andrew@lunn.ch>, "vivien.didelot@gmail.com" <vivien.didelot@gmail.com>, "f.fainelli@gmail.com" <f.fainelli@gmail.com>, "olteanv@gmail.com" <olteanv@gmail.com>, "arinc.unal@arinc9.com" <arinc.unal@arinc9.com>
> Betreff: Re: Aw: Re:  Re: [PATCH net-next v4 11/11] net: dsa: realtek: rtl8365mb: multiple cpu ports, non cpu extint
>
> Frank Wunderlich <frank-w@public-files.de> writes:
> 
> > Hi,
> >
> >> Gesendet: Dienstag, 11. Januar 2022 um 19:17 Uhr
> >> Von: "Alvin Šipraga" <ALSI@bang-olufsen.dk>
> >
> >> Luiz, any comments regarding this? I suppose if the chip ID/revision is
> >> the same for both 67S and 67RB, they should work pretty much the same,
> >> right?
> >
> > my phy driver is same for both devices and afaik only do different
> > RX/TX delays. With the chip-rev-patch 0x0020 i can init the switch,
> > but have no technical documentation except the phy driver code.
> >
> >> Ping working but TCP not working is a bit strange. You could check the
> >> output of ethtool -S and see if that meets your expectations. If you
> >> have a relatively modern ethtool you can also append --all-groups to the
> >> command to get a more standard output.
> >
> > as far as i see in tcpdump (suggested by luiz) on target it is a checksum error where checksum is always 0x8382 (maybe some kind of fixed tag).
> >
> > 16:39:07.994825 IP (tos 0x10, ttl 64, id 54002, offset 0, flags [DF], proto TCP (6), length 60)
> >     192.168.1.2.43284 > 192.168.1.1.22: Flags [S], cksum 0x8382
> > (incorrect -> 0xa6f6), seq 3231275121, win 64240, options [mss
> > 1460,sackOK,TS val 1615921214 ecr 0,nop,wscale 7], length 0
> > 16:39:12.154790 IP (tos 0x10, ttl 64, id 54003, offset 0, flags [DF], proto TCP (6), length 60)
> >     192.168.1.2.43284 > 192.168.1.1.22: Flags [S], cksum 0x8382
> > (incorrect -> 0x96b6), seq 3231275121, win 64240, options [mss
> > 1460,sackOK,TS val 1615925374 ecr 0,nop,wscale 7], length 0
> 
> That's weird, I must admit I do not recognize this issue at all. Try
> dumping the whole packet with -x and maybe you can see what kind of data
> you are getting.

2 example packets from tcpdump (if you still want to see it)

$ sudo tcpdump -i enx00131100063c -vvv -nn -x
tcpdump: listening on enx00131100063c, link-type EN10MB (Ethernet), capture size 262144 bytes


16:43:50.297259 IP (tos 0x10, ttl 64, id 19802, offset 0, flags [DF], proto TCP (6), length 60)
    192.168.1.2.38278 > 192.168.1.1.22: Flags [S], cksum 0x8382 (incorrect -> 0xb704), seq 2565260294, win 64240, options [mss 1460,sackOK,TS val 2917954112 ecr 0,nop,wscale 7], length 0
	0x0000:  4510 003c 4d5a 4000 4006 69fe c0a8 0102
	0x0010:  c0a8 0101 9586 0016 98e6 c406 0000 0000
	0x0020:  a002 faf0 8382 0000 0204 05b4 0402 080a
	0x0030:  adec 7240 0000 0000 0103 0307
16:43:51.324255 IP (tos 0x10, ttl 64, id 19803, offset 0, flags [DF], proto TCP (6), length 60)
    192.168.1.2.38278 > 192.168.1.1.22: Flags [S], cksum 0x8382 (incorrect -> 0xb300), seq 2565260294, win 64240, options [mss 1460,sackOK,TS val 2917955140 ecr 0,nop,wscale 7], length 0
	0x0000:  4510 003c 4d5b 4000 4006 69fd c0a8 0102
	0x0010:  c0a8 0101 9586 0016 98e6 c406 0000 0000
	0x0020:  a002 faf0 8382 0000 0204 05b4 0402 080a
	0x0030:  adec 7644 0000 0000 0103 0307


> >> You can also try adjusting the RGMII TX/RX delay and pause settings -
> >> that might help for the R2 where you aren't getting any packets
> >> through.
> >
> > r2pro i got working by setting both delays to 0 as phy-driver does the same (after some calculation).
> >
> > on r64 this is a bit more tricky, because the phy driver uses  tx=1 and rx=3 with this calculation for reg-value
> >
> > regData = (regData & 0xFFF0) | ((txDelay << 3) & 0x0008) | (rxDelay & 0x0007);
> >
> > but in dts i need the values in picoseconds (?) and here i do not know
> > how to calculate them
> 
> Try:
> 
>     tx-internal-delay-ps = <2000>;
>     rx-internal-delay-ps = <1000>;
> 
> This should correspond to internal values tx=1 and rx=3.

thanks i've found out and used tx=2000 and rx=900 (your 1000 is rounded to 3), but only disabling checksum-offloading fixed the problem. need to look how to make it persistent.

Afaik switch driver does not do any Checksum-handling so problem lies in the SOC ethernet driver (here i guess the mtk_soc_eth.c for mt7622). maybe i find an option to disable the offloading in dts because boards with mt7531 switch  working. maybe DSA-Tag handling can be changed, but this is no breaking point from my POV.

regards Frank
Luiz Angelo Daros de Luca Jan. 18, 2022, 4:58 a.m. UTC | #7
> the problem is checksum offloading on the gmac (soc-side)

I suggested it might be checksum problem because I'm also affected. In
my case, I have an mt7620a SoC connected to the rtl8367s switch. The
OS offloads checksum to HW but the mt7620a cannot calculate the
checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
move the CPU tag to test if the mt7620a will then digest the frame
correctly.

Regards,
Alvin Šipraga Jan. 18, 2022, 10:13 a.m. UTC | #8
Luiz Angelo Daros de Luca <luizluca@gmail.com> writes:

>> the problem is checksum offloading on the gmac (soc-side)
>
> I suggested it might be checksum problem because I'm also affected. In
> my case, I have an mt7620a SoC connected to the rtl8367s switch. The
> OS offloads checksum to HW but the mt7620a cannot calculate the
> checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
> move the CPU tag to test if the mt7620a will then digest the frame
> correctly.

You have two choices:

    enum rtl8365mb_cpu_position {
            RTL8365MB_CPU_POS_AFTER_SA = 0,
            RTL8365MB_CPU_POS_BEFORE_CRC = 1,
    };

I hardcoded it to AFTER_SA but if you find that this solves the problem
for some MACs then it might be worth adding a device tree property for
this to make it configurable. Of course remember to keep it
backward-compatible, and add a note to future travellers in the bindings
that this might solve checksum errors :-)

Kind regards,
Alvin
Andrew Lunn Jan. 18, 2022, 1:20 p.m. UTC | #9
On Tue, Jan 18, 2022 at 01:58:39AM -0300, Luiz Angelo Daros de Luca wrote:
> > the problem is checksum offloading on the gmac (soc-side)
> 
> I suggested it might be checksum problem because I'm also affected. In
> my case, I have an mt7620a SoC connected to the rtl8367s switch. The
> OS offloads checksum to HW but the mt7620a cannot calculate the
> checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
> move the CPU tag to test if the mt7620a will then digest the frame
> correctly.

Some MAC hardware you can tell it where the ether type value is in the
frame. This is often used to skip over the VLAN header, but it can
also be used to skip DSA headers. Check the datasheet for the hardware
and see if there is anything like that.

    Andrew
Vladimir Oltean Jan. 20, 2022, 3:12 p.m. UTC | #10
On Tue, Jan 18, 2022 at 02:20:57PM +0100, Andrew Lunn wrote:
> On Tue, Jan 18, 2022 at 01:58:39AM -0300, Luiz Angelo Daros de Luca wrote:
> > > the problem is checksum offloading on the gmac (soc-side)
> > 
> > I suggested it might be checksum problem because I'm also affected. In
> > my case, I have an mt7620a SoC connected to the rtl8367s switch. The
> > OS offloads checksum to HW but the mt7620a cannot calculate the
> > checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
> > move the CPU tag to test if the mt7620a will then digest the frame
> > correctly.
> 
> Some MAC hardware you can tell it where the ether type value is in the
> frame. This is often used to skip over the VLAN header, but it can
> also be used to skip DSA headers. Check the datasheet for the hardware
> and see if there is anything like that.
> 
>     Andrew

And what is the problem if the hardware cannot calculate the checksum
with an unknown EtherType? Is it the DSA master that drops the packets
in hardware? What is the reported error counter?
Luiz Angelo Daros de Luca Jan. 20, 2022, 11:35 p.m. UTC | #11
> And what is the problem if the hardware cannot calculate the checksum
> with an unknown EtherType? Is it the DSA master that drops the packets
> in hardware? What is the reported error counter?

No, the issue is with outgoing packets and nothing is dropped inside
the DSA device.

If the OS is configured to offload (I'm using OpenWrt.), it will send
a packet with the wrong checksum expecting that the HW will fix that.
After DSA is brought up, the OS is still expecting the HW to calculate
the checksums. However, with the EtherType DSA tag in place, it cannot
understand it anymore, leaving the checksum as is. The DSA switch
(Realtek) passes the packet to the network and the other end receives
a broken packet. Maybe if the DSA knew that the CPU Ethernet HW cannot
handle that DSA tag, it could disable checksums by default. But it is
difficult to foresee how each offload HW will digest each type of CPU
tag.

Is the kernel enabling checksum by default when the driver reports it
is supported? If so, it would be nice to somehow disable offloading
with some kind of device-tree dsa cpu port property.

Regards,

Luiz
Vladimir Oltean Jan. 21, 2022, 2:06 a.m. UTC | #12
On Thu, Jan 20, 2022 at 08:35:54PM -0300, Luiz Angelo Daros de Luca wrote:
> > And what is the problem if the hardware cannot calculate the checksum
> > with an unknown EtherType? Is it the DSA master that drops the packets
> > in hardware? What is the reported error counter?
> 
> No, the issue is with outgoing packets and nothing is dropped inside
> the DSA device.

Ah, sorry, I missed that.

> If the OS is configured to offload (I'm using OpenWrt.), it will send
> a packet with the wrong checksum expecting that the HW will fix that.
> After DSA is brought up, the OS is still expecting the HW to calculate
> the checksums. However, with the EtherType DSA tag in place, it cannot
> understand it anymore, leaving the checksum as is. The DSA switch
> (Realtek) passes the packet to the network and the other end receives
> a broken packet. Maybe if the DSA knew that the CPU Ethernet HW cannot
> handle that DSA tag, it could disable checksums by default. But it is
> difficult to foresee how each offload HW will digest each type of CPU
> tag.
> 
> Is the kernel enabling checksum by default when the driver reports it
> is supported? If so, it would be nice to somehow disable offloading
> with some kind of device-tree dsa cpu port property.

:) device tree properties are not the fix for everything!

I think I know what the problem is. But I'd need to know what the driver
for the DSA master is, to confirm. To be precise, what I'd like to check
is the value of master->vlan_features.
Luiz Angelo Daros de Luca Jan. 21, 2022, 3:13 a.m. UTC | #13
> :) device tree properties are not the fix for everything!

I'm still getting used to it ;-)

In this thread, Alvin suggested adding a new property to define which
port will be used as trap_port instead of using the last CPU port.
Should I try something different?

        switch1 {
               compatible = "realtek,rtl8367s";
               reg = <29>;

               realtek,trap-port = <&port7>;

               ports {
                        ....
                        port7: port@7 {
                            ...
                       };
        };

Should I do something differently?

> I think I know what the problem is. But I'd need to know what the driver
> for the DSA master is, to confirm. To be precise, what I'd like to check
> is the value of master->vlan_features.

Here it is 0x1099513266227 (I hope). Oh, this DSA driver still does
not implement vlan nor bridge offload. Maybe it would matter.

Regards,

Luiz
Florian Fainelli Jan. 21, 2022, 3:22 a.m. UTC | #14
On 1/20/2022 7:13 PM, Luiz Angelo Daros de Luca wrote:
>> :) device tree properties are not the fix for everything!
> 
> I'm still getting used to it ;-)
> 
> In this thread, Alvin suggested adding a new property to define which
> port will be used as trap_port instead of using the last CPU port.
> Should I try something different?
> 
>          switch1 {
>                 compatible = "realtek,rtl8367s";
>                 reg = <29>;
> 
>                 realtek,trap-port = <&port7>;
> 
>                 ports {
>                          ....
>                          port7: port@7 {
>                              ...
>                         };
>          };
> 
> Should I do something differently?
> 
>> I think I know what the problem is. But I'd need to know what the driver
>> for the DSA master is, to confirm. To be precise, what I'd like to check
>> is the value of master->vlan_features.
> 
> Here it is 0x1099513266227 (I hope). Oh, this DSA driver still does
> not implement vlan nor bridge offload. Maybe it would matter.

Are we talking about an in tree driver? If so which is it?
Luiz Angelo Daros de Luca Jan. 21, 2022, 3:42 a.m. UTC | #15
> Are we talking about an in tree driver? If so which is it?

Yes, the one the patch touches: rtl8365mb.

My device uses a mt7620a SoC and traffic passes through its mt7530
switch with vlan disabled before reaching the realtek switch. It still
loads a swconfig driver but I think it might work without one.
I just didn't stop to try it yet.
Florian Fainelli Jan. 21, 2022, 3:50 a.m. UTC | #16
On 1/20/2022 7:42 PM, Luiz Angelo Daros de Luca wrote:
>> Are we talking about an in tree driver? If so which is it?
> 
> Yes, the one the patch touches: rtl8365mb.

I meant the DSA master network device, but you answered that, it uses a 
mt7620a SoC, but there is no Ethernet driver upstream for it yet?

git grep ralink,mt7620-gsw *
Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt: 
compatible = "ralink,mt7620-gsw";

> 
> My device uses a mt7620a SoC and traffic passes through its mt7530
> switch with vlan disabled before reaching the realtek switch. It still
> loads a swconfig driver but I think it might work without one.

Ah so you have a cascade of switches here, that could confuse your 
Ethernet MAC. Do you have a knob to adjust where to calculate the 
checksum from, say a L2 or L3 offset for instance?
Luiz Angelo Daros de Luca Jan. 21, 2022, 4:37 a.m. UTC | #17
> >> Are we talking about an in tree driver? If so which is it?
> >
> > Yes, the one the patch touches: rtl8365mb.
>
> I meant the DSA master network device, but you answered that, it uses a
> mt7620a SoC, but there is no Ethernet driver upstream for it yet?
>
> git grep ralink,mt7620-gsw *
> Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt:
> compatible = "ralink,mt7620-gsw";
>
> >
> > My device uses a mt7620a SoC and traffic passes through its mt7530
> > switch with vlan disabled before reaching the realtek switch. It still
> > loads a swconfig driver but I think it might work without one.
>
> Ah so you have a cascade of switches here, that could confuse your
> Ethernet MAC. Do you have a knob to adjust where to calculate the
> checksum from, say a L2 or L3 offset for instance?

Not that I could find in any docs. I just found registers to set it on
and off. However, Realtek supports two locations for the CPU tag. I'll
try the RTL8365MB_CPU_POS_BEFORE_CRC and hope the checksum will work
as expected. But I might leave that test for a moment after this
series is solved.

> --
> Florian
Arınç ÜNAL Jan. 21, 2022, 9:07 a.m. UTC | #18
On 21/01/2022 06:50, Florian Fainelli wrote:
> 
> 
> On 1/20/2022 7:42 PM, Luiz Angelo Daros de Luca wrote:
>>> Are we talking about an in tree driver? If so which is it?
>>
>> Yes, the one the patch touches: rtl8365mb.
> 
> I meant the DSA master network device, but you answered that, it uses a 
> mt7620a SoC, but there is no Ethernet driver upstream for it yet?
> 
> git grep ralink,mt7620-gsw *
> Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt: 
> compatible = "ralink,mt7620-gsw";
> 
>>
>> My device uses a mt7620a SoC and traffic passes through its mt7530
>> switch with vlan disabled before reaching the realtek switch. It still
>> loads a swconfig driver but I think it might work without one.
> 
> Ah so you have a cascade of switches here, that could confuse your 
> Ethernet MAC. Do you have a knob to adjust where to calculate the 
> checksum from, say a L2 or L3 offset for instance?

The company I currently work for has got their own mt7621a board with an 
external rtl8367s switch.

According to Documentation/devicetree/bindings/net/dsa/mt7530.txt I can 
either connect the rtl switch directly to the second GMAC of the mt7621 
SoC or to MT7530's GMAC5 to create a cascade.

I've been running gregkh/staging staging-next branch but I can't seem to 
have traffic flow on the RGMII2 bus which is shared by the 2nd GMAC of 
the SoC, MT7530's GMAC5 and an external phy (rtl switch in this case).

None of the documented configurations work:
PHY0/4 <-> 2nd GMAC
External phy <-> 2nd GMAC
External phy <-> MT7530's GMAC5

Arınç
Vladimir Oltean Jan. 21, 2022, 6:50 p.m. UTC | #19
On Fri, Jan 21, 2022 at 12:13:58AM -0300, Luiz Angelo Daros de Luca wrote:
> > :) device tree properties are not the fix for everything!
> 
> I'm still getting used to it ;-)
> 
> In this thread, Alvin suggested adding a new property to define which
> port will be used as trap_port instead of using the last CPU port.
> Should I try something different?
> 
>         switch1 {
>                compatible = "realtek,rtl8367s";
>                reg = <29>;
> 
>                realtek,trap-port = <&port7>;
> 
>                ports {
>                         ....
>                         port7: port@7 {
>                             ...
>                        };
>         };
> 
> Should I do something differently?

To clarify, I don't know what a trap_port is. I just saw this
description in rtl8365mb.c:

 * @trap_port: forward trapped frames to this port

but I still don't know which packets this configuration applies to
(where are the packet traps installed, and for what kind of packets).

Speculating here, but it appears quite arbitrary, and I'd guess also
broken, to make the trap_port the last CPU port. Is this also part of
the things which you didn't really test? See commit 8d5f7954b7c8 ("net:
dsa: felix: break at first CPU port during init and teardown") for a
similar issue with this. When there are multiple 'ethernet = <&phandle>'
properties in the device tree, DSA makes the owners of all those
phandles a DSA master, and all those switch ports as CPU ports. But out
of all those CPU ports, only the first one is an active CPU port. The
others have no dp->cpu_dp pointing to them.
See dsa_tree_setup_default_cpu() -> dsa_tree_find_first_cpu().
Even when DSA gets full-blown support for multiple CPU ports, I think
it's safe to say that this default will remain the way it is: a single
CPU port will be active to begin with: the first one. Given that fact
(and depending on what you need to do with the trap_port info exactly),
it might be broken to set as the trap port a CPU port that isn't used.
Stuff like dsa_port_host_fdb_add()/dsa_port_host_fdb_del() will be
broken, because they rely on the dp->cpu_dp association, and
dp->cpu_dp->index will be != trap_port.

> > I think I know what the problem is. But I'd need to know what the driver
> > for the DSA master is, to confirm. To be precise, what I'd like to check
> > is the value of master->vlan_features.
> 
> Here it is 0x1099513266227 (I hope).

That's quite an extraordinary set of vlan_features. In that number, I
notice BIT(2) is set, which corresponds to __UNUSED_NETIF_F_1. So it
probably isn't correctly printed.

This is what I would have liked to see:

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 22241afcac81..b41f1b414c69 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1909,6 +1909,7 @@ void dsa_slave_setup_tagger(struct net_device *slave)
 	p->xmit = cpu_dp->tag_ops->xmit;
 
 	slave->features = master->vlan_features | NETIF_F_HW_TC;
+	netdev_err(slave, "master %s vlan_features 0x%llx\n", master->name, master->vlan_features);
 	slave->hw_features |= NETIF_F_HW_TC;
 	slave->features |= NETIF_F_LLTX;
 	if (slave->needed_tailroom)

And I don't think you fully answered Florian's questions either, really.
Can we see a link to the code of the Ethernet controller whose role
is to be a host port (DSA master) for the rtl8365mb switch? If that DSA
master is a DSA switch itself, could you please unroll the chain all the
way with more links to drivers? No matter whether upstream or downstream,
just what you use.

I hate to guess, but since both you and Arınç have mentioned the
mt7620a/mt7621 SoCs, I'd guess that the top-most DSA driver in both
cases is "mediatek,eth-mac" (drivers/net/ethernet/mediatek/mtk_eth_soc.c).
If so, this would confirm my suspicions, since it sets its vlan_features
to include NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM. Please confirm that
master->vlan_features contains these 2 bits.

> Oh, this DSA driver still does not implement vlan nor bridge offload.
> Maybe it would matter.

It doesn't matter. The vlan_features is a confusing name for what it
really does here. I'll explain in a bit once you clarify the other
things I asked for.
Luiz Angelo Daros de Luca Jan. 21, 2022, 9:51 p.m. UTC | #20
> > I'm still getting used to it ;-)
> >
> > In this thread, Alvin suggested adding a new property to define which
> > port will be used as trap_port instead of using the last CPU port.
> > Should I try something different?
> >
> >         switch1 {
> >                compatible = "realtek,rtl8367s";
> >                reg = <29>;
> >
> >                realtek,trap-port = <&port7>;
> >
> >                ports {
> >                         ....
> >                         port7: port@7 {
> >                             ...
> >                        };
> >         };
> >
> > Should I do something differently?
>
> To clarify, I don't know what a trap_port is. I just saw this
> description in rtl8365mb.c:
>
>  * @trap_port: forward trapped frames to this port
>
> but I still don't know to which packets does this configuration apply
> (where are the packet traps installed, and for what kind of packets).

Thank you, Vladimir.

trap_port seems to be where the switch will send any packet captured
from LAN ports. There are a couple of situations in which it is used, such as:
1) untagged or unmatched vlan packets (if configured to do so)
2) some multicasting packets (Reserved Multicast Address), for some
cases like capturing STP or LACP
3) IGMP and 802.1X EAPOL packets
4) Switch ACL rules that could match a packet and send it to the trap port.

In my early tests, I only saw some IGMP packets trapped to CPU. I also
do not know how important they are.

> Speculating here, but it appears quite arbitrary, and I'd guess also
> broken, to make the trap_port the last CPU port. Is this also part of
> the things which you didn't really test? See commit 8d5f7954b7c8 ("net:
> dsa: felix: break at first CPU port during init and teardown") for a
> similar issue with this. When there are multiple 'ethernet = <&phandle>'
> properties in the device tree, DSA makes the owners of all those
> phandles a DSA master, and all those switch ports as CPU ports. But out
> of all those CPU ports, only the first one is an active CPU port. The
> others have no dp->cpu_dp pointing to them.
> See dsa_tree_setup_default_cpu() -> dsa_tree_find_first_cpu().
> Even when DSA gets full-blown support for multiple CPU ports, I think
> it's safe to say that this default will remain the way it is: a single
> CPU port will be active to begin with: the first one. Given that fact
> (and depending on what you need to do with the trap_port info exactly),
> it might be broken to set as the trap port a CPU port that isn't used.
> Stuff like dsa_port_host_fdb_add()/dsa_port_host_fdb_del() will be
> broken, because they rely on the dp->cpu_dp association, and
> dp->cpu_dp->index will be != trap_port.

Although it would be interesting to have some sniffed traffic sent to
a second CPU port, I agree it might break more things than
it will help. Until multiple CPU ports can be used as first-class
citizens, I'll simply force it to be the first CPU port.

The multiple CPU port is not a target but a byproduct of removing the
assumption that "CPU port" is equal to "external interface port".
The real change is to allow an external interface to be configured,
even if it is not the CPU port, as it could be used to stack a second
switch.
I'll leave the multiple CPU as a note in the commit message and not
the subject. It was wrong to emphasize that.

> > > I think I know what the problem is. But I'd need to know what the driver
> > > for the DSA master is, to confirm. To be precise, what I'd like to check
> > > is the value of master->vlan_features.
> >
> > Here it is 0x1099513266227 (I hope).
>
> That's quite an extraordinary set of vlan_features. In that number, I
> notice BIT(2) is set, which corresponds to __UNUSED_NETIF_F_1. So it
> probably isn't correctly printed.

Oh my... I printed it as an unsigned decimal. Sorry.

>
> This is what I would have liked to see:
>
> diff --git a/net/dsa/slave.c b/net/dsa/slave.c
> index 22241afcac81..b41f1b414c69 100644
> --- a/net/dsa/slave.c
> +++ b/net/dsa/slave.c
> @@ -1909,6 +1909,7 @@ void dsa_slave_setup_tagger(struct net_device *slave)
>         p->xmit = cpu_dp->tag_ops->xmit;
>
>         slave->features = master->vlan_features | NETIF_F_HW_TC;
> +       netdev_err(slave, "master %s vlan_features 0x%llx\n", master->name, master->vlan_features);
>         slave->hw_features |= NETIF_F_HW_TC;
>         slave->features |= NETIF_F_LLTX;
>         if (slave->needed_tailroom)

0x10000190033. If I got it right:

NETIF_F_SG_BIT
NETIF_F_IP_CSUM_BIT
NETIF_F_IPV6_CSUM_BIT
NETIF_F_HIGHDMA_BIT
NETIF_F_GSO_SHIFT
NETIF_F_TSO_MANGLEID_BIT
NETIF_F_TSO6_BIT
NETIF_F_RXCSUM_BIT

> And I don't think you fully answered Florian's questions either, really.
> Can we see a link to the code of the Ethernet controller whose role
> is to be a host port (DSA master) for the rtl8365mb switch?

The code is from the OpenWrt tree.
https://github.com/openwrt/openwrt/tree/master/target/linux/ramips/files/drivers/net/ethernet/ralink

I only patched it to accept Jumbo Frames (it was dropping incoming
packets with MTU 1508)
https://patchwork.ozlabs.org/project/openwrt/list/?series=279773

> If that DSA
> master is a DSA switch itself, could you please unroll the chain all the
> way with more links to drivers? No matter whether upstream or downstream,
> just what you use.

OpenWrt (soc mt7620a) eth0 (mtk_eth_soc) connected to internal SoC
MT7530 switch port 6 (mediatek,mt7620-gsw).
MT7530 port 5 connected to RTL8367S port 7 (RGMII).

The internal SoC switch is behaving as an unmanaged switch, with no
vlans. It would be just extra overhead to have it working as a DSA
switch, especially
as those two switches tags are not compatible. I still have the
swconfig driver installed but I was only using it for some debugging
(checking metrics). I think that the state the bootloader leaves that
switch is enough to make it forward packets to the Realtek switch. In
device-tree conf, I'm directly using that eth0 as the CPU port.

> I hate to guess, but since both you and Arınç have mentioned the
> mt7620a/mt7621 SoCs,

Sorry for the incomplete answer. If it helps, this is my device
https://github.com/luizluca/openwrt/blob/tplink_c5v4_dsa/target/linux/ramips/dts/mt7620a_tplink_archer-c5-v4.dts

I try to keep my remote branch updated, although it has some dirty changes:
https://github.com/luizluca/openwrt/tree/tplink_c5v4_dsa

> I'd guess that the top-most DSA driver in both cases is "mediatek,eth-mac" (drivers/net/ethernet/mediatek/mtk_eth_soc.c).

Not in my case. The driver I use also supports mt7621 but the upstream
driver skipped the mt7620a support.

> If so, this would confirm my suspicions, since it sets its vlan_features
> to include NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM. Please confirm that
> master->vlan_features contains these 2 bits.

Yes.

> > Oh, this DSA driver still does not implement vlan nor bridge offload.
> > Maybe it would matter.
>
> It doesn't matter. The vlan_features is a confusing name for what it
> really does here. I'll explain in a bit once you clarify the other
> things I asked for.

That is good news as we can deal with it independently. I wish to
focus on that afterwards.

Regards,

Luiz
Vladimir Oltean Jan. 21, 2022, 10:49 p.m. UTC | #21
On Fri, Jan 21, 2022 at 06:51:14PM -0300, Luiz Angelo Daros de Luca wrote:
> The code is from the OpenWrt tree.
> https://github.com/openwrt/openwrt/tree/master/target/linux/ramips/files/drivers/net/ethernet/ralink
> 
> I only patched it to accept Jumbo Frames (it was dropping incoming
> packets with MTU 1508)
> https://patchwork.ozlabs.org/project/openwrt/list/?series=279773
> 
> > If that DSA
> > master is a DSA switch itself, could you please unroll the chain all the
> > way with more links to drivers? No matter whether upstream or downstream,
> > just what you use.
> 
> OpenWrt (soc mt7620a) eth0 (mtk_eth_soc) connected to internal SoC
> MT7530 switch port 6 (mediatek,mt7620-gsw).
> MT7530 port 5 connected to RTL8367S port 7 (RGMII).
> 
> The internal SoC switch is behaving as an unmanaged switch, with no
> vlans. It would be just extra overhead to have it working as a DSA
> switch, specially
> as those two switches tags are not compatible. I still have the
> swconfig driver installed but I was only using it for some debugging
> (checking metrics). I think that the state the bootloader leaves that
> switch is enough to make it forward packets to the Realtek switch. In
> device-tree conf, I'm directly using that eth0 as the CPU port.

There could be value in managing the internal switch with DSA too, for
example in a situation like this:

 +-------------------------------------------------+
 |  SoC                                            |
 |                                                 |
 |  +----------------+--------+---------------+    |
 |  |                |        |               |    |
 |  | Internal       |        |               |    |
 |  |  switch        +--------+               |    |
 |  | (dsa,member = <0 0>;)                   |    |
 |  | +-------+ +-------+ +-------+ +-------+ |    |
 |  | |       | |       | |       | |       | |    |
 |  | | sw0p0 | | sw0p1 | | sw0p2 | | sw0p3 | |    |
 |  | |       | |       | |       | |       | |    |
 +--+-+-------+-+-------+-+-------+-+-------+-+----+

 +----+--------+------------------+
 |    |        |                  |
 |    +--------+                  |
 | External switch                |
 | (dsa,member = <1 0>;)          |
 |  +-------+ +-------+ +-------+ |
 |  |       | |       | |       | |
 |  | sw1p0 | | sw1p1 | | sw1p2 | |
 |  |       | |       | |       | |
 +--+-------+-+-------+-+-------+-+

where you'd create a bridge spanning all of sw0p1, sw0p2, sw0p3, sw1p0,
sw1p1, sw1p2. Forwarding between the internal and the external switch is
done in software, and that deals with the "impedance matching" between
the tagging protocols too - first the packet is stripped of the DSA tag
of the ingress switch, then the DSA tag of the egress switch is added.
With a transparent internal switch (no driver), ports sw0p1, sw0p2,
sw0p3 are dead, since if you'd connect them to a PHY, they'd spit out
DSA-tagged packets from the external switch.

> > I hate to guess, but since both you and Arınç have mentioned the
> > mt7620a/mt7621 SoCs,
> 
> Sorry for the incomplete answer. If it helps, this is my device
> https://github.com/luizluca/openwrt/blob/tplink_c5v4_dsa/target/linux/ramips/dts/mt7620a_tplink_archer-c5-v4.dts
> 
> I try to keep my remote branch updated, although it has some dirty changes:
> https://github.com/luizluca/openwrt/tree/tplink_c5v4_dsa
> 
> > I'd guess that the top-most DSA driver in both cases is "mediatek,eth-mac" (drivers/net/ethernet/mediatek/mtk_eth_soc.c).
> 
> Not in my case. The driver I use also supports mt7621 but the upstream
> driver skipped the mt7620a support.
> 
> > If so, this would confirm my suspicions, since it sets its vlan_features
> > to include NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM. Please confirm that
> > master->vlan_features contains these 2 bits.
> 
> Yes.

Ok. See the discussion with Lino Sanfilippo here:
https://lore.kernel.org/netdev/YPAzZXaC%2FEn3s4ly@lunn.ch/
Basically, the moving parts of this mechanism are:

- when the DSA master doesn't understand DSA tags, the transmit
  checksums must be calculated in software.

- this is already supported, we just need to make sure that the DSA
  slave->features does not include any checksum offload bits
  (NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, NETIF_F_IPV6_CSUM), otherwise that
  will be delegated to the device driver. The place that checks that
  condition and calculates the checksum in software is validate_xmit_skb() ->
  skb_csum_hwoffload_help().

- the checksum is evaluated on the skb before the DSA tag is even
  inserted, and is preserved when DSA inserts the header, and is
  therefore still correct by the time the skb reaches the DSA master
  driver. A DSA-unaware master doesn't have to do anything for this
  packet, the IP header checksum will still be correct despite the
  hardware not recognizing the IP header.

- the way DSA populates slave->features is by inheriting master->vlan_features
  (vlan_features means "netdev features which are inheritable by VLAN
  upper interfaces"). This directly makes or breaks what happens in
  validate_xmit_skb() on a DSA slave interface.

- the problem occurs when the DSA master puts checksum offload bits in
  both dev->features and dev->vlan_features. The master thinks this
  means: "I can offload IP checksumming for myself and for VLAN upper
  interfaces (I can recognize the IP header past the VLAN header)."
  Little does it know that DSA assumes this means it can also offload
  checksumming in the presence of switch tags.

So just stop inheriting NETIF_F_HW_CSUM and friends from
master->vlan_features, right?

Well, you can't help but wonder a bit how come it's 2022 and we could
still have an obvious omission like that? And at the same time: but why
does the mt7530 DSA driver work with the same DSA master, but not rtl8365mb?
The answer to both, I think, is "some DSA masters do understand a
particular DSA switch tag, particularly the one from the same vendor".
So if we stop inheriting the checksum offload bits from vlan_features,
we introduce a performance regression for those.

We should instead ask the DSA master "for this DSA tagging protocol,
what netdev features can DSA inherit"? Because of the variability per
tagging protocol, this probably needs to be done through a new netdev
operation, I don't know of any cleaner way.
The complicated part is that we'd need to correctly identify the pairs
of DSA master drivers and tagging protocols where some features can be
safely inherited. Then, it's not too clear whether we want this new ndo
to cover other functionality as well, or if netdev features are enough.

So the sad news for you is that this is pretty much "net-next" material,
even if it fixes what is essentially a design shortcoming. If we're
quick, we could start doing this right as net-next reopens, and that
would give other developers maximum opportunity to fix up the
performance regressions caused by lack of TX checksumming.

> > > Oh, this DSA driver still does not implement vlan nor bridge offload.
> > > Maybe it would matter.
> >
> > It doesn't matter. The vlan_features is a confusing name for what it
> > really does here. I'll explain in a bit once you clarify the other
> > things I asked for.
> 
> That is good news as we can deal with it independently. I wish to
> focus on that afterwards.
> 
> Regards,
> 
> Luiz
Andrew Lunn Jan. 22, 2022, 3:51 p.m. UTC | #22
> trap_port seems to be where the switch will send any packet captured
> from LAN ports. There are a couple of situations it will be used like:
> 1) untagged or unmatched vlan packets (if configured to do so)
> 2) some multicasting packets (Reserved Multicast Address), for some
> cases like capturing STP or LACP
> 3) IGMP and 802.1X EAPOL packets
> 4) Switch ACL rules that could match a packet and send it to the trap port.
> 
> In my early tests, I only saw some IGMP packets trapped to CPU. I also
> do not know how important they are.

STP is important for detecting loops in the ethernet traffic and
blocking ports. The linux software bridge will want to see these
packets.

IGMP will become important when you implement multicast support in the
switch. It will allow you to optimize the distribution of multicast to
only ports which have expressed an interest in receiving the group.

Currently we don't have any switch driver making use of 802.1x. It is
something which many switches have, but so far nobody has spent the
time to implement an interface to wpa_supplicant etc.

     Andrew
Luiz Angelo Daros de Luca Jan. 22, 2022, 8:12 p.m. UTC | #23
> > The internal SoC switch is behaving as an unmanaged switch, with no
> > vlans. It would be just extra overhead to have it working as a DSA
> > switch, specially
> > as those two switches tags are not compatible. I still have the
> > swconfig driver installed but I was only using it for some debugging
> > (checking metrics). I think that the state the bootloader leaves that
> > switch is enough to make it forward packets to the Realtek switch. In
> > device-tree conf, I'm directly using that eth0 as the CPU port.
>
> There could be value in managing the internal switch with DSA too, for
> example in a situation like this:
>
>  +-------------------------------------------------+
>  |  SoC                                            |
>  |                                                 |
>  |  +----------------+--------+---------------+    |
>  |  |                |        |               |    |
>  |  | Internal       |        |               |    |
>  |  |  switch        +--------+               |    |
>  |  | (dsa,member = <0 0>;)                   |    |
>  |  | +-------+ +-------+ +-------+ +-------+ |    |
>  |  | |       | |       | |       | |       | |    |
>  |  | | sw0p0 | | sw0p1 | | sw0p2 | | sw0p3 | |    |
>  |  | |       | |       | |       | |       | |    |
>  +--+-+-------+-+-------+-+-------+-+-------+-+----+
>
>  +----+--------+------------------+
>  |    |        |                  |
>  |    +--------+                  |
>  | External switch                |
>  | (dsa,member = <1 0>;)          |
>  |  +-------+ +-------+ +-------+ |
>  |  |       | |       | |       | |
>  |  | sw1p0 | | sw1p1 | | sw1p2 | |
>  |  |       | |       | |       | |
>  +--+-------+-+-------+-+-------+-+
>
> where you'd create a bridge spanning all of sw0p1, sw0p2, sw0p3, sw1p0,
> sw1p1, sw1p2. Forwarding between the internal and the external switch is
> done in software, and that deals with the "impedance matching" between
> the tagging protocols too - first the packet is stripped of the DSA tag
> of the ingress switch, then the DSA tag of the egress switch is added.
> With a transparent internal switch (no driver), ports sw0p1, sw0p2,
> sw0p3 are dead, since if you'd connect them to a PHY, they'd spit out
> DSA-tagged packets from the external switch.

Oh, any other internal switch ports are physically not in use.  Those
ports are 10/100. I think that in my device, some of its pins are even
used as GPIO.
And the offload issue will remain as the HW will not be able to
offload the second layer of DSA tag.

> > > I hate to guess, but since both you and Arınç have mentioned the
> > > mt7620a/mt7621 SoCs,
> >
> > Sorry for the incomplete answer. If it helps, this is my device
> > https://github.com/luizluca/openwrt/blob/tplink_c5v4_dsa/target/linux/ramips/dts/mt7620a_tplink_archer-c5-v4.dts
> >
> > I try to keep my remote branch updated, although it has some dirty changes:
> > https://github.com/luizluca/openwrt/tree/tplink_c5v4_dsa
> >
> > > I'd guess that the top-most DSA driver in both cases is "mediatek,eth-mac" (drivers/net/ethernet/mediatek/mtk_eth_soc.c).
> >
> > Not in my case. The driver I use also supports mt7621 but the upstream
> > driver skipped the mt7620a support.
> >
> > > If so, this would confirm my suspicions, since it sets its vlan_features
> > > to include NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM. Please confirm that
> > > master->vlan_features contains these 2 bits.
> >
> > Yes.
>
> Ok. See the discussion with Lino Sanfilippo here:
> https://lore.kernel.org/netdev/YPAzZXaC%2FEn3s4ly@lunn.ch/
> Basically, the moving parts of this mechanism are:
>
> - when the DSA master doesn't understand DSA tags, the transmit
>   checksums must be calculated in software.
>
> - this is already supported, we just need to make sure that the DSA
>   slave->features does not include any checksum offload bits
>   (NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, NETIF_F_IPV6_CSUM), otherwise that
>   will be delegated to the device driver. The place that checks that
>   condition and calculates the checksum in software is validate_xmit_skb() ->
>   skb_csum_hwoffload_help().
>
> - the checksum is evaluated on the skb before the DSA tag is even
>   inserted, and is preserved when DSA inserts the header, and is
>   therefore still correct by the time the skb reaches the DSA master
>   driver. A DSA-unaware master doesn't have to do anything for this
>   packet, the IP header checksum will still be correct despite the
>   hardware not recognizing the IP header.
>
> - the way DSA populates slave->features is by inheriting master->vlan_features
>   (vlan_features means "netdev features which are inheritable by VLAN
>   upper interfaces"). This directly makes or breaks what happens in
>   validate_xmit_skb() on a DSA slave interface.
>
> - the problem occurs when the DSA master puts checksum offload bits in
>   both dev->features and dev->vlan_features. The master thinks this
>   means: "I can offload IP checksumming for myself and for VLAN upper
>   interfaces (I can recognize the IP header past the VLAN header)."
>   Little does it know that DSA assumes this means it can also offload
>   checksumming in the presence of switch tags.
>
> So just stop inheriting NETIF_F_HW_CSUM and friends from
> master->vlan_features, right?
>
> Well, you can't help but wonder a bit how come it's 2022 and we could
> still have an obvious omission like that? And at the same time: but why
> does the mt7530 DSA driver work with the same DSA master, but not rtl8365mb?
> The answer to both, I think, is "some DSA masters do understand a
> particular DSA switch tag, particularly the one from the same vendor".
> So if we stop inheriting the checksum offload bits from vlan_features,
> we introduce a performance regression for those.
>
> We should instead ask the DSA master "for this DSA tagging protocol,
> what netdev features can DSA inherit"? Because of the variability per
> tagging protocol, this probably needs to be done through a new netdev
> operation, I don't know of any cleaner way.
> The complicated part is that we'd need to correctly identify the pairs
> of DSA master drivers and tagging protocols where some features can be
> safely inherited. Then, it's not too clear whether we want this new ndo
> to cover other functionality as well, or if netdev features are enough.

I'm new to DSA but I think that a solution like that might not scale
well. For every possible master network driver, it needs to know if
its offload feature will handle every different tag.
Imagining that both new offload HW and new switch tags will still
appear in the kernel, it might be unmanageable.

I know dsa properties are not the solution for everything (and I'm
still adapting to where that border is) but, in this case, it is a
device specific arrangement between the ethernet device and the
switch. Wouldn't it be better to allow whoever
writes the device-tree description to indicate whether a master feature
cannot be copied to slave devices?

I checked DSA doc again and it says:

"Since tagging protocols in category 1 and 2 break software (and most
often also hardware) packet dissection on the DSA master, features
such as RPS (Receive Packet Steering) on the DSA master would be
broken. The DSA framework deals with this by hooking into the flow
dissector and shifting the offset at which the IP header is to be
found in the tagged frame as seen by the DSA master. This behavior is
automatic based on the overhead value of the tagging protocol. If not
all packets are of equal size, the tagger can implement the
flow_dissect method of the struct dsa_device_ops and override this
default behavior by specifying the correct offset incurred by each
individual RX packet. Tail taggers do not cause issues to the flow
dissector."

It makes me think that it is the master network driver that does not
implement that IP header location shift. Anyway, I believe it also
depends on HW capabilities to inform that shift, right?

I'm trying to think as a DSA newbie (which is exactly what I am).
Differently from an isolated ethernet driver, with DSA, the system
does have control of "something" after the offload should be applied:
the dsa switch. Can't we have a generic way to send a packet to the
switch and make it bounce back to the CPU (passing through the offload
engine)? Would it work if I set the destination port as the CPU port?
This way, we could simply detect if the offload worked and disable
those features that did not work. It could work with a generic
implementation or, if needed, a specialized optional ds_switch_ops
function just to set up that temporary loopback forwarding rule.

> So the sad news for you is that this is pretty much "net-next" material,
> even if it fixes what is essentially a design shortcoming. If we're
> quick, we could start doing this right as net-next reopens, and that
> would give other developers maximum opportunity to fix up the
> performance regressions caused by lack of TX checksumming.

No problem. I'm already playing the long game. I'm just trying to fix
a device I own using my free time and I don't have any manager with
impossible deadlines.
However, any solution with a performance regression would break the
kernel API. I would rather add a new device-tree option :-)

> > > > Oh, this DSA driver still does not implement vlan nor bridge offload.
> > > > Maybe it would matter.
> > >
> > > It doesn't matter. The vlan_features is a confusing name for what it
> > > really does here. I'll explain in a bit once you clarify the other
> > > things I asked for.
> >
> > That is good news as we can deal with it independently. I wish to
> > focus on that afterwards.
> >
> > Regards,
> >
> > Luiz
Vladimir Oltean Jan. 24, 2022, 3:31 p.m. UTC | #24
On Sat, Jan 22, 2022 at 05:12:28PM -0300, Luiz Angelo Daros de Luca wrote:
> I'm new to DSA but I think that a solution like that might not scale
> well. For every possible master network driver, it needs to know if
> its offload feature will handle every different tag.

Correct, with the sensible default being that no checksum offloading in
the presence of DSA tags is supported.

> Imagining that both new offload HW and new switch tags will still
> appear in the kernel, it might be untreatable.

You don't see DSA masters understanding DSA tagging protocols every day,
I think you're overstating this. We'd have to cover Marvell-on-Marvell,
Broadcom-on-Broadcom, Mediatek-on-Mediatek, and the rest will have to
add their support when they add the hardware.

> I know dsa properties are not the solution for everything (and I'm
> still adapting to where that border is) but, in this case, it is a
> device specific arrangement between the ethernet device and the
> switch. Wouldn't it be better to allow the
> one writing the device-tree description inform if a master feature
> cannot be copied to slave devices?

Assuming an ultra-generic Ethernet controller with advanced soft parser
capabilities, you'd have to teach it the format of each DSA tagging
protocol you intend it to understand anyway, so this doesn't appear the
kind of thing best described in the device tree, since it may easily be
out of sync with what the driver is able to tell the hardware to do.

> 
> I checked DSA doc again and it says:
> 
> "Since tagging protocols in category 1 and 2 break software (and most
> often also hardware) packet dissection on the DSA master, features
> such as RPS (Receive Packet Steering) on the DSA master would be
> broken. The DSA framework deals with this by hooking into the flow
> dissector and shifting the offset at which the IP header is to be
> found in the tagged frame as seen by the DSA master. This behavior is
> automatic based on the overhead value of the tagging protocol. If not
> all packets are of equal size, the tagger can implement the
> flow_dissect method of the struct dsa_device_ops and override this
> default behavior by specifying the correct offset incurred by each
> individual RX packet. Tail taggers do not cause issues to the flow
> dissector."
> 
> It makes me think that it is the master network driver that does not
> implement that IP header location shift. Anyway, I believe it also
> depends on HW capabilities to inform that shift, right?
> 
> I'm trying to think as a DSA newbie (which is exactly what I am).
> Differently from an isolated ethernet driver, with DSA, the system
> does have control of "something" after the offload should be applied:
> the dsa switch. Can't we have a generic way to send a packet to the
> switch and make it bounce back to the CPU (passing through the offload
> engine)? Would it work if I set the destination port as the CPU port?
> This way, we could simply detect if the offload worked and disable
> those features that did not work. It could work with a generic
> implementation or, if needed, a specialized optional ds_switch_ops
> function just to setup that temporary lookback forwarding rule.

To be clear, do you consider this simpler than an ndo operation that
returns true or false if a certain DSA master can offload a certain
netdev feature in the presence of a certain DSA tag?

Ignoring the fact that there are subtly different ways in which various
hardware manufacturers implement packet injection from the CPU (and this
is reflected in the various struct dsa_device_ops :: xmit operation),
plus the fact that dsa_device_ops :: xmit takes a slave net_device as
argument, for which there is none to represent the CPU port. These
points mean that you'd need to implement a separate, hardware-specific
loopback xmit for each tagging protocol. But again, ignoring that for a
second.

When would be a good time to probe for DSA master features? The DSA
master might be down when DSA switches probe. What should we do with
packets sent on a DSA port until we've finished probing for DSA master
capabilities?

> > So the sad news for you is that this is pretty much "net-next" material,
> > even if it fixes what is essentially a design shortcoming. If we're
> > quick, we could start doing this right as net-next reopens, and that
> > would give other developers maximum opportunity to fix up the
> > performance regressions caused by lack of TX checksumming.
> 
> No problem. I'm already playing the long game. I'm just trying to fix
> a device I own using my free time and I don't have any manager with
> impossible deadlines.
> However, any solution with a performance regression would break the
> kernel API. I would rather add a new device-tree option :-)

Feel free to do whatever you want in OpenWRT, but as a general rule of
thumb, if something can be solved without involving the device tree,
then involving the device tree is probably the wrong approach.
Jakub Kicinski Jan. 24, 2022, 4:46 p.m. UTC | #25
On Mon, 24 Jan 2022 17:31:47 +0200 Vladimir Oltean wrote:
> > I checked DSA doc again and it says:
> > 
> > "Since tagging protocols in category 1 and 2 break software (and most
> > often also hardware) packet dissection on the DSA master, features
> > such as RPS (Receive Packet Steering) on the DSA master would be
> > broken. The DSA framework deals with this by hooking into the flow
> > dissector and shifting the offset at which the IP header is to be
> > found in the tagged frame as seen by the DSA master. This behavior is
> > automatic based on the overhead value of the tagging protocol. If not
> > all packets are of equal size, the tagger can implement the
> > flow_dissect method of the struct dsa_device_ops and override this
> > default behavior by specifying the correct offset incurred by each
> > individual RX packet. Tail taggers do not cause issues to the flow
> > dissector."
> > 
> > It makes me think that it is the master network driver that does not
> > implement that IP header location shift. Anyway, I believe it also
> > depends on HW capabilities to inform that shift, right?
> > 
> > I'm trying to think as a DSA newbie (which is exactly what I am).
> > Differently from an isolated ethernet driver, with DSA, the system
> > does have control of "something" after the offload should be applied:
> > the dsa switch. Can't we have a generic way to send a packet to the
> > switch and make it bounce back to the CPU (passing through the offload
> > engine)? Would it work if I set the destination port as the CPU port?
> > This way, we could simply detect if the offload worked and disable
> > those features that did not work. It could work with a generic
> > implementation or, if needed, a specialized optional ds_switch_ops
> > function just to setup that temporary lookback forwarding rule.  
> 
> To be clear, do you consider this simpler than an ndo operation that
> returns true or false if a certain DSA master can offload a certain
> netdev feature in the presence of a certain DSA tag?
> 
> Ignoring the fact that there are subtly different ways in which various
> hardware manufacturers implement packet injection from the CPU (and this
> is reflected in the various struct dsa_device_ops :: xmit operation),
> plus the fact that dsa_device_ops :: xmit takes a slave net_device as
> argument, for which there is none to represent the CPU port. These
> points mean that you'd need to implement a separate, hardware-specific
> loopback xmit for each tagging protocol. But again, ignoring that for a
> second.
> 
> When would be a good time to probe for DSA master features? The DSA
> master might be down when DSA switches probe. What should we do with
> packets sent on a DSA port until we've finished probing for DSA master
> capabilities?

I thought that for drivers setting the legacy NETIF_F_IP*_CSUM feature
it's the driver's responsibility to validate that the geometry of the
packet will work with the parser the device has. Or at least I think that's
what Tom was pushing for when he was cleaning up the checksumming last
(and wrote the long comment on the subject in skbuff.h).
Vladimir Oltean Jan. 24, 2022, 4:55 p.m. UTC | #26
On Mon, Jan 24, 2022 at 08:46:49AM -0800, Jakub Kicinski wrote:
> I thought for drivers setting the legacy NETIF_F_IP*_CSUM feature
> it's driver's responsibility to validate the geometry of the packet
> will work with the parser the device has. Or at least I think that's
> what Tom was pushing for when he was cleaning up the checksumming last
> (and wrote the long comment on the subject in skbuff.h).

Sorry Jakub, I don't understand what you mean to say when applied to the
context discussed here?
Florian Fainelli Jan. 24, 2022, 5:01 p.m. UTC | #27
On 1/24/2022 8:55 AM, Vladimir Oltean wrote:
> On Mon, Jan 24, 2022 at 08:46:49AM -0800, Jakub Kicinski wrote:
>> I thought for drivers setting the legacy NETIF_F_IP*_CSUM feature
>> it's driver's responsibility to validate the geometry of the packet
>> will work with the parser the device has. Or at least I think that's
>> what Tom was pushing for when he was cleaning up the checksumming last
>> (and wrote the long comment on the subject in skbuff.h).
> 
> Sorry Jakub, I don't understand what you mean to say when applied to the
> context discussed here?

I believe what Jakub meant to say is that if a DSA conduit device driver 
advertises any of the NETIF_F_IP*_CSUM feature bits, then the driver's 
transmit path has the responsibility of checking that the payload being 
transmitted has a chance of being checksummed properly by the hardware. 
The problem here is not so much the geometry itself (linear or not, 
number/size of fragments, etc.) as much as the placement of the L2/L3 
headers usually.

DSA conduit network device drivers do not have the ability today to 
determine what type of DSA tagging is being applied onto the DSA master 
but they do know whether DSA tagging is in use or not which may be 
enough to be overly compatible.

It is not clear to me whether we can solve this generically within the 
DSA framework or even if this is desirable, but once we have identified 
a problematic association of DSA tagger and DSA conduit, we can always 
have the DSA conduit driver do something like:

if (netdev_uses_dsa(dev))
	skb_checksum_help()

or have a fix_features callback which does reject the enabling of 
NETIF_F_IP*_CSUM if netdev_uses_dsa() becomes true.
Vladimir Oltean Jan. 24, 2022, 5:21 p.m. UTC | #28
On Mon, Jan 24, 2022 at 09:01:20AM -0800, Florian Fainelli wrote:
> On 1/24/2022 8:55 AM, Vladimir Oltean wrote:
> > On Mon, Jan 24, 2022 at 08:46:49AM -0800, Jakub Kicinski wrote:
> > > I thought for drivers setting the legacy NETIF_F_IP*_CSUM feature
> > > it's driver's responsibility to validate the geometry of the packet
> > > will work with the parser the device has. Or at least I think that's
> > > what Tom was pushing for when he was cleaning up the checksumming last
> > > (and wrote the long comment on the subject in skbuff.h).
> > 
> > Sorry Jakub, I don't understand what you mean to say when applied to the
> > context discussed here?
> 
> I believe what Jakub meant to say is that if a DSA conduit device driver
> advertises any of the NETIF_F_IP*_CSUM feature bits, then the driver's
> transmit path has the responsibility of checking that the payload being
> transmitted has a chance of being checksummed properly by the hardware. The
> problem here is not so much the geometry itself (linear or not, number/size
> of fragments, etc.) as much as the placement of the L2/L3 headers usually.
> 
> DSA conduit network device drivers do not have the ability today to
> determine what type of DSA tagging is being applied onto the DSA master but
> they do know whether DSA tagging is in use or not which may be enough to be
> overly compatible.
> 
> It is not clear to me whether we can solve this generically within the DSA
> framework or even if this is desirable, but once we have identified a
> problematic association of DSA tagger and DSA conduit, we can always have
> the DSA conduit driver do something like:
> 
> if (netdev_uses_dsa(dev))
> 	skb_checksum_help()
> 
> or have a fix_features callback which does reject the enabling of
> NETIF_F_IP*_CSUM if netdev_uses_dsa() becomes true.

Yes, but as you point out, the DSA master driver doesn't know what
header/trailer format it's dealing with. We could use netdev_uses_dsa()
as a very rough approximation, and that might work when we know that the
particular Ethernet controller is used only in conjunction with a single
type of DSA switch [from the same vendor], but I think we're just
delaying the inevitable, which is to treat the case where an Ethernet
controller can be a DSA master for more than one switch type, and it
understands some protocols but not others.
Also, scattering "if (netdev_uses_dsa(dev)) skb_checksum_help()" in
DSA-unaware drivers (the common case) seems like the improper approach.
We might end up seeing this pattern quite a lot, so DSA-unaware drivers
won't be DSA-unaware any longer.
It's still possible I'm misunderstanding something...
Florian Fainelli Jan. 24, 2022, 5:30 p.m. UTC | #29
On 1/24/2022 9:21 AM, Vladimir Oltean wrote:
> On Mon, Jan 24, 2022 at 09:01:20AM -0800, Florian Fainelli wrote:
>> On 1/24/2022 8:55 AM, Vladimir Oltean wrote:
>>> On Mon, Jan 24, 2022 at 08:46:49AM -0800, Jakub Kicinski wrote:
>>>> I thought for drivers setting the legacy NETIF_F_IP*_CSUM feature
>>>> it's driver's responsibility to validate the geometry of the packet
>>>> will work with the parser the device has. Or at least I think that's
>>>> what Tom was pushing for when he was cleaning up the checksumming last
>>>> (and wrote the long comment on the subject in skbuff.h).
>>>
>>> Sorry Jakub, I don't understand what you mean to say when applied to the
>>> context discussed here?
>>
>> I believe what Jakub meant to say is that if a DSA conduit device driver
>> advertises any of the NETIF_F_IP*_CSUM feature bits, then the driver's
>> transmit path has the responsibility of checking that the payload being
>> transmitted has a chance of being checksummed properly by the hardware. The
>> problem here is not so much the geometry itself (linear or not, number/size
>> of fragments, etc.) as much as the placement of the L2/L3 headers usually.
>>
>> DSA conduit network device drivers do not have the ability today to
>> determine what type of DSA tagging is being applied onto the DSA master but
>> they do know whether DSA tagging is in use or not which may be enough to be
>> overly compatible.
>>
>> It is not clear to me whether we can solve this generically within the DSA
>> framework or even if this is desirable, but once we have identified a
>> problematic association of DSA tagger and DSA conduit, we can always have
>> the DSA conduit driver do something like:
>>
>> if (netdev_uses_dsa(dev))
>> 	skb_checksum_help()
>>
>> or have a fix_features callback which does reject the enabling of
>> NETIF_F_IP*_CSUM if netdev_uses_dsa() becomes true.
> 
> Yes, but as you point out, the DSA master driver doesn't know what
> header/trailer format it's dealing with. We could use netdev_uses_dsa()
> as a very rough approximation, and that might work when we know that the
> particular Ethernet controller is used only in conjunction with a single
> type of DSA switch [from the same vendor], but I think we're just
> delaying the inevitable, which is to treat the case where an Ethernet
> controller can be a DSA master for more than one switch type, and it
> understands some protocols but not others.
> Also, scattering "if (netdev_uses_dsa(dev)) skb_checksum_help()" in
> DSA-unaware drivers (the common case) seems like the improper approach.
> We might end up seeing this pattern quite a lot, so DSA-unaware drivers
> won't be DSA-unaware any longer.
> It's still possible I'm misunderstanding something...

I don't think you are, and my crude proposal was just so we have it 
working, and then we can think about having it work fast.

A long time ago (but in the same galaxy) DSA used to set skb->protocol to 
the value of the DSA tagging protocol used (say ETH_P_EDSA), long before 
they were all consolidated within ETH_P_XDSA, but this would be breaking 
any checksum setting up that looks at skb->protocol to determine if it 
is IP, IPv6 or else, so in a way it might have done what we wanted it to 
do, but this was mostly by accident.

The tagger on transmit can definitively tell us via out-of-band 
signaling what type of tagging protocol is being used and where it is 
located within the packet if necessary, and I suppose we can then update 
the DSA conduit in order to not ask the HW to checksum if this is deemed 
problematic. Doing that for every single packet transmitted however may 
not be very efficient given that usually we set-up one tagging protocol, 
then we set-up another one (possibly), but it won't change on a packet 
by packet basis. So maybe what we need to do is at the time we "connect" 
the tagger we inform the DSA master that from there on, all that is 
supposed to go through that interface will look that way, along with a 
description of the tagger offset and length?
Jakub Kicinski Jan. 24, 2022, 5:35 p.m. UTC | #30
On Mon, 24 Jan 2022 19:21:58 +0200 Vladimir Oltean wrote:
> On Mon, Jan 24, 2022 at 09:01:20AM -0800, Florian Fainelli wrote:
> > On 1/24/2022 8:55 AM, Vladimir Oltean wrote:  
> > > Sorry Jakub, I don't understand what you mean to say when applied to the
> > > context discussed here?  
> > 
> > I believe what Jakub meant to say is that if a DSA conduit device driver
> > advertises any of the NETIF_F_IP*_CSUM feature bits, then the driver's
> > transmit path has the responsibility of checking that the payload being
> > transmitted has a chance of being checksummed properly by the hardware. The
> > problem here is not so much the geometry itself (linear or not, number/size
> > of fragments, etc.) as much as the placement of the L2/L3 headers usually.

Sorry I used "geometry" loosely.

What I meant is simply that if the driver uses NETIF_F_IP*_CSUM 
it should parse the packet before it hands it off to the HW.

There is an infinity of protocols users can come up with, while the device
parser is very much finite, so it's only practical to check compliance
with the HW parser in the driver. The reverse approach of adding
per-protocol caps is a dead end IMO. And we should not bloat the stack
when NETIF_F_HW_CSUM exists and the memo that parsing packets on Tx is
bad b/c of protocol ossification went out a decade ago.

> > DSA conduit network device drivers do not have the ability today to
> > determine what type of DSA tagging is being applied onto the DSA master but
> > they do know whether DSA tagging is in use or not which may be enough to be
> > overly compatible.
> > 
> > It is not clear to me whether we can solve this generically within the DSA
> > framework or even if this is desirable, but once we have identified a
> > problematic association of DSA tagger and DSA conduit, we can always have
> > the DSA conduit driver do something like:
> > 
> > if (netdev_uses_dsa(dev))
> > 	skb_checksum_help()
> > 
> > or have a fix_features callback which does reject the enabling of
> > NETIF_F_IP*_CSUM if netdev_uses_dsa() becomes true.  
> 
> Yes, but as you point out, the DSA master driver doesn't know what
> header/trailer format it's dealing with. We could use netdev_uses_dsa()
> as a very rough approximation, and that might work when we know that the
> particular Ethernet controller is used only in conjunction with a single
> type of DSA switch [from the same vendor], but I think we're just
> delaying the inevitable, which is to treat the case where an Ethernet
> controller can be a DSA master for more than one switch type, and it
> understands some protocols but not others.
> Also, scattering "if (netdev_uses_dsa(dev)) skb_checksum_help()" in
> DSA-unaware drivers (the common case) seems like the improper approach.
> We might end up seeing this pattern quite a lot, so DSA-unaware drivers
> won't be DSA-unaware any longer.

It's not about DSA. The driver should not check

if (dsa())
	blah;

it should check 

if (!(eth [-> vlan] -> ip -> tcp/udp))
	csum_help();

> It's still possible I'm misunderstanding something...
Jakub Kicinski Jan. 24, 2022, 6:20 p.m. UTC | #31
On Mon, 24 Jan 2022 09:35:56 -0800 Jakub Kicinski wrote:
> Sorry I used "geometry" loosely.
> 
> What I meant is simply that if the driver uses NETIF_F_IP*_CSUM 
> it should parse the packet before it hands it off to the HW.
> 
> There is infinity of protocols users can come up with, while the device
> parser is very much finite, so it's only practical to check compliance
> with the HW parser in the driver. The reverse approach of adding
> per-protocol caps is a dead end IMO. And we should not bloat the stack
> when NETIF_F_HW_CSUM exists and the memo that parsing packets on Tx is
> bad b/c of protocol ossification went out a decade ago.

> It's not about DSA. The driver should not check
> 
> if (dsa())
> 	blah;
> 
> it should check 
> 
> if (!(eth [-> vlan] -> ip -> tcp/udp))
> 	csum_help();

Admittedly on a quick look thru the drivers which already do this 
I only see L3, L4 and GRE/UDP encap checks. Nothing validates L2.
Vladimir Oltean Jan. 24, 2022, 7:08 p.m. UTC | #32
On Mon, Jan 24, 2022 at 10:20:51AM -0800, Jakub Kicinski wrote:
> On Mon, 24 Jan 2022 09:35:56 -0800 Jakub Kicinski wrote:
> > Sorry I used "geometry" loosely.
> >
> > What I meant is simply that if the driver uses NETIF_F_IP*_CSUM
> > it should parse the packet before it hands it off to the HW.
> >
> > There is infinity of protocols users can come up with, while the device
> > parser is very much finite, so it's only practical to check compliance
> > with the HW parser in the driver. The reverse approach of adding
> > per-protocol caps is a dead end IMO. And we should not bloat the stack
> > when NETIF_F_HW_CSUM exists and the memo that parsing packets on Tx is
> > bad b/c of protocol ossification went out a decade ago.
>
> > It's not about DSA. The driver should not check
> >
> > if (dsa())
> > 	blah;
> >
> > it should check
> >
> > if (!(eth [-> vlan] -> ip -> tcp/udp))
> > 	csum_help();
>
> Admittedly on a quick look thru the drivers which already do this
> I only see L3, L4 and GRE/UDP encap checks. Nothing validates L2.

So before we declare that any given Ethernet driver is buggy for declaring
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and not checking that skb->csum_start
points where it expects it to (taking into consideration potential VLAN
headers, IPv6 extension headers), is there any driver that _does_
perform these checks correctly, that could be used as an example?
Jakub Kicinski Jan. 24, 2022, 7:38 p.m. UTC | #33
On Mon, 24 Jan 2022 21:08:45 +0200 Vladimir Oltean wrote:
> On Mon, Jan 24, 2022 at 10:20:51AM -0800, Jakub Kicinski wrote:
> > On Mon, 24 Jan 2022 09:35:56 -0800 Jakub Kicinski wrote:  
> > > Sorry I used "geometry" loosely.
> > >
> > > What I meant is simply that if the driver uses NETIF_F_IP*_CSUM
> > > it should parse the packet before it hands it off to the HW.
> > >
> > > There is infinity of protocols users can come up with, while the device
> > > parser is very much finite, so it's only practical to check compliance
> > > with the HW parser in the driver. The reverse approach of adding
> > > per-protocol caps is a dead end IMO. And we should not bloat the stack
> > > when NETIF_F_HW_CSUM exists and the memo that parsing packets on Tx is
> > > bad b/c of protocol ossification went out a decade ago.  
> >  
> > > It's not about DSA. The driver should not check
> > >
> > > if (dsa())
> > > 	blah;
> > >
> > > it should check
> > >
> > > if (!(eth [-> vlan] -> ip -> tcp/udp))
> > > 	csum_help();  
> >
> > Admittedly on a quick look thru the drivers which already do this
> > I only see L3, L4 and GRE/UDP encap checks. Nothing validates L2.  
> 
> So before we declare that any given Ethernet driver is buggy for declaring
> NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and not checking that skb->csum_start
> points where it expects it to (taking into consideration potential VLAN
> headers, IPv6 extension headers), 

Extension headers are explicitly not supported by NETIF_F_IPV6_CSUM.

IIRC Tom's hope was to delete NETIF_F_IP*_CSUM completely once all
drivers are converted to parsing and therefore can use NETIF_F_HW_CSUM.

> is there any driver that _does_ perform these checks correctly, that
> could be used as an example?

I don't think so. Let me put it this way - my understanding is that up
until now we had been using the vlan_features, mpls_features etc to
perform L2/L2.5/below-IP feature stripping. This scales poorly to DSA
tags, as discussed in this thread.

I'm suggesting we extend the kind of checking we already do to work
around inevitable deficiencies of device parsers for tunnels to DSA
tags.

We can come up with various schemes of expressing capabilities
between underlying driver and tag driver. I'm not aware of similar
out-of-band schemes existing today so it'd be "DSA doing its own
thing", which does not seem great.
Vladimir Oltean Jan. 24, 2022, 8:56 p.m. UTC | #34
On Mon, Jan 24, 2022 at 11:38:12AM -0800, Jakub Kicinski wrote:
> On Mon, 24 Jan 2022 21:08:45 +0200 Vladimir Oltean wrote:
> > On Mon, Jan 24, 2022 at 10:20:51AM -0800, Jakub Kicinski wrote:
> > > On Mon, 24 Jan 2022 09:35:56 -0800 Jakub Kicinski wrote:
> > > > Sorry I used "geometry" loosely.
> > > >
> > > > What I meant is simply that if the driver uses NETIF_F_IP*_CSUM
> > > > it should parse the packet before it hands it off to the HW.
> > > >
> > > > There is infinity of protocols users can come up with, while the device
> > > > parser is very much finite, so it's only practical to check compliance
> > > > with the HW parser in the driver. The reverse approach of adding
> > > > per-protocol caps is a dead end IMO. And we should not bloat the stack
> > > > when NETIF_F_HW_CSUM exists and the memo that parsing packets on Tx is
> > > > bad b/c of protocol ossification went out a decade ago.
> > >
> > > > It's not about DSA. The driver should not check
> > > >
> > > > if (dsa())
> > > > 	blah;
> > > >
> > > > it should check
> > > >
> > > > if (!(eth [-> vlan] -> ip -> tcp/udp))
> > > > 	csum_help();
> > >
> > > Admittedly on a quick look thru the drivers which already do this
> > > I only see L3, L4 and GRE/UDP encap checks. Nothing validates L2.
> >
> > So before we declare that any given Ethernet driver is buggy for declaring
> > NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and not checking that skb->csum_start
> > points where it expects it to (taking into consideration potential VLAN
> > headers, IPv6 extension headers),
>
> Extension headers are explicitly not supported by NETIF_F_IPV6_CSUM.
>
> IIRC Tom's hope was to delete NETIF_F_IP*_CSUM completely once all
> drivers are converted to parsing and therefore can use NETIF_F_HW_CSUM.

IIUC, NETIF_F_IP*_CSUM vs NETIF_F_HW_CSUM doesn't make that big of a
difference in terms of what the driver should check for, if the hardware
checksum offload engine can't directly be given the csum_start and
csum_offset, wherever they may be.

> > is there any driver that _does_ perform these checks correctly, that
> > could be used as an example?
>
> I don't think so. Let me put it this way - my understanding is that up
> until now we had been using the vlan_features, mpls_features etc to
> perform L2/L2.5/below-IP feature stripping. This scales poorly to DSA
> tags, as discussed in this thread.
>
> I'm suggesting we extend the kind of checking we already do to work
> around inevitable deficiencies of device parsers for tunnels to DSA
> tags.

Sorry, I'm very tired and I probably don't understand what you're
saying, so excuse the extra clarification questions.

The typical protocol checking that drivers with NETIF_F_HW_CSUM do seems
to be based on vlan_get_protocol()/skb->protocol/skb_network_header()/
skb_transport_header() values, all of which make DSA invisible. So they
don't work if the underlying hardware really doesn't like seeing an
unexpected DSA header.

When you say "I'm suggesting we extend the kind of checking we already do",
do you mean we should modify the likes of e1000e and igb such that, if
they're ever used as DSA masters, they do a full header parse of the
packet (struct ethhdr :: h_proto, check if VLAN, struct iphdr/ipv6hdr,
etc.) instead of the current logic? It will be pretty convoluted unless
we have some helper. Because if I follow through, for a DSA-tagged IP
packet on xmit, skb->protocol is certainly htons(ETH_P_IP):

ntohs(skb->protocol) = 0x800, csum_offset = 16, csum_start = 280, skb_checksum_start_offset = 54, skb->network_header = 260, skb_network_header_len = 20

skb_dump output:
skb len=94 headroom=226 headlen=94 tailroom=384
mac=(226,34) net=(260,20) trans=280
shinfo(txflags=0 nr_frags=0 gso(size=0 type=1 segs=1))
csum(0x100118 ip_summed=3 complete_sw=0 valid=0 level=0)
hash(0x7710ee84 sw=0 l4=1) proto=0x0800 pkttype=0 iif=0
dev name=eno2 feat=0x00020100001149a9
sk family=2 type=1 proto=6
skb headroom: 00000000: 6c 00 03 02 64 65 76 00 fe ed ca fe 28 00 00 00
...(junk)...
skb headroom: 000000e0: 5f 43
                        20 byte DSA tag
                        |
                        v
skb linear:   00000000: 88 80 00 0a 80 00 00 00 00 00 00 00 08 00 30 00
                                    skb_mac_header()
                                    |
                                    v
skb linear:   00000010: 00 00 00 00 68 05 ca 92 af 20 00 04 9f 05 f6 28
                              skb_network_header()
                              |
                              v
skb linear:   00000020: 08 00 45 00 00 3c 26 47 40 00 40 06 00 49 0a 00
                                          skb_checksum_start_offset
                                          |
                                          |                       csum_offset
                                          v                       v
skb linear:   00000030: 00 2c 0a 00 00 01 b6 08 14 51 11 1f 91 4f 00 00
skb linear:   00000040: 00 00 a0 02 fa f0 14 5b 00 00 02 04 05 b4 04 02
skb linear:   00000050: 08 0a 2e 00 e5 b8 00 00 00 00 01 03 03 07

I don't know, I just don't expect that non-DSA users of those drivers
will be very happy about such changes. Do these existing protocol
checking schemes qualify as buggy?

If this is the convention that we want to enforce, then I can't really
help Luiz with fixing the OpenWRT mtk_eth_soc.c - he'll have to figure
out a way to parse the packets for which his hardware will accept the
checksumming offload, and call skb_checksum_help() otherwise.

> We can come up with various schemes of expressing capabilities
> between underlying driver and tag driver. I'm not aware of similar
> out-of-band schemes existing today so it'd be "DSA doing it's own
> thing", which does not seem great.

It at least seems less complex to me, and less checking in the fast path
if I understand everything that's been said correctly.
Jakub Kicinski Jan. 24, 2022, 9:42 p.m. UTC | #35
On Mon, 24 Jan 2022 22:56:07 +0200 Vladimir Oltean wrote:
> On Mon, Jan 24, 2022 at 11:38:12AM -0800, Jakub Kicinski wrote:
> > On Mon, 24 Jan 2022 21:08:45 +0200 Vladimir Oltean wrote:  
> > > So before we declare that any given Ethernet driver is buggy for declaring
> > > NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and not checking that skb->csum_start
> > > points where it expects it to (taking into consideration potential VLAN
> > > headers, IPv6 extension headers),  
> >
> > Extension headers are explicitly not supported by NETIF_F_IPV6_CSUM.
> >
> > IIRC Tom's hope was to delete NETIF_F_IP*_CSUM completely once all
> > drivers are converted to parsing and therefore can use NETIF_F_HW_CSUM.  
> 
> IIUC, NETIF_F_IP*_CSUM vs NETIF_F_HW_CSUM doesn't make that big of a
> difference in terms of what the driver should check for, if the hardware
> checksum offload engine can't directly be given the csum_start and
> csum_offset, wherever they may be.
> 
> > > is there any driver that _does_ perform these checks correctly, that
> > > could be used as an example?  
> >
> > I don't think so. Let me put it this way - my understanding is that up
> > until now we had been using the vlan_features, mpls_features etc to
> > perform L2/L2.5/below-IP feature stripping. This scales poorly to DSA
> > tags, as discussed in this thread.
> >
> > I'm suggesting we extend the kind of checking we already do to work
> > around inevitable deficiencies of device parsers for tunnels to DSA
> > tags.  
> 
> Sorry, I'm very tired and I probably don't understand what you're
> saying, so excuse the extra clarification questions.
> 
> The typical protocol checking that drivers with NETIF_F_HW_CSUM do seems
> to be based on vlan_get_protocol()/skb->protocol/skb_network_header()/
> skb_transport_header() values, all of which make DSA invisible. So they
> don't work if the underlying hardware really doesn't like seeing an
> unexpected DSA header.
> 
> When you say "I'm suggesting we extend the kind of checking we already do",
> do you mean we should modify the likes of e1000e and igb such that, if
> they're ever used as DSA masters, they do a full header parse of the
> packet (struct ethhdr :: h_proto, check if VLAN, struct iphdr/ipv6hdr,
> etc.) instead of the current logic?

That was my thinking, yes. The exact amount of work depends on the
driver, I believe that more recent Intel parts (igb, ixgbe and newer)
pass a L3 offset to the HW. They treat L2 as opaque, ergo no patches
needed. At a glance e1000e passes the full skb_checksum_start_offset()
to HW, so likely even better. 

It's only drivers for devices which actually want to parse the Ethertype
that would need extra checks. (Coincidentally such devices can't support
MPLS given the lack of L3 indication in the frame.)

> It will be pretty convoluted unless
> we have some helper. Because if I follow through, for a DSA-tagged IP
> packet on xmit, skb->protocol is certainly htons(ETH_P_IP):
> 
> ntohs(skb->protocol) = 0x800, csum_offset = 16, csum_start = 280, skb_checksum_start_offset = 54, skb->network_header = 260, skb_network_header_len = 20
> 
> skb_dump output:
> skb len=94 headroom=226 headlen=94 tailroom=384
> mac=(226,34) net=(260,20) trans=280
> shinfo(txflags=0 nr_frags=0 gso(size=0 type=1 segs=1))
> csum(0x100118 ip_summed=3 complete_sw=0 valid=0 level=0)
> hash(0x7710ee84 sw=0 l4=1) proto=0x0800 pkttype=0 iif=0
> dev name=eno2 feat=0x00020100001149a9
> sk family=2 type=1 proto=6
> skb headroom: 00000000: 6c 00 03 02 64 65 76 00 fe ed ca fe 28 00 00 00
> ...(junk)...
> skb headroom: 000000e0: 5f 43
>                         20 byte DSA tag
>                         |
>                         v
> skb linear:   00000000: 88 80 00 0a 80 00 00 00 00 00 00 00 08 00 30 00
>                                     skb_mac_header()
>                                     |
>                                     v
> skb linear:   00000010: 00 00 00 00 68 05 ca 92 af 20 00 04 9f 05 f6 28
>                               skb_network_header()
>                               |
>                               v
> skb linear:   00000020: 08 00 45 00 00 3c 26 47 40 00 40 06 00 49 0a 00
>                                           skb_checksum_start_offset
>                                           |
>                                           |                       csum_offset
>                                           v                       v
> skb linear:   00000030: 00 2c 0a 00 00 01 b6 08 14 51 11 1f 91 4f 00 00
> skb linear:   00000040: 00 00 a0 02 fa f0 14 5b 00 00 02 04 05 b4 04 02
> skb linear:   00000050: 08 0a 2e 00 e5 b8 00 00 00 00 01 03 03 07

Oof, so in this case the DSA tag is _before_ the skb_mac_header()?
Or the prepend is supposed to be parsable as an Ethernet header?
Seems like any device that can do csum over this packet must already 
use L3/L4 offsets or have explicit knowledge of DSA, right?

> I don't know, I just don't expect that non-DSA users of those drivers
> will be very happy about such changes. Do these existing protocol
> checking schemes qualify as buggy?

Unfortunate reality of the checksum offloads is that most drivers for
devices which parse on Tx are buggy, it's more of a question of whether
anyone tried to use an unsupported protocol stack :( Recent example
that comes to mind is 1698d600b361 ("bnxt_en: Implement
.ndo_features_check().").

> If this is the convention that we want to enforce, then I can't really
> help Luiz with fixing the OpenWRT mtk_eth_soc.c - he'll have to figure
> out a way to parse the packets for which his hardware will accept the
> checksumming offload, and call skb_checksum_help() otherwise.
> 
> > We can come up with various schemes of expressing capabilities
> > between underlying driver and tag driver. I'm not aware of similar
> > out-of-band schemes existing today so it'd be "DSA doing it's own
> > thing", which does not seem great.  
> 
> It at least seems less complex to me, and less checking in the fast path
> if I understand everything that's been said correctly.

I understand, I'm primarily trying to share some context and prior work.
I don't mean to nack all other approaches.

I believe writing a parser matching the device behavior would be easier
for a driver author than interpreting the runes of our csum offload API
and getting thru the thicket of all the bits. If that's not the case my
argument is likely defeated.
Vladimir Oltean Jan. 24, 2022, 10:30 p.m. UTC | #36
On Mon, Jan 24, 2022 at 01:42:42PM -0800, Jakub Kicinski wrote:
> On Mon, 24 Jan 2022 22:56:07 +0200 Vladimir Oltean wrote:
> > On Mon, Jan 24, 2022 at 11:38:12AM -0800, Jakub Kicinski wrote:
> > > On Mon, 24 Jan 2022 21:08:45 +0200 Vladimir Oltean wrote:
> > > > So before we declare that any given Ethernet driver is buggy for declaring
> > > > NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and not checking that skb->csum_start
> > > > points where it expects it to (taking into consideration potential VLAN
> > > > headers, IPv6 extension headers),
> > >
> > > Extension headers are explicitly not supported by NETIF_F_IPV6_CSUM.
> > >
> > > IIRC Tom's hope was to delete NETIF_F_IP*_CSUM completely once all
> > > drivers are converted to parsing and therefore can use NETIF_F_HW_CSUM.
> >
> > IIUC, NETIF_F_IP*_CSUM vs NETIF_F_HW_CSUM doesn't make that big of a
> > difference in terms of what the driver should check for, if the hardware
> > checksum offload engine can't directly be given the csum_start and
> > csum_offset, wherever they may be.
> >
> > > > is there any driver that _does_ perform these checks correctly, that
> > > > could be used as an example?
> > >
> > > I don't think so. Let me put it this way - my understanding is that up
> > > until now we had been using the vlan_features, mpls_features etc to
> > > perform L2/L2.5/below-IP feature stripping. This scales poorly to DSA
> > > tags, as discussed in this thread.
> > >
> > > I'm suggesting we extend the kind of checking we already do to work
> > > around inevitable deficiencies of device parsers for tunnels to DSA
> > > tags.
> >
> > Sorry, I'm very tired and I probably don't understand what you're
> > saying, so excuse the extra clarification questions.
> >
> > The typical protocol checking that drivers with NETIF_F_HW_CSUM do seems
> > to be based on vlan_get_protocol()/skb->protocol/skb_network_header()/
> > skb_transport_header() values, all of which make DSA invisible. So they
> > don't work if the underlying hardware really doesn't like seeing an
> > unexpected DSA header.
> >
> > When you say "I'm suggesting we extend the kind of checking we already do",
> > do you mean we should modify the likes of e1000e and igb such that, if
> > they're ever used as DSA masters, they do a full header parse of the
> > packet (struct ethhdr :: h_proto, check if VLAN, struct iphdr/ipv6hdr,
> > etc.) instead of the current logic?
>
> That was my thinking, yes. The exact amount of work depends on the
> driver, I believe that more recent Intel parts (igb, ixgbe and newer)
> pass a L3 offset to the HW. They treat L2 as opaque, ergo no patches
> needed. At a glance e1000e passes the full skb_checksum_start_offset()
> to HW, so likely even better.

Ah, right, I missed that. I agree that a driver that uses
skb->csum_start is very likely to work unmodified with DSA headers
(not trailers). I didn't notice this in e1000 because I was just
searching for csum_start.

> It's only drivers for devices which actually want to parse the Ethertype
> that would need extra checks. (Coincidentally such devices can't support
> MPLS given the lack of L3 indication in the frame.)
>
> > It will be pretty convoluted unless
> > we have some helper. Because if I follow through, for a DSA-tagged IP
> > packet on xmit, skb->protocol is certainly htons(ETH_P_IP):
> >
> > ntohs(skb->protocol) = 0x800, csum_offset = 16, csum_start = 280, skb_checksum_start_offset = 54, skb->network_header = 260, skb_network_header_len = 20
> >
> > skb_dump output:
> > skb len=94 headroom=226 headlen=94 tailroom=384
> > mac=(226,34) net=(260,20) trans=280
> > shinfo(txflags=0 nr_frags=0 gso(size=0 type=1 segs=1))
> > csum(0x100118 ip_summed=3 complete_sw=0 valid=0 level=0)
> > hash(0x7710ee84 sw=0 l4=1) proto=0x0800 pkttype=0 iif=0
> > dev name=eno2 feat=0x00020100001149a9
> > sk family=2 type=1 proto=6
> > skb headroom: 00000000: 6c 00 03 02 64 65 76 00 fe ed ca fe 28 00 00 00
> > ...(junk)...
> > skb headroom: 000000e0: 5f 43
> >                         20 byte DSA tag
> >                         |
> >                         v
> > skb linear:   00000000: 88 80 00 0a 80 00 00 00 00 00 00 00 08 00 30 00
> >                                     skb_mac_header()
> >                                     |
> >                                     v
> > skb linear:   00000010: 00 00 00 00 68 05 ca 92 af 20 00 04 9f 05 f6 28
> >                               skb_network_header()
> >                               |
> >                               v
> > skb linear:   00000020: 08 00 45 00 00 3c 26 47 40 00 40 06 00 49 0a 00
> >                                           skb_checksum_start_offset
> >                                           |
> >                                           |                       csum_offset
> >                                           v                       v
> > skb linear:   00000030: 00 2c 0a 00 00 01 b6 08 14 51 11 1f 91 4f 00 00
> > skb linear:   00000040: 00 00 a0 02 fa f0 14 5b 00 00 02 04 05 b4 04 02
> > skb linear:   00000050: 08 0a 2e 00 e5 b8 00 00 00 00 01 03 03 07
>
> Oof, so in this case the DSA tag is _before_ the skb_mac_header()?
> Or the prepend is supposed to be parsable as a Ethernet header?
> Seems like any device that can do csum over this packet must already
> use L3/L4 offsets or have explicit knowledge of DSA, right?

I'm sorry, there's a mistake, skb_mac_header() points to the DSA tag
here (and skb_mac_header_len is 34), I wanted to say "real MAC header"
but failed to do so. This case shouldn't pose any special problems.

> > I don't know, I just don't expect that non-DSA users of those drivers
> > will be very happy about such changes. Do these existing protocol
> > checking schemes qualify as buggy?
>
> Unfortunate reality of the checksum offloads is that most drivers for
> devices which parse on Tx are buggy, it's more of a question of whether
> anyone tried to use an unsupported protocol stack :( Recent example
> that comes to mind is 1698d600b361 ("bnxt_en: Implement
> .ndo_features_check().").

Nice hook this ndo_features_check! In my reading of validate_xmit_skb()
I went right past it.

> > If this is the convention that we want to enforce, then I can't really
> > help Luiz with fixing the OpenWRT mtk_eth_soc.c - he'll have to figure
> > out a way to parse the packets for which his hardware will accept the
> > checksumming offload, and call skb_checksum_help() otherwise.
> >
> > > We can come up with various schemes of expressing capabilities
> > > between underlying driver and tag driver. I'm not aware of similar
> > > out-of-band schemes existing today so it'd be "DSA doing it's own
> > > thing", which does not seem great.
> >
> > It at least seems less complex to me, and less checking in the fast path
> > if I understand everything that's been said correctly.
>
> I understand, I'm primarily trying to share some context and prior work.
> I don't mean to nack all other approaches.
>
> I believe writing a parser matching the device behavior would be easier
> for a driver author than interpreting the runes of our csum offload API
> and getting thru the thicket of all the bits. If that's not the case my
> argument is likely defeated.

Ok, writing a parser might be needed if the DSA master is going, for
some reason, to support TX checksum offloading with some DSA headers but
not with others.

If that is not the case, and that Ethernet controller simply doesn't
support TX checksumming unless it's the plain old Ethernet {+ VLAN} +
IP/IPv6 + TCP/UDP, then this blanket patch below should fix the problem
almost elegantly, and parsing is a useless complication (warning, not
even compile-tested!):

-----------------------------[ cut here ]-----------------------------
From 5ef3d3cd8441d756933558212f518f48754c64d9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 25 Jan 2022 00:16:57 +0200
Subject: [PATCH] ramips: ethernet: ralink: disable TX checksumming on DSA
 masters

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 .../files/drivers/net/ethernet/ralink/mtk_eth_soc.c  | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
index e07e5ed5a8f8..6ed9bc5942fd 100644
--- a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
+++ b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
@@ -31,6 +31,7 @@
 #include <linux/io.h>
 #include <linux/bug.h>
 #include <linux/netfilter.h>
+#include <net/dsa.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <linux/of_gpio.h>
 #include <linux/gpio.h>
@@ -1497,6 +1498,16 @@ static int fe_change_mtu(struct net_device *dev, int new_mtu)
 	return fe_open(dev);
 }
 
+static netdev_features_t fe_features_check(struct sk_buff *skb,
+					   struct net_device *dev,
+					   netdev_features_t features)
+{
+	if (netdev_uses_dsa(dev))
+		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+
+	return features;
+}
+
 static const struct net_device_ops fe_netdev_ops = {
 	.ndo_init		= fe_init,
 	.ndo_uninit		= fe_uninit,
@@ -1514,6 +1525,7 @@ static const struct net_device_ops fe_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= fe_poll_controller,
 #endif
+	.ndo_features_check	= fe_features_check,
 };
 
 static void fe_reset_pending(struct fe_priv *priv)
-----------------------------[ cut here ]-----------------------------

This is essentially what Florian said ages ago, it just took me a very
long time to process. I guess what hadn't fully clicked in my head is
that the TX checksumming offload being functional is more a matter of
telling the hardware what are the L3 and L4 offsets, and the csum_offset,
rather than it requiring any particular understanding of the DSA header.
In turn, it means that for "nice" Ethernet controller implementations
where that is the case, it would be actively detrimental to add a new
.ndo_get_dsa_features() or something like that - because such a driver
would report that it supports all DSA header-type formats (trailers are
still broken as long as there isn't a csum_end). And keeping that kind
of driver in sync with all DSA protocols that appear will become a
repetitive task.

So crisis averted, I guess?
Thanks a lot to both of you for the patient explanations!
I retract my proposal for a new ndo and also suggest that the DSA master
driver takes care to not leave as zero a TX checksum it can't offload.
Luiz Angelo Daros de Luca Jan. 25, 2022, 7:15 a.m. UTC | #37
Wow... that's a lot to digest.

> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---
>  .../files/drivers/net/ethernet/ralink/mtk_eth_soc.c  | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
>
> diff --git a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
> index e07e5ed5a8f8..6ed9bc5942fd 100644
> --- a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
> +++ b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
> @@ -31,6 +31,7 @@
>  #include <linux/io.h>
>  #include <linux/bug.h>
>  #include <linux/netfilter.h>
> +#include <net/dsa.h>
>  #include <net/netfilter/nf_flow_table.h>
>  #include <linux/of_gpio.h>
>  #include <linux/gpio.h>
> @@ -1497,6 +1498,16 @@ static int fe_change_mtu(struct net_device *dev, int new_mtu)
>         return fe_open(dev);
>  }
>
> +static netdev_features_t fe_features_check(struct sk_buff *skb,
> +                                          struct net_device *dev,
> +                                          netdev_features_t features)
> +{
> +       if (netdev_uses_dsa(dev))
> +               features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
> +
> +       return features;
> +}
> +
>  static const struct net_device_ops fe_netdev_ops = {
>         .ndo_init               = fe_init,
>         .ndo_uninit             = fe_uninit,
> @@ -1514,6 +1525,7 @@ static const struct net_device_ops fe_netdev_ops = {
>  #ifdef CONFIG_NET_POLL_CONTROLLER
>         .ndo_poll_controller    = fe_poll_controller,
>  #endif
> +       .ndo_features_check     = fe_features_check,
>  };
>
>  static void fe_reset_pending(struct fe_priv *priv)

Thanks, Vladimir. I'll try that patch soon. However, it will never be
accepted even in OpenWrt as is because it does offload its own
proprietary tag.
I might need to add another if like:

> +       if (netdev_uses_dsa(dev))
> +               if (skb->???->proto_in_use != DSA_TAG_PROTO_MTK)
> +                      features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);

I think it would need to save the used tag in some form of an oob
signal (as Florian suggested). In that case, even the
netdev_uses_dsa() test can be removed as a skb with an oob tag signal
is surely from a dsa device.

Anyway, even with existing offload code not doing exactly what they
should, they normally work given a normal device usage. The strange
part is that DSA assumes those features, copied from master to slave,
will still be the same even after a tag was injected into the packet.

Sorry for my arrogance being a newbie but I think the place to fix the
problem is still in slave feature list. It is better than having the
kernel repeat the test for every single packet. And nobody will be
willing to add extra overhead to a working code just because DSA needs
it. It is easier to add a new function that does not touch existing
code paths.

I believe that those drivers with NETIF_F_HW_CSUM are fine for every
type of DSA, right? So, just those with NETIF_F_IP_CSUM |
NETIF_F_IPV6_CSUM set need to be adjusted. A fully implemented
ndo_features_check() will work but improving it for every driver will
add extra code/overhead for all packets, used with DSA or not. And
that extra code needed for DSA will either always keep or remove the
same features for a given slave.

I imagine that for NETIF_F_CSUM_MASK and NETIF_F_GSO_MASK, it would
not be too hard to build a set of candidate packets to test if that
feature is still valid after the tag was added. With that assumption,
a new ndo_features_check_offline(), similar to ndo_features_check()
but not called by netif_skb_features, will test each candidate
during slave setup. If the check disagrees after the tag was added,
that feature should be disabled for that slave. Something like:

slave->features = master->features;
if (slave->features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))
    if (dev->netdev_ops->ndo_features_check_offline)
        foreach (test_candidate)
            tagged_test_candidate = add_tag (test_candidate, slave->tag);

            slave->features &=
~(master->netdev_ops->ndo_features_check_offline(test_candidate,
master, slave->features) ^

master->netdev_ops->ndo_features_check_offline(tagged_test_candidate,
master, slave->features)
    else
        slave->features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK)

The only drivers that would have performance regression while used as
DSA master ports are those:
1) that do not have NETIF_F_HW_CSUM set
2) but could still offload after a particular DSA tag was added (when
tag vendor and HW matches)
3) and still didn't implement the new ndo_features_check_offline().

ndo_features_check_offline() would not be too much different from what
Vladimir suggested for the out-of-tree mtk_eth_soc driver.

ndo_features_check_offline(skb, dev, features) {
    switch (skb->oob->tag) {
    case SUPPORTED_TAG_1:
    case SUPPORTED_TAG_2:
    case SUPPORTED_TAG_3:
    case NO_TAG:
        break;
    default:
        features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
    }

    if (dev->netdev_ops->ndo_features_check)
         features &= dev->netdev_ops->ndo_features_check(skb, dev, features);

    /* some more test if needed*/

    return features;
}

If used exclusively by DSA, ndo_features_check_offline could also be
called ndo_dsa_features_check (or any better name than
ndo_features_check_offline). That is not far away from what Vladimir
suggested (and later retracted) in the first place.

Regards,

Luiz
Arınç ÜNAL Jan. 25, 2022, 9:44 a.m. UTC | #38
On 25/01/2022 01:30, Vladimir Oltean wrote:
> On Mon, Jan 24, 2022 at 01:42:42PM -0800, Jakub Kicinski wrote:
>> On Mon, 24 Jan 2022 22:56:07 +0200 Vladimir Oltean wrote:
>>> On Mon, Jan 24, 2022 at 11:38:12AM -0800, Jakub Kicinski wrote:
>>>> On Mon, 24 Jan 2022 21:08:45 +0200 Vladimir Oltean wrote:
>>>>> So before we declare that any given Ethernet driver is buggy for declaring
>>>>> NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and not checking that skb->csum_start
>>>>> points where it expects it to (taking into consideration potential VLAN
>>>>> headers, IPv6 extension headers),
>>>>
>>>> Extension headers are explicitly not supported by NETIF_F_IPV6_CSUM.
>>>>
>>>> IIRC Tom's hope was to delete NETIF_F_IP*_CSUM completely once all
>>>> drivers are converted to parsing and therefore can use NETIF_F_HW_CSUM.
>>>
>>> IIUC, NETIF_F_IP*_CSUM vs NETIF_F_HW_CSUM doesn't make that big of a
>>> difference in terms of what the driver should check for, if the hardware
>>> checksum offload engine can't directly be given the csum_start and
>>> csum_offset, wherever they may be.
>>>
>>>>> is there any driver that _does_ perform these checks correctly, that
>>>>> could be used as an example?
>>>>
>>>> I don't think so. Let me put it this way - my understanding is that up
>>>> until now we had been using the vlan_features, mpls_features etc to
>>>> perform L2/L2.5/below-IP feature stripping. This scales poorly to DSA
>>>> tags, as discussed in this thread.
>>>>
>>>> I'm suggesting we extend the kind of checking we already do to work
>>>> around inevitable deficiencies of device parsers for tunnels to DSA
>>>> tags.
>>>
>>> Sorry, I'm very tired and I probably don't understand what you're
>>> saying, so excuse the extra clarification questions.
>>>
>>> The typical protocol checking that drivers with NETIF_F_HW_CSUM do seems
>>> to be based on vlan_get_protocol()/skb->protocol/skb_network_header()/
>>> skb_transport_header() values, all of which make DSA invisible. So they
>>> don't work if the underlying hardware really doesn't like seeing an
>>> unexpected DSA header.
>>>
>>> When you say "I'm suggesting we extend the kind of checking we already do",
>>> do you mean we should modify the likes of e1000e and igb such that, if
>>> they're ever used as DSA masters, they do a full header parse of the
>>> packet (struct ethhdr :: h_proto, check if VLAN, struct iphdr/ipv6hdr,
>>> etc.) instead of the current logic?
>>
>> That was my thinking, yes. The exact amount of work depends on the
>> driver, I believe that more recent Intel parts (igb, ixgbe and newer)
>> pass a L3 offset to the HW. They treat L2 as opaque, ergo no patches
>> needed. At a glance e1000e passes the full skb_checksum_start_offset()
>> to HW, so likely even better.
> 
> Ah, right, I missed that. I agree that a driver that uses
> skb->csum_start is very likely to work unmodified with DSA headers
> (not trailers). I didn't notice this in e1000 because I was just
> searching for csum_start.
> 
>> It's only drivers for devices which actually want to parse the Ethertype
>> that would need extra checks. (Coincidentally such devices can't support
>> MPLS given the lack of L3 indication in the frame.)
>>
>>> It will be pretty convoluted unless
>>> we have some helper. Because if I follow through, for a DSA-tagged IP
>>> packet on xmit, skb->protocol is certainly htons(ETH_P_IP):
>>>
>>> ntohs(skb->protocol) = 0x800, csum_offset = 16, csum_start = 280, skb_checksum_start_offset = 54, skb->network_header = 260, skb_network_header_len = 20
>>>
>>> skb_dump output:
>>> skb len=94 headroom=226 headlen=94 tailroom=384
>>> mac=(226,34) net=(260,20) trans=280
>>> shinfo(txflags=0 nr_frags=0 gso(size=0 type=1 segs=1))
>>> csum(0x100118 ip_summed=3 complete_sw=0 valid=0 level=0)
>>> hash(0x7710ee84 sw=0 l4=1) proto=0x0800 pkttype=0 iif=0
>>> dev name=eno2 feat=0x00020100001149a9
>>> sk family=2 type=1 proto=6
>>> skb headroom: 00000000: 6c 00 03 02 64 65 76 00 fe ed ca fe 28 00 00 00
>>> ...(junk)...
>>> skb headroom: 000000e0: 5f 43
>>>                          20 byte DSA tag
>>>                          |
>>>                          v
>>> skb linear:   00000000: 88 80 00 0a 80 00 00 00 00 00 00 00 08 00 30 00
>>>                                      skb_mac_header()
>>>                                      |
>>>                                      v
>>> skb linear:   00000010: 00 00 00 00 68 05 ca 92 af 20 00 04 9f 05 f6 28
>>>                                skb_network_header()
>>>                                |
>>>                                v
>>> skb linear:   00000020: 08 00 45 00 00 3c 26 47 40 00 40 06 00 49 0a 00
>>>                                            skb_checksum_start_offset
>>>                                            |
>>>                                            |                       csum_offset
>>>                                            v                       v
>>> skb linear:   00000030: 00 2c 0a 00 00 01 b6 08 14 51 11 1f 91 4f 00 00
>>> skb linear:   00000040: 00 00 a0 02 fa f0 14 5b 00 00 02 04 05 b4 04 02
>>> skb linear:   00000050: 08 0a 2e 00 e5 b8 00 00 00 00 01 03 03 07
>>
>> Oof, so in this case the DSA tag is _before_ the skb_mac_header()?
>> Or the prepend is supposed to be parsable as a Ethernet header?
>> Seems like any device that can do csum over this packet must already
>> use L3/L4 offsets or have explicit knowledge of DSA, right?
> 
> I'm sorry, there's a mistake, skb_mac_header() points to the DSA tag
> here (and skb_mac_header_len is 34), I wanted to say "real MAC header"
> but failed to do so. This case shouldn't pose any special problems.
> 
>>> I don't know, I just don't expect that non-DSA users of those drivers
>>> will be very happy about such changes. Do these existing protocol
>>> checking schemes qualify as buggy?
>>
>> Unfortunate reality of the checksum offloads is that most drivers for
>> devices which parse on Tx are buggy, it's more of a question of whether
>> anyone tried to use an unsupported protocol stack :( Recent example
>> that comes to mind is 1698d600b361 ("bnxt_en: Implement
>> .ndo_features_check().").
> 
> Nice hook this ndo_features_check! In my reading of validate_xmit_skb()
> I went right past it.
> 
>>> If this is the convention that we want to enforce, then I can't really
>>> help Luiz with fixing the OpenWRT mtk_eth_soc.c - he'll have to figure
>>> out a way to parse the packets for which his hardware will accept the
>>> checksumming offload, and call skb_checksum_help() otherwise.
>>>
>>>> We can come up with various schemes of expressing capabilities
>>>> between underlying driver and tag driver. I'm not aware of similar
>>>> out-of-band schemes existing today so it'd be "DSA doing it's own
>>>> thing", which does not seem great.
>>>
>>> It at least seems less complex to me, and less checking in the fast path
>>> if I understand everything that's been said correctly.
>>
>> I understand, I'm primarily trying to share some context and prior work.
>> I don't mean to nack all other approaches.
>>
>> I believe writing a parser matching the device behavior would be easier
>> for a driver author than interpreting the runes of our csum offload API
>> and getting thru the thicket of all the bits. If that's not the case my
>> argument is likely defeated.
> 
> Ok, writing a parser might be needed if the DSA master is going, for
> some reason, to support TX checksum offloading with some DSA headers but
> not with others.
> 
> If that is not the case, and that Ethernet controller simply doesn't
> support TX checksumming unless it's the plain old Ethernet {+ VLAN} +
> IP/IPv6 + TCP/UDP, then this blanket patch below should fix the problem
> almost elegantly, and parsing is a useless complication (warning, not
> even compile-tested!):

I tried it on the upstream mtk_eth_soc on an mt7621a board with an 
rtl8367s switch connected to the 2nd GMAC of the SoC running 5.17-rc1.

Although "ethtool --show-offload eth1" does not show anything different, 
I can now ssh to the device (only way that came into my mind to quickly 
check TCP traffic).

--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1986,6 +1986,16 @@ static int mtk_hwlro_get_fdir_all(struct
  	return 0;
  }

+static netdev_features_t fe_features_check(struct sk_buff *skb,
+					   struct net_device *dev,
+					   netdev_features_t features)
+{
+	if (netdev_uses_dsa(dev))
+		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+
+	return features;
+}
+
  static netdev_features_t mtk_fix_features(struct net_device *dev,
  					  netdev_features_t features)
  {
@@ -2906,6 +2916,7 @@ static const struct net_device_ops mtk_n
  #ifdef CONFIG_NET_POLL_CONTROLLER
  	.ndo_poll_controller	= mtk_poll_controller,
  #endif
+	.ndo_features_check	= fe_features_check,
  	.ndo_setup_tc		= mtk_eth_setup_tc,
  };


> 
> -----------------------------[ cut here ]-----------------------------
>  From 5ef3d3cd8441d756933558212f518f48754c64d9 Mon Sep 17 00:00:00 2001
> From: Vladimir Oltean <vladimir.oltean@nxp.com>
> Date: Tue, 25 Jan 2022 00:16:57 +0200
> Subject: [PATCH] ramips: ethernet: ralink: disable TX checksumming on DSA
>   masters
> 
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---
>   .../files/drivers/net/ethernet/ralink/mtk_eth_soc.c  | 12 ++++++++++++
>   1 file changed, 12 insertions(+)
> 
> diff --git a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
> index e07e5ed5a8f8..6ed9bc5942fd 100644
> --- a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
> +++ b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
> @@ -31,6 +31,7 @@
>   #include <linux/io.h>
>   #include <linux/bug.h>
>   #include <linux/netfilter.h>
> +#include <net/dsa.h>
>   #include <net/netfilter/nf_flow_table.h>
>   #include <linux/of_gpio.h>
>   #include <linux/gpio.h>
> @@ -1497,6 +1498,16 @@ static int fe_change_mtu(struct net_device *dev, int new_mtu)
>   	return fe_open(dev);
>   }
>   
> +static netdev_features_t fe_features_check(struct sk_buff *skb,
> +					   struct net_device *dev,
> +					   netdev_features_t features)
> +{
> +	if (netdev_uses_dsa(dev))
> +		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
> +
> +	return features;
> +}
> +
>   static const struct net_device_ops fe_netdev_ops = {
>   	.ndo_init		= fe_init,
>   	.ndo_uninit		= fe_uninit,
> @@ -1514,6 +1525,7 @@ static const struct net_device_ops fe_netdev_ops = {
>   #ifdef CONFIG_NET_POLL_CONTROLLER
>   	.ndo_poll_controller	= fe_poll_controller,
>   #endif
> +	.ndo_features_check	= fe_features_check,
>   };
>   
>   static void fe_reset_pending(struct fe_priv *priv)
> -----------------------------[ cut here ]-----------------------------
> 
> This is essentially what Florian said ages ago, it just took me a very
> long time to process. I guess what hadn't fully clicked in my head is
> that the TX checksumming offload being functional is more a matter of
> telling the hardware what are the L3 and L4 offsets, and the csum_offset,
> rather than it requiring any particular understanding of the DSA header.
> In turn, it means that for "nice" Ethernet controller implementations
> where that is the case, it would be actively detrimental to add a new
> .ndo_get_dsa_features() or something like that - because such a driver
> would report that it supports all DSA header-type formats (trailers are
> still broken as long as there isn't a csum_end). And keeping that kind
> of driver in sync with all DSA protocols that appear will become a
> repetitive task.
> 
> So crisis averted, I guess?
> Thanks a lot to both of you for the patient explanations!
> I retract my proposal for a new ndo and also suggest that the DSA master
> driver takes care to not leave as zero a TX checksum it can't offload.
Vladimir Oltean Jan. 25, 2022, 9:47 a.m. UTC | #39
Hi Luiz,

On Tue, Jan 25, 2022 at 04:15:23AM -0300, Luiz Angelo Daros de Luca wrote:
> I believe that those drivers with NETIF_F_HW_CSUM are fine for every
> type of DSA, right? So, just those with NETIF_F_IP_CSUM |
> NETIF_F_IPV6_CSUM set needs to be adjusted. A fully implemented
> ndo_features_check() will work but improving it for every driver will
> add extra code/overhead for all packets, used with DSA or not. And
> that extra code needed for DSA will either always keep or remove the
> same features for a given slave.

Could you implement a prototype of packet parsing in ndo_features_check,
which checks for the known DSA EtherType and clears the offload bit for
unsupported packets, and do some performance testing before and after,
to lean the argument in your favor with some numbers? I've no problem if
you test for the worst case, i.e. line rate with small UDP packets
encapsulated with the known (offload-capable) DSA tag format, where
there is little benefit for offloading TX checksumming.
Luiz Angelo Daros de Luca Jan. 25, 2022, 10:29 p.m. UTC | #40
> Could you implement a prototype of packet parsing in ndo_features_check,
> which checks for the known DSA EtherType and clears the offload bit for
> unsupported packets, and do some performance testing before and after,
> to lean the argument in your favor with some numbers? I've no problem if
> you test for the worst case, i.e. line rate with small UDP packets
> encapsulated with the known (offload-capable) DSA tag format, where
> there is little benefit for offloading TX checksumming.

There is no way to tell if a packet has a DSA tag only by parsing its
content. For Realtek and Marvell EDSA, there is a distinct ethertype
(although Marvell EDSA uses a non-registered number) that drivers can
check. For others, especially those that add the tag before the
ethernet header or after the payload, it might not have a magic
number. It is impossible to reliably identify if and which DSA is in
use for some DSA tags from the packet alone. This is also the case for
mediatek. Although it places its tag just before ethertype (like
Realtek and Marvell), there is no magic number. It needs some context
to know what type of DSA was applied.

sk_buff today knows nothing about the added DSA tag. Although
net_device does know if it is a master port in a dsa tree, and it has
a default dsa tag, with multiple switches using different tags, it
cannot tell which dsa tag was added to that packet.
That is the information I need to test if that tag is supported or not
by this driver.

I believe once an offload HW can digest a dsa tag, it might support
the same type of protocols with or without the tag.
In the end, what really matters is if a driver supports a specific dsa tag.

Wouldn't it be much easier to have a dedicated optional
ndo_dsa_tag_supported()? It would be only needed for those drivers
that still use NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and only those that
can digest a tag.

Regards,
Florian Fainelli Jan. 25, 2022, 11:56 p.m. UTC | #41
On 1/25/2022 2:29 PM, Luiz Angelo Daros de Luca wrote:
>> Could you implement a prototype of packet parsing in ndo_features_check,
>> which checks for the known DSA EtherType and clears the offload bit for
>> unsupported packets, and do some performance testing before and after,
>> to lean the argument in your favor with some numbers? I've no problem if
>> you test for the worst case, i.e. line rate with small UDP packets
>> encapsulated with the known (offload-capable) DSA tag format, where
>> there is little benefit for offloading TX checksumming.
> 
> There is no way to tell if a packet has a DSA tag only by parsing its
> content. For Realtek and Marvel EDSA, there is a distinct ethertype
> (although Marvel EDSA uses a non-registered number) that drivers can
> check. For others, specially those that add the tag before the
> ethernet header or after the payload, it might not have a magic
> number. It is impossible to securely identify if and which DSA is in
> use for some DSA tags from the packet alone. This is also the case for
> mediatek. Although it places its tag just before ethertype (like
> Realtek and Marvel), there is no magic number. It needs some context
> to know what type of DSA was applied.

Looking at mtk_eth_soc.h TX_DMA_CHKSUM is 0x7 << 29 so we set 3 bits 
there, which makes me think that either we defined too many bits, or 
some of those bits have a compounded meaning. The rest of the bits do 
not seem to be defined, so maybe there is a programmable offset where to 
calculate the checksum from and deposit it. Is there a public 
programmable manual?

> 
> skb_buf today knows nothing about the added DSA tag. Although
> net_device does know if it is a master port in a dsa tree, and it has
> a default dsa tag, with multiple switches using different tags, it
> cannot tell which dsa tag was added to that packet.
> That is the information I need to test if that tag is supported or not
> by this drive.
> 
> I believe once an offload HW can digest a dsa tag, it might support
> the same type of protocols with or without the tag.
> In the end, what really matters is if a driver supports a specific dsa tag.

To be honest, I am not sure if we need to know about the specific 
details of the tag like is it Realtek, Broadcom, Mediatek, QCA, more 
than knowing whether the L3/L4 offsets will be at "expected" locations. 
By that I mean, located at 14 bytes from the start of the frame for IP 
without VLAN , and 18 bytes with VLAN, did we "stack" switch tags on top 
of another thus moving by another X bytes etc.

> 
> Wouldn't it be much easier to have a dedicated optional
> ndo_dsa_tag_supported()? It would be only needed for those drivers
> that still use NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and only those that
> can digest a tag.

I don't think we need to invent something new, we "just" need to tell 
the DSA conduit interface what type of switch tagger it is attached to 
and where it is in the Ethernet frame. Once we do that, the DSA conduit 
ought to be able to strip out features statically, or dynamically via 
ndo_features_check().
Luiz Angelo Daros de Luca Jan. 26, 2022, 10:49 p.m. UTC | #42
> On 1/25/2022 2:29 PM, Luiz Angelo Daros de Luca wrote:
> >> Could you implement a prototype of packet parsing in ndo_features_check,
> >> which checks for the known DSA EtherType and clears the offload bit for
> >> unsupported packets, and do some performance testing before and after,
> >> to lean the argument in your favor with some numbers? I've no problem if
> >> you test for the worst case, i.e. line rate with small UDP packets
> >> encapsulated with the known (offload-capable) DSA tag format, where
> >> there is little benefit for offloading TX checksumming.
> >
> > There is no way to tell if a packet has a DSA tag only by parsing its
> > content. For Realtek and Marvel EDSA, there is a distinct ethertype
> > (although Marvel EDSA uses a non-registered number) that drivers can
> > check. For others, specially those that add the tag before the
> > ethernet header or after the payload, it might not have a magic
> > number. It is impossible to securely identify if and which DSA is in
> > use for some DSA tags from the packet alone. This is also the case for
> > mediatek. Although it places its tag just before ethertype (like
> > Realtek and Marvel), there is no magic number. It needs some context
> > to know what type of DSA was applied.
>
> Looking at mtk_eth_soc.h TX_DMA_CHKSUM is 0x7 << 29 so we set 3 bits
> there, which makes me think that either we defined too many bits, or
> some of those bits have a compounded meaning. The rest of the bits do
> not seem to be defined, so maybe there is a programmable offset where to
> calculate the checksum from and deposit it. Is there a public
> programmable manual?

Thanks Florian, I'm using this that I googled ;-)

http://download.villagetelco.org/hardware/MT7620/MT7620_ProgrammingGuide.pdf
page 206?

It says:

DWORD0 31:0 SDP0 Segment Data Pointer0

DWORD1 31 DDONE DMA Done: Indicates DMA has transferred the segment
pointed to by this Tx
descriptor.
30 LS0 Last Segment0: Data pointed to by SDP0 is the last segment.
29:16 SDL0 Segment Data Length0: Segment data length for the data
pointed to by SDP0.
15 BURST When set, the scheduler cannot hand over to other Tx queues.
Should not transmit
the next packet.
14 LS1 Last Segment1: Data pointed to by SDP1 is the last segment.
13:0 SDL1 Segment Data Length1: Segment data length for the data
pointed to by SDP1.

DWORD2
31:0 SDP1 Segment Data Pointer1

DWORD3 (TXINFO)
31 ICO IP checksum offload enable
30 UCO UDP checksum offload enable
29 TCO TCP checksum offload enable
28 TSO TCP segmentation offload
27:20 FP_BMAP Forced destination port on GSW
bit[0:5]: Ports 0 to 5
bit[6]: CPU
bit[7]: PPE
FP_BMAP = 0: routing by DA
19:15 UDF User defined field
14 0 Reserved
13 0 Reserved
12 INSP Insert PPPoE header
11:8 SIDX PPPoE session index
7 INSV Insert VLAN tag
6:4 VPRI VLAN priority tag to be inserted
3:0 VIDX VLAN ID index

It looks like st->txd.txd4 is DWORD3. There is nothing too useful for
pointing L3 headers. The remaining bits are about vlan and pppoe
offload (and forcing the forwarding port).
There are those two segment 0 and 1 data pointers and their size in
txd{1,2,3} that I don't understand how it works (yet), but I guess
that is not what we are looking for.

There are also some offload settings at CDMA (page 217), but it is
simply an enable bit. (And I don't know what CDMA or GDMA are :-/).

> > skb_buf today knows nothing about the added DSA tag. Although
> > net_device does know if it is a master port in a dsa tree, and it has
> > a default dsa tag, with multiple switches using different tags, it
> > cannot tell which dsa tag was added to that packet.
> > That is the information I need to test if that tag is supported or not
> > by this drive.
> >
> > I believe once an offload HW can digest a dsa tag, it might support
> > the same type of protocols with or without the tag.
> > In the end, what really matters is if a driver supports a specific dsa tag.
>
> To be honest, I am not sure if we need to know about the specific
> details of the tag like is it Realtek, Broadcom, Mediatek, QCA, more
> than knowing whether the L3/L4 offsets will be at "expected" locations.
> By that I mean, located at 14 bytes from the start of the frame for IP
> without VLAN , and 18 bytes with VLAN, did we "stack" switch tags on top
> of another thus moving by another X bytes etc.

It would be perfect if the HW supported that (and I'm afraid it does not).

>
>
> >
> > Wouldn't it be much easier to have a dedicated optional
> > ndo_dsa_tag_supported()? It would be only needed for those drivers
> > that still use NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM and only those that
> > can digest a tag.
>
> I don't think we need to invent something new, we "just" need to tell
> the DSA conduit interface what type of switch tagger it is attached to
> and where it is in the Ethernet frame. Once we do that, the DSA conduit
> ought to be able to strip out features statically, or dynamically via
> ndo_features_check().

Is it a 1:1 relation between tags and the DSA conduit interface (I'm
guessing this is the CPU port Ethernet device)?
Anyway, this mediatek does not seem to support multiple tags. This
patch might disable offloading when it is using any DSA tags but the
mediatek one.
I never used the NONE tag but maybe I should also exempt that tag from
disabling offloading.

diff --git a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
index 0ae520183b..8eb5dd8721 100644
--- a/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
+++ b/target/linux/ramips/files/drivers/net/ethernet/ralink/mtk_eth_soc.c
@@ -31,6 +31,7 @@
#include <linux/io.h>
#include <linux/bug.h>
#include <linux/netfilter.h>
+#include <net/dsa.h>
#include <net/netfilter/nf_flow_table.h>
#include <linux/of_gpio.h>
#include <linux/gpio.h>
@@ -788,6 +789,27 @@ err_out:
       return -1;
}

+static netdev_features_t fe_features_check(struct sk_buff *skb,
+                                          struct net_device *dev,
+                                          netdev_features_t features)
+{
+       /* No point in doing any of this if neither checksum nor GSO are
+        * being requested for this frame. We can rule out both by just
+        * checking for CHECKSUM_PARTIAL
+        */
+       if (skb->ip_summed != CHECKSUM_PARTIAL)
+               return features;
+
+       if (netdev_uses_dsa(dev)) {
+               struct dsa_device_ops *tag_ops = dev->dsa_ptr->tag_ops;
+
+               if (tag_ops && (tag_ops->proto != DSA_TAG_PROTO_MTK))
+                       features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+       }
+
+       return features;
+}
+
static inline int fe_skb_padto(struct sk_buff *skb, struct fe_priv *priv)
{
       unsigned int len;
@@ -1523,6 +1545,7 @@ static const struct net_device_ops fe_netdev_ops = {
#ifdef CONFIG_NET_POLL_CONTROLLER
       .ndo_poll_controller    = fe_poll_controller,
#endif
+       .ndo_features_check     = fe_features_check,
};

static void fe_reset_pending(struct fe_priv *priv)

However, it still feels odd to me to call a function for every single
packet when the return value depends exclusively on a state that will
change only when the CPU port joins or leaves the DSA switch (or when
the tag is changed).

Regards,

Luiz
Luiz Angelo Daros de Luca Jan. 30, 2022, 1:54 a.m. UTC | #43
> I suggested it might be checksum problem because I'm also affected. In
> my case, I have an mt7620a SoC connected to the rtl8367s switch. The
> OS offloads checksum to HW but the mt7620a cannot calculate the
> checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
> move the CPU tag to test if the mt7620a will then digest the frame
> correctly.

I implemented a new DSA tag (rtl8_4t, with "t" as in trailing) that
puts the DSA tag before the Ethernet CRC (the switch supports both).
With no tag in the mac layer, mediatek correctly calculated the ip
checksum. However, mediatek SoC included the extra bytes from the DSA
tag in the TCP checksum, even if they are after the ip length.

This is the packet leaving the OS:

0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
0030   fe 88 83 82 00 00 02 04 05 b4 04 02 08 0a 01 64
0040   fb 28 66 42 e0 79 01 03 03 03 88 99 04 00 00 20
0050   00 08

The TCP checksum is at offset 0x0032, with value 0x8382
The DSA tag is at offset 0x4a, with value 8899040000200008

This is what arrived at the other end:

0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
0030   fe 88 c3 e8 00 00 02 04 05 b4 04 02 08 0a 01 64
0040   fb 28 66 42 e0 79 01 03 03 03

TCP checksum is 0xc3e8, but the correct one should be 0x50aa
If you calculate tcp checksum including 8899040000200008, you'll get exactly
0xc3e8 (I did the math).

So, If we use a trailing DSA tag, we can leave the IP checksum offloading on
and just turn off the TCP checksum offload. Is it worth it?

Is it still interesting to have the rtl8_4t merged?

Regards,
Luiz Angelo Daros de Luca Jan. 30, 2022, 4:42 a.m. UTC | #44
> > I suggested it might be checksum problem because I'm also affected. In
> > my case, I have an mt7620a SoC connected to the rtl8367s switch. The
> > OS offloads checksum to HW but the mt7620a cannot calculate the
> > checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
> > move the CPU tag to test if the mt7620a will then digest the frame
> > correctly.
>
> I implemented a new DSA tag (rtl8_4t, with "t" as in trailing) that
> puts the DSA tag before the Ethernet CRC (the switch supports both).
> With no tag in the mac layer, mediatek correctly calculated the ip
> checksum. However, mediatek SoC included the extra bytes from the DSA
> tag in the TCP checksum, even if they are after the ip length.
>
> This is the packet leaving the OS:
>
> 0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
> 0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
> 0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
> 0030   fe 88 83 82 00 00 02 04 05 b4 04 02 08 0a 01 64
> 0040   fb 28 66 42 e0 79 01 03 03 03 88 99 04 00 00 20
> 0050   00 08
>
> TCP checksum is at 0x0032 with 0x8382 is the tcp checksum
> DSA Tag is at 0x4a with 8899040000200008
>
> This is what arrived at the other end:
>
> 0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
> 0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
> 0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
> 0030   fe 88 c3 e8 00 00 02 04 05 b4 04 02 08 0a 01 64
> 0040   fb 28 66 42 e0 79 01 03 03 03
>
> TCP checksum is 0xc3e8, but the correct one should be 0x50aa
> If you calculate tcp checksum including 8899040000200008, you'll get exactly
> 0xc3e8 (I did the math).
>
> So, If we use a trailing DSA tag, we can leave the IP checksum offloading on
> and just turn off the TCP checksum offload. Is it worth it?

No, IP checksum is always done in SW.

> Is it still interesting to have the rtl8_4t merged?

Maybe it is. It has uncovered a problem. The case of trailing tags
seems to be unsolvable even with csum_start. AFAIK, the driver must
cksum from "skb->csum_start up to the end". When the switch is using
an incompatible tag, we have:

slave(): my features copied from master tells me I can offload
checksum. Do nothing
tagger(): add tag to the end of skb
master(): Offloading HW, chksum from csum_start until the end,
including the added tag
switch(): remove the tag, forward to the network
remote_client(): I got a packet with a broken checksum.

ndo_features_check() will not help because, either in HW or SW, it is
expected to calculate the checksum up to the end. However, there is no
csum_end or csum_len. I don't know if HW offloading will support some
kind of csum_end but it would not be a problem in SW (considering
skb_checksum_help() is adapted to something like skb_checksum_trimmed
without the clone).

That amount of bytes to ignore at the end is a complex question: the
driver either needs some hint (like it happens with skb->csum_offset)
to know where transport payload ends or the taggers (or the dsa) must
save the amount of extra bytes (or which tags were added) in the
sk_buff. With that info, the driver can check if HW will work with a
different csum_start / csum_end or if only a supported tag is in use.

In my case, using an incompatible trailing tag, I just made it work by
hacking dsa and forcing slave interfaces to disable offloading. This
way, checksum is calculated before any tag is added and offloading is
skipped. But it is not a real solution.

Regards,

Luiz,
Florian Fainelli Jan. 30, 2022, 5:24 p.m. UTC | #45
On 1/29/2022 8:42 PM, Luiz Angelo Daros de Luca wrote:
>>> I suggested it might be checksum problem because I'm also affected. In
>>> my case, I have an mt7620a SoC connected to the rtl8367s switch. The
>>> OS offloads checksum to HW but the mt7620a cannot calculate the
>>> checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
>>> move the CPU tag to test if the mt7620a will then digest the frame
>>> correctly.
>>
>> I implemented a new DSA tag (rtl8_4t, with "t" as in trailing) that
>> puts the DSA tag before the Ethernet CRC (the switch supports both).
>> With no tag in the mac layer, mediatek correctly calculated the ip
>> checksum. However, mediatek SoC included the extra bytes from the DSA
>> tag in the TCP checksum, even if they are after the ip length.
>>
>> This is the packet leaving the OS:
>>
>> 0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
>> 0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
>> 0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
>> 0030   fe 88 83 82 00 00 02 04 05 b4 04 02 08 0a 01 64
>> 0040   fb 28 66 42 e0 79 01 03 03 03 88 99 04 00 00 20
>> 0050   00 08
>>
>> TCP checksum is at 0x0032 with 0x8382 is the tcp checksum
>> DSA Tag is at 0x4a with 8899040000200008
>>
>> This is what arrived at the other end:
>>
>> 0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
>> 0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
>> 0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
>> 0030   fe 88 c3 e8 00 00 02 04 05 b4 04 02 08 0a 01 64
>> 0040   fb 28 66 42 e0 79 01 03 03 03
>>
>> TCP checksum is 0xc3e8, but the correct one should be 0x50aa
>> If you calculate tcp checksum including 8899040000200008, you'll get exactly
>> 0xc3e8 (I did the math).
>>
>> So, If we use a trailing DSA tag, we can leave the IP checksum offloading on
>> and just turn off the TCP checksum offload. Is it worth it?
> 
> No, IP checksum is always done in SW.
> 
>> Is it still interesting to have the rtl8_4t merged?
> 
> Maybe it is. It has uncovered a problem. The case of trailing tags
> seems to be unsolvable even with csum_start. AFAIK, the driver must
> cksum from "skb->csum_start up to the end". When the switch is using
> an incompatible tag, we have:
> 
> slave(): my features copied from master tells me I can offload
> checksum. Do nothing
> tagger(): add tag to the end of skb
> master(): Offloading HW, chksum from csum_start until the end,
> including the added tag
> switch(): remove the tag, forward to the network
> remove_client(): I got a packet with a broken checksum.

This is unfortunately expected here, because you program the hardware 
with the full Ethernet frame length which does include the trailer tag, 
and it then uses that length to calculate the transport header checksum 
over the entire payload, thinking the trailer tag is the UDP/TCP payload.

The checksum is calculated "on the fly" as part of the DMA operation to 
send the packet on the wire, so you cannot decouple the checksum 
calculation from the DMA operation, other than by asking the HW *not
to* checksum the packet, and instead having software provide that.

Now looking at the datasheet you quoted, there is this:

241. FE_GLO_CFG: Frame Engine Global Configuration (offset: 0x0000)

7:4 RW L2_SPACE L2 Space
(unit: 8 bytes)
0xB

Can you play with this and see if you can account for the extra 4 bytes 
added by the Realtek tag?

> 
> ndo_features_check() will not help because, either in HW or SW, it is
> expected to calculate the checksum up to the end. However, there is no
> csum_end or csum_len. I don't know if HW offloading will support some
> kind of csum_end but it would not be a problem in SW (considering
> skb_checksum_help() is adapted to something like skb_checksum_trimmed
> without the clone).
> 
> That amount of bytes to ignore at the end is a complex question: the
> driver either needs some hint (like it happens with skb->csum_offset)
> to know where transport payload ends or the taggers (or the dsa) must
> save the amount of extra bytes (or which tags were added) in the
> sbk_buff. With that info, the driver can check if HW will work with a
> different csum_start / csum_end or if only a supported tag is in use.
> 
> In my case, using an incompatible tailing tag, I just made it work
> hacking dsa and forcing slave interfaces to disable offloading. This
> way, checksum is calculated before any tag is added and offloading is
> skipped. But it is not a real solution.

Not sure which one is not a "real solution", but for this specific 
combination of DSA conduit driver and switch tag, you have to disable 
checksum offload in the conduit driver and provide it in software. The 
other way would be to configure the realtek switch to work with 
DSA_TAG_8021Q and see if you can continue to offload the data path since 
tagging would use regular 802.1Q vlans, but that means you are going to 
lose a whole lot of management functionality offered by the native 
Realtek tag.
Luiz Angelo Daros de Luca Jan. 31, 2022, 5:26 p.m. UTC | #46
> On 1/29/2022 8:42 PM, Luiz Angelo Daros de Luca wrote:
> >>> I suggested it might be checksum problem because I'm also affected. In
> >>> my case, I have an mt7620a SoC connected to the rtl8367s switch. The
> >>> OS offloads checksum to HW but the mt7620a cannot calculate the
> >>> checksum with the (EtherType) Realtek CPU Tag in place. I'll try to
> >>> move the CPU tag to test if the mt7620a will then digest the frame
> >>> correctly.
> >>
> >> I implemented a new DSA tag (rtl8_4t, with "t" as in trailing) that
> >> puts the DSA tag before the Ethernet CRC (the switch supports both).
> >> With no tag in the mac layer, mediatek correctly calculated the ip
> >> checksum. However, mediatek SoC included the extra bytes from the DSA
> >> tag in the TCP checksum, even if they are after the ip length.
> >>
> >> This is the packet leaving the OS:
> >>
> >> 0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
> >> 0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
> >> 0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
> >> 0030   fe 88 83 82 00 00 02 04 05 b4 04 02 08 0a 01 64
> >> 0040   fb 28 66 42 e0 79 01 03 03 03 88 99 04 00 00 20
> >> 0050   00 08
> >>
> >> TCP checksum is at 0x0032 with 0x8382 is the tcp checksum
> >> DSA Tag is at 0x4a with 8899040000200008
> >>
> >> This is what arrived at the other end:
> >>
> >> 0000   04 0e 3c fc 4f aa 50 d4 f7 33 15 8a 08 00 45 10
> >> 0010   00 3c 00 00 40 00 40 06 b7 58 c0 a8 01 01 c0 a8
> >> 0020   01 02 00 16 a1 50 80 da 39 e9 b2 2a 23 cf a0 12
> >> 0030   fe 88 c3 e8 00 00 02 04 05 b4 04 02 08 0a 01 64
> >> 0040   fb 28 66 42 e0 79 01 03 03 03
> >>
> >> TCP checksum is 0xc3e8, but the correct one should be 0x50aa
> >> If you calculate tcp checksum including 8899040000200008, you'll get exactly
> >> 0xc3e8 (I did the math).
> >>
> >> So, If we use a trailing DSA tag, we can leave the IP checksum offloading on
> >> and just turn off the TCP checksum offload. Is it worth it?
> >
> > No, IP checksum is always done in SW.
> >
> >> Is it still interesting to have the rtl8_4t merged?
> >
> > Maybe it is. It has uncovered a problem. The case of trailing tags
> > seems to be unsolvable even with csum_start. AFAIK, the driver must
> > cksum from "skb->csum_start up to the end". When the switch is using
> > an incompatible tag, we have:
> >
> > slave(): my features copied from master tells me I can offload
> > checksum. Do nothing
> > tagger(): add tag to the end of skb
> > master(): Offloading HW, chksum from csum_start until the end,
> > including the added tag
> > switch(): remove the tag, forward to the network
> > remove_client(): I got a packet with a broken checksum.
>
> This is unfortunately expected here, because you program the hardware
> with the full Ethernet frame length which does include the trailer tag,
> and it then uses that length to calculate the transport header checksum
> over the enter payload, thinking the trailer tag is the UDP/TCP payload.
>
> The checksum is calculated "on the fly" as part of the DMA operation to
> send the packet on the wire, so you cannot decouple the checksum
> calculation from the DMA operation, other than by not asking the HW *not
> to* checksum the packet, and instead having software provide that.
>
> Now looking at the datasheet you quoted, there is this:
>
> 241. FE_GLO_CFG: Frame Engine Global Configuration (offset: 0x0000)
>
> 7:4 RW L2_SPACE L2 Space
> (unit: 8 bytes)
> 0xB
>
> Can you play with this and see if you can account for the extra 4 bytes
> added by the Realtek tag?
>

I played with it, both with the L2_SPACE and RATE_MINUS:

FE_GLO_CFG_REG=0x10100000 FE_GLO_CFG_SIZE=32
FE_GLO_CFG=$(($(devmem $FE_GLO_CFG_REG $FE_GLO_CFG_SIZE)));
for l2space_sig in b0 b1 c0 c1 d0 d1 e0 e1 a0 a1 90 91 80 81 70 71 60
61 50 51 40 41 30 31 20 21 10 11 01 00 e0 e1; do
    FE_GLO_CFG=$(($(devmem $FE_GLO_CFG_REG $FE_GLO_CFG_SIZE)));
    printf 'Before FE_GLO_CFG = 0x%X\n' $FE_GLO_CFG;
    devmem $FE_GLO_CFG_REG $FE_GLO_CFG_SIZE $((FE_GLO_CFG & ~0x00000ff
| (0x$l2space_sig)));
    FE_GLO_CFG=$(($(devmem $FE_GLO_CFG_REG $FE_GLO_CFG_SIZE)));
    printf 'After  FE_GLO_CFG = 0x%X\n' $FE_GLO_CFG;
    echo "Please test L2_SPACE_sig==$l2space_sig"; read;
done; devmem $FE_GLO_CFG_REG $FE_GLO_CFG_SIZE $FE_GLO_CFG_ORIG

It only made a difference for values 0x0 and 0xf but it looks more
like an overflow. And only on the traffic I receive, not send. The
remote endpoint
always receive 0x8382 as the tcp checksum, which is the "fake ip header" sum.

The default value is 0xb (11) and the docs say it is in 8-byte units. What
is 11 * 8 bytes? 88 bytes? Maybe the docs are wrong.
That same register also has EXT_VLAN, which points to 0x8100 (802.1Q ethertype).

In the same doc, there is also a mention about the L2 space usage,
only related to received traffic:

"1. RX_CTRL pass through VLAN tags on L2 space (at most 2 tags)" (page 245-247)

Anyway, even if the Mediatek switch could remove the Realtek tag, it
should not do that. The Realtek switch still needs it.

> > ndo_features_check() will not help because, either in HW or SW, it is
> > expected to calculate the checksum up to the end. However, there is no
> > csum_end or csum_len. I don't know if HW offloading will support some
> > kind of csum_end but it would not be a problem in SW (considering
> > skb_checksum_help() is adapted to something like skb_checksum_trimmed
> > without the clone).
> >
> > That amount of bytes to ignore at the end is a complex question: the
> > driver either needs some hint (like it happens with skb->csum_offset)
> > to know where transport payload ends or the taggers (or the dsa) must
> > save the amount of extra bytes (or which tags were added) in the
> > sk_buff. With that info, the driver can check if HW will work with a
> > different csum_start / csum_end or if only a supported tag is in use.

I must be missing something. Is SW TCP checksum really broken when a
tailing tag is in use? If so, it will only work if TCP checksum
offload is enabled in a compatible HW. Anything else like different
vendors, software checksum or stacked tags will be broken.

> > In my case, using an incompatible tailing tag, I just made it work
> > hacking dsa and forcing slave interfaces to disable offloading. This
> > way, checksum is calculated before any tag is added and offloading is
> > skipped. But it is not a real solution.
>
> Not sure which one is not a "real solution", but for this specific
> combination of DSA conduit driver and switch tag, you have to disable
> checksum offload in the conduit driver and provide it in software. The
> other way would be to configure the realtek switch to work with
> DSA_TAG_8021Q and see if you can continue to offload the data path since
> tagging would use regular 802.1Q vlans, but that means you are going to
> lose a whole lot of management functionality offered by the native
> Realtek tag.

Definitely not a real solution. It was just a hack to check if
checksumming at slave device will overcome the issue. As I said,
simply disabling checksum and doing it in SW "as usual" is not enough
because SW checksum also sums to the end. We need to parse each
possible transport layer to find its end or taggers must hint how many
bytes to ignore, something like a new skb->cksum_stop_before_end.
Another solution would be to hint the slave interface if it needs to
checksum right there (modifying slave->vlan_features). None of that
exists today. Is it the right way?

--

Luiz
Vladimir Oltean Feb. 1, 2022, 2:46 p.m. UTC | #47
On Mon, Jan 31, 2022 at 02:26:30PM -0300, Luiz Angelo Daros de Luca wrote:
> > > In my case, using an incompatible tailing tag, I just made it work
> > > hacking dsa and forcing slave interfaces to disable offloading. This
> > > way, checksum is calculated before any tag is added and offloading is
> > > skipped. But it is not a real solution.
> >
> > Not sure which one is not a "real solution", but for this specific
> > combination of DSA conduit driver and switch tag, you have to disable
> > checksum offload in the conduit driver and provide it in software. The
> > other way would be to configure the realtek switch to work with
> > DSA_TAG_8021Q and see if you can continue to offload the data path since
> > tagging would use regular 802.1Q vlans, but that means you are going to
> > lose a whole lot of management functionality offered by the native
> > Realtek tag.
> 
> Definitely not a real solution. It was just a hack to check if
> checksumming at slave device will overcome the issue. As I said,
> simply disabling checksum and doing it in SW "as usual" is not enough
> because SW checksum also sums to the end. We need to parse each
> possible transport layer to find its end or taggers must hint how many
> bytes to ignore, something like a new skb->cksum_stop_before_end.
> Another solution would be to hint the slave interface if it needs to
> checksum right there (modifying slave->vlan_features). None of that
> exists today. Is it the right way?

I think we're not getting any closer to a solution if we've started
discussing tail taggers.

See commit 37120f23ac89 ("net: dsa: tag_ksz: dont let the hardware
process the layer 4 checksum"). It proves that if you calculate the L4
checksum in software before inserting the DSA tag, it won't get
recalculated upon dev_queue_xmit() on the DSA master, since
skb_checksum_help() transitions skb->ip_summed to CHECKSUM_NONE, and the
process of inserting a header/trailer will not update the checksum, so
it will end up being correct on the receive end after the tail tag is
stripped.

Otherwise, I don't completely understand what is the end goal you're
after. Each skb is checked for netdev features when determining whether
to calculate the L4 checksum in software or not. Then even if that skb
was marked for L4 checksum offload by the stack, you can still call
skb_checksum_help() from the xmit procedure of the driver.

Do you want hardware offloading with your DSA header, or why do you say
that forcing slave interfaces to disable the offload is not a real
solution? If so, I recommend looking into a custom tagging protocol
based on tag_8021q.c, but word of warning, some elbow grease will be
required.

If you're ok with software checksumming and just want the minimum amount
of checks in the fastpath, I believe you should listen for
NETDEV_CHANGEUPPER events in your DSA master driver, where
dsa_slave_dev_check(info->upper_dev) is true. From there you should be
able to retrieve the tagging protocol used (if you can't, then export some
helpers that will do that), and enable NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM
in master->features if the tag is Mediatek, clear them otherwise.
See bcmsysport.c for an example.
The timing of this notifier is such that it's pointless to mangle
master->vlan_features at that stage, since DSA has already inherited
them. So DSA slaves would still report NETIF_F_IP_CSUM, but the DSA
master would force a software calculation from the correct L3 & L4
offsets, and it would practically work.
Alternatively, I think you could move dsa_slave_setup_tagger() beneath
netdev_upper_dev_link(), and this would give the DSA master an
opportunity to modulate its master->vlan_features in a way that is
desirable to you. I don't see something that would break if you do that.

As Florian and Jakub explained, the APIs for TX checksumming are what
they are, I'm not very happy with the state of things either, but I
can't justify a DSA-specific API. With HW_CSUM, the stack gives you an
L3 and L4 offset, and that is compatible with DSA headers (not
trailers), so the onus is on the DSA master to fall back to software on
offsets it doesn't like.  One could argue that DSA should not work with
IP_CSUM | IPV6_CSUM, but I believe that there are existing drivers that
use these checksum features and that do work at least with certain DSA
tagging protocols (bcmsysport) or even look at the L3 and L4 offsets
(mvneta), meaning that they would work generically with DSA. So
practically speaking, if we issue a blanket statement that DSA shouldn't
inherit IP_CSUM | IPV6_CSUM but just HW_CSUM, that would still break
working setups. Now, we could still do that (since IP_CSUM | IPV6_CSUM
are theoretically deprecated), but then you'd have to be there and help
with some more elbow grease to fix the breakage in mvneta etc, to
convert them to HW_CSUM.
diff mbox series

Patch

diff --git a/drivers/net/dsa/realtek/rtl8365mb.c b/drivers/net/dsa/realtek/rtl8365mb.c
index 59e08b192c06..6a00a162b2ac 100644
--- a/drivers/net/dsa/realtek/rtl8365mb.c
+++ b/drivers/net/dsa/realtek/rtl8365mb.c
@@ -556,7 +556,6 @@  struct rtl8365mb_port {
  * @chip_ver: chip silicon revision
  * @port_mask: mask of all ports
  * @learn_limit_max: maximum number of L2 addresses the chip can learn
- * @cpu: CPU tagging and CPU port configuration for this chip
  * @mib_lock: prevent concurrent reads of MIB counters
  * @ports: per-port data
  * @jam_table: chip-specific initialization jam table
@@ -571,7 +570,6 @@  struct rtl8365mb {
 	u32 chip_ver;
 	u32 port_mask;
 	u32 learn_limit_max;
-	struct rtl8365mb_cpu cpu;
 	struct mutex mib_lock;
 	struct rtl8365mb_port ports[RTL8365MB_MAX_NUM_PORTS];
 	const struct rtl8365mb_jam_tbl_entry *jam_table;
@@ -769,17 +767,20 @@  static int rtl8365mb_ext_config_rgmii(struct realtek_priv *priv, int port,
 	u32 val;
 	int ret;
 
-	if (port != priv->cpu_port) {
-		dev_err(priv->dev, "only one EXT interface is currently supported\n");
+	mb = priv->chip_data;
+	p = &mb->ports[port];
+	ext_int = p->ext_int;
+
+	if (ext_int == RTL8365MB_NOT_EXT) {
+		dev_err(priv->dev,
+			"Port %d is not identified as extenal interface.\n",
+			port);
 		return -EINVAL;
 	}
 
 	dp = dsa_to_port(priv->ds, port);
 	dn = dp->dn;
 
-	mb = priv->chip_data;
-	p = &mb->ports[port];
-	ext_int = p->ext_int;
 
 	/* Set the RGMII TX/RX delay
 	 *
@@ -859,15 +860,17 @@  static int rtl8365mb_ext_config_forcemode(struct realtek_priv *priv, int port,
 	int val;
 	int ret;
 
-	if (port != priv->cpu_port) {
-		dev_err(priv->dev, "only one EXT interface is currently supported\n");
-		return -EINVAL;
-	}
-
 	mb = priv->chip_data;
 	p = &mb->ports[port];
 	ext_int = p->ext_int;
 
+	if (ext_int == RTL8365MB_NOT_EXT) {
+		dev_err(priv->dev,
+			"Port %d is not identified as extenal interface.\n",
+			port);
+		return -EINVAL;
+	}
+
 	if (link) {
 		/* Force the link up with the desired configuration */
 		r_link = 1;
@@ -1734,10 +1737,8 @@  static void rtl8365mb_irq_teardown(struct realtek_priv *priv)
 	}
 }
 
-static int rtl8365mb_cpu_config(struct realtek_priv *priv)
+static int rtl8365mb_cpu_config(struct realtek_priv *priv, struct rtl8365mb_cpu *cpu)
 {
-	struct rtl8365mb *mb = priv->chip_data;
-	struct rtl8365mb_cpu *cpu = &mb->cpu;
 	u32 val;
 	int ret;
 
@@ -1839,11 +1840,17 @@  static int rtl8365mb_setup(struct dsa_switch *ds)
 		dev_info(priv->dev, "no interrupt support\n");
 
 	/* Configure CPU tagging */
+	cpu.mask = 0;
 	dsa_switch_for_each_cpu_port(cpu_dp, priv->ds) {
-		priv->cpu_port = cpu_dp->index;
-		mb->cpu.mask = BIT(priv->cpu_port);
-		mb->cpu.trap_port = priv->cpu_port;
-		ret = rtl8365mb_cpu_config(priv);
+		cpu.enable = 1;
+		cpu.insert = RTL8365MB_CPU_INSERT_TO_ALL;
+		cpu.position = RTL8365MB_CPU_POS_AFTER_SA;
+		cpu.rx_length = RTL8365MB_CPU_RXLEN_64BYTES;
+		cpu.format = RTL8365MB_CPU_FORMAT_8BYTES;
+		cpu.trap_port = cpu_dp->index;
+		cpu.mask |= BIT(cpu_dp->index);
+
+		ret = rtl8365mb_cpu_config(priv, &cpu);
 		if (ret)
 			goto out_teardown_irq;
 
@@ -1862,7 +1869,7 @@  static int rtl8365mb_setup(struct dsa_switch *ds)
 		dn = dsa_to_port(priv->ds, i)->dn;
 
 		/* Forward only to the CPU */
-		ret = rtl8365mb_port_set_isolation(priv, i, BIT(priv->cpu_port));
+		ret = rtl8365mb_port_set_isolation(priv, i, cpu.mask);
 		if (ret)
 			goto out_teardown_irq;
 
@@ -2003,12 +2010,6 @@  static int rtl8365mb_detect(struct realtek_priv *priv)
 		mb->jam_table = rtl8365mb_init_jam_8365mb_vc;
 		mb->jam_size = ARRAY_SIZE(rtl8365mb_init_jam_8365mb_vc);
 
-		mb->cpu.enable = 1;
-		mb->cpu.insert = RTL8365MB_CPU_INSERT_TO_ALL;
-		mb->cpu.position = RTL8365MB_CPU_POS_AFTER_SA;
-		mb->cpu.rx_length = RTL8365MB_CPU_RXLEN_64BYTES;
-		mb->cpu.format = RTL8365MB_CPU_FORMAT_8BYTES;
-
 		break;
 	default:
 		dev_err(priv->dev,