diff mbox series

[bpf-next,V1] igc: enable and fix RX hash usage by netstack

Message ID 167604167956.1726972.7266620647404438534.stgit@firesoul (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series [bpf-next,V1] igc: enable and fix RX hash usage by netstack | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 0 this patch: 2
netdev/cc_maintainers fail 1 blamed authors not CCed: sasha.neftin@intel.com; 8 maintainers not CCed: john.fastabend@gmail.com pabeni@redhat.com kuba@kernel.org jesse.brandeburg@intel.com edumazet@google.com sasha.neftin@intel.com hawk@kernel.org davem@davemloft.net
netdev/build_clang success Errors and warnings before: 2 this patch: 2
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn fail Errors and warnings before: 0 this patch: 2
netdev/checkpatch fail CHECK: Please use a blank line after function/struct/union/enum declarations ERROR: spaces required around that '=' (ctx:VxW) WARNING: line length of 82 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix

Commit Message

Jesper Dangaard Brouer Feb. 10, 2023, 3:07 p.m. UTC
When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
("igc: Add transmit and receive fastpath and interrupt handlers"), the
hardware wasn't configured to provide RSS hash, thus it made sense to not
enable net_device NETIF_F_RXHASH feature bit.

The NIC hardware was configured to enable RSS hash info in v5.2 via commit
2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
forgot to set the NETIF_F_RXHASH feature bit.

The original implementation of igc_rx_hash() didn't extract the associated
pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
this patch are about extracting the RSS Type from the hardware and mapping
this to enum pkt_hash_types. This were based on Foxville i225 software user
manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).

For UDP it's worth noting that RSS (type) hashing have been disabled both for
IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
because hardware RSS doesn't handle fragmented pkts well when enabled (can
cause out-of-order). This result in PKT_HASH_TYPE_L3 for UDP packets, and
hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
the effect that netstack will do a software based hash calc calling into
flow_dissect, but only when code calls skb_get_hash(), which doesn't
necessary happen for local delivery.

Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/igc/igc.h      |   52 +++++++++++++++++++++++++++++
 drivers/net/ethernet/intel/igc/igc_main.c |   35 +++++++++++++++++---
 2 files changed, 83 insertions(+), 4 deletions(-)

Comments

Jesper Dangaard Brouer Feb. 10, 2023, 3:23 p.m. UTC | #1
On 10/02/2023 16.07, Jesper Dangaard Brouer wrote:
> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
> hardware wasn't configured to provide RSS hash, thus it made sense to not
> enable net_device NETIF_F_RXHASH feature bit.
> 
> The NIC hardware was configured to enable RSS hash info in v5.2 via commit
> 2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
> forgot to set the NETIF_F_RXHASH feature bit.
>

Sending this fix against bpf-next, as I found this issue while playing
with implementing XDP-hints kfunc for xmo_rx_hash. I will hopefully send
kfunc patches next week, on top of this.IMHO this fix isn't very 
critical and I hope it can simply go though the
bpf-next tree as it would ease followup kfunc patches.


> The original implementation of igc_rx_hash() didn't extract the associated
> pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
> this patch are about extracting the RSS Type from the hardware and mapping
> this to enum pkt_hash_types. This were based on Foxville i225 software user
> manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).
> 
> For UDP it's worth noting that RSS (type) hashing have been disabled both for
> IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
> because hardware RSS doesn't handle fragmented pkts well when enabled (can
> cause out-of-order). This result in PKT_HASH_TYPE_L3 for UDP packets, and
> hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
> the effect that netstack will do a software based hash calc calling into
> flow_dissect, but only when code calls skb_get_hash(), which doesn't
> necessary happen for local delivery.
>


Intel QA tester wanting to verify this patch can use the small bpftrace
tool I wrote and placed here:

 
https://github.com/xdp-project/xdp-project/blob/master/areas/hints/monitor_skb_hash_on_dev.bt

Failure scenarios:

$ sudo ./monitor_skb_hash_on_dev.bt igc1
Attaching 2 probes...
Monitor net_device: igc1
Hit Ctrl-C to end.
IFNAME           HASH      Hash-type:L4    Software-hash
igc1             00000000  0               0
igc1             00000000  0               0
igc1             00000000  0               0
^C


Example output with patch:

$ sudo ./monitor_skb_hash_on_dev.bt igc1
Attaching 2 probes...
Monitor net_device: igc1
Hit Ctrl-C to end.
IFNAME           HASH      Hash-type:L4    Software-hash
igc1             FEF98EFE  0               0
igc1             00000000  0               0
igc1             00000000  0               0
igc1             FEF98EFE  0               0
igc1             FEF98EFE  0               0
igc1             FEF98EFE  0               0
igc1             310AF9EA  1               0
igc1             A229FA51  1               0

The repeating hash FEF98EFE is UDP packets that as desc note doesn't
have Hash-type:L4.  The UDP has is repeating as port numbers aren't part
of the hash, and I was sending to same IP. The hash values with L4=1
were TCP packets.

Hope this eases QA work.

> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>   drivers/net/ethernet/intel/igc/igc.h      |   52 +++++++++++++++++++++++++++++
>   drivers/net/ethernet/intel/igc/igc_main.c |   35 +++++++++++++++++---
>   2 files changed, 83 insertions(+), 4 deletions(-)

--Jesper
Alexander Lobakin Feb. 14, 2023, 1:21 p.m. UTC | #2
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 10 Feb 2023 16:07:59 +0100

> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
> hardware wasn't configured to provide RSS hash, thus it made sense to not
> enable net_device NETIF_F_RXHASH feature bit.

[...]

> @@ -311,6 +311,58 @@ extern char igc_driver_name[];
>  #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
>  #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
>  
> +/* RX-desc Write-Back format RSS Type's */
> +enum igc_rss_type_num {
> +	IGC_RSS_TYPE_NO_HASH		= 0,
> +	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
> +	IGC_RSS_TYPE_HASH_IPV4		= 2,
> +	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
> +	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
> +	IGC_RSS_TYPE_HASH_IPV6		= 5,
> +	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
> +	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
> +	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
> +	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
> +	IGC_RSS_TYPE_MAX		= 10,
> +};
> +#define IGC_RSS_TYPE_MAX_TABLE		16
> +#define IGC_RSS_TYPE_MASK		0xF

GENMASK()?

> +
> +/* igc_rss_type - Rx descriptor RSS type field */
> +static inline u8 igc_rss_type(union igc_adv_rx_desc *rx_desc)

Why use types shorter than u32 on the stack?
Why this union is not const here, since there are no modifications?

> +{
> +	/* RSS Type 4-bit number: 0-9 (above 9 is reserved) */
> +	return rx_desc->wb.lower.lo_dword.hs_rss.pkt_info & IGC_RSS_TYPE_MASK;

The most important I wanted to mention: doesn't this function make the
CPU read the uncached field again, while you could just read it once
onto the stack and then extract all such data from there?

> +}
> +
> +/* Packet header type identified by hardware (when BIT(11) is zero).
> + * Even when UDP ports are not part of RSS hash HW still parse and mark UDP bits
> + */
> +enum igc_pkt_type_bits {
> +	IGC_PKT_TYPE_HDR_IPV4	=	BIT(0),
> +	IGC_PKT_TYPE_HDR_IPV4_WITH_OPT=	BIT(1), /* IPv4 Hdr includes IP options */
> +	IGC_PKT_TYPE_HDR_IPV6	=	BIT(2),
> +	IGC_PKT_TYPE_HDR_IPV6_WITH_EXT=	BIT(3), /* IPv6 Hdr includes extensions */
> +	IGC_PKT_TYPE_HDR_L4_TCP	=	BIT(4),
> +	IGC_PKT_TYPE_HDR_L4_UDP	=	BIT(5),
> +	IGC_PKT_TYPE_HDR_L4_SCTP=	BIT(6),
> +	IGC_PKT_TYPE_HDR_NFS	=	BIT(7),
> +	/* Above only valid when BIT(11) is zero */
> +	IGC_PKT_TYPE_L2		=	BIT(11),
> +	IGC_PKT_TYPE_VLAN	=	BIT(12),
> +	IGC_PKT_TYPE_MASK	=	0x1FFF, /* 13-bits */

Also GENMASK().

> +};
> +
> +/* igc_pkt_type - Rx descriptor Packet type field */
> +static inline u16 igc_pkt_type(union igc_adv_rx_desc *rx_desc)

Also short types and consts.

> +{
> +	u32 data = le32_to_cpu(rx_desc->wb.lower.lo_dword.data);
> +	/* Packet type is 13-bits - as bits (16:4) in lower.lo_dword*/
> +	u16 pkt_type = (data >> 4) & IGC_PKT_TYPE_MASK;

Perfect candidate for FIELD_GET(). No, even for le32_get_bits().

Also my note above about excessive expensive reads.

> +
> +	return pkt_type;
> +}
> +
>  /* Interrupt defines */
>  #define IGC_START_ITR			648 /* ~6000 ints/sec */
>  #define IGC_4K_ITR			980
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index 8b572cd2c350..42a072509d2a 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -1677,14 +1677,40 @@ static void igc_rx_checksum(struct igc_ring *ring,
>  		   le32_to_cpu(rx_desc->wb.upper.status_error));
>  }
>  
> +/* Mapping HW RSS Type to enum pkt_hash_types */
> +struct igc_rss_type {
> +	u8 hash_type; /* can contain enum pkt_hash_types */

Why make a struct for one field? + short type note

> +} igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
> +	[IGC_RSS_TYPE_NO_HASH].hash_type	  = PKT_HASH_TYPE_L2,
> +	[IGC_RSS_TYPE_HASH_TCP_IPV4].hash_type	  = PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_IPV4].hash_type	  = PKT_HASH_TYPE_L3,
> +	[IGC_RSS_TYPE_HASH_TCP_IPV6].hash_type	  = PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_IPV6_EX].hash_type	  = PKT_HASH_TYPE_L3,
> +	[IGC_RSS_TYPE_HASH_IPV6].hash_type	  = PKT_HASH_TYPE_L3,
> +	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_UDP_IPV4].hash_type	  = PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_UDP_IPV6].hash_type	  = PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
> +	[10].hash_type = PKT_HASH_TYPE_L2, /* RSS Type above 9 "Reserved" by HW */
> +	[11].hash_type = PKT_HASH_TYPE_L2,
> +	[12].hash_type = PKT_HASH_TYPE_L2,
> +	[13].hash_type = PKT_HASH_TYPE_L2,
> +	[14].hash_type = PKT_HASH_TYPE_L2,
> +	[15].hash_type = PKT_HASH_TYPE_L2,

Why define those empty if you could do a bound check in the code
instead? E.g. `if (unlikely(bigger_than_9)) return PKT_HASH_TYPE_L2`.

> +};
> +
>  static inline void igc_rx_hash(struct igc_ring *ring,
>  			       union igc_adv_rx_desc *rx_desc,
>  			       struct sk_buff *skb)
>  {
> -	if (ring->netdev->features & NETIF_F_RXHASH)
> -		skb_set_hash(skb,
> -			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
> -			     PKT_HASH_TYPE_L3);
> +	if (ring->netdev->features & NETIF_F_RXHASH) {

	if (!(feature & HASH))
		return;

and -1 indent level?

> +		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
> +		u8  rss_type = igc_rss_type(rx_desc);
> +		enum pkt_hash_types hash_type;
> +
> +		hash_type = igc_rss_type_table[rss_type].hash_type;
> +		skb_set_hash(skb, rss_hash, hash_type);
> +	}
>  }

[...]

Thanks,
Olek
Paul Menzel Feb. 14, 2023, 3 p.m. UTC | #3
Dear Jesper,


Thank you very much for your patch.

Am 10.02.23 um 16:07 schrieb Jesper Dangaard Brouer:
> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
> hardware wasn't configured to provide RSS hash, thus it made sense to not
> enable net_device NETIF_F_RXHASH feature bit.
> 
> The NIC hardware was configured to enable RSS hash info in v5.2 via commit
> 2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
> forgot to set the NETIF_F_RXHASH feature bit.
> 
> The original implementation of igc_rx_hash() didn't extract the associated
> pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
> this patch are about extracting the RSS Type from the hardware and mapping
> this to enum pkt_hash_types. This were based on Foxville i225 software user

s/This were/This was/

> manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).
> 
> For UDP it's worth noting that RSS (type) hashing have been disabled both for
> IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
> because hardware RSS doesn't handle fragmented pkts well when enabled (can
> cause out-of-order). This result in PKT_HASH_TYPE_L3 for UDP packets, and

result*s*

> hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
> the effect that netstack will do a software based hash calc calling into
> flow_dissect, but only when code calls skb_get_hash(), which doesn't
> necessary happen for local delivery.

Excuse my ignorance, but is that bug visible in practice by users 
(performance?) or is that fix needed for future work?

> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>   drivers/net/ethernet/intel/igc/igc.h      |   52 +++++++++++++++++++++++++++++
>   drivers/net/ethernet/intel/igc/igc_main.c |   35 +++++++++++++++++---
>   2 files changed, 83 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> index df3e26c0cf01..a112eeb59525 100644
> --- a/drivers/net/ethernet/intel/igc/igc.h
> +++ b/drivers/net/ethernet/intel/igc/igc.h
> @@ -311,6 +311,58 @@ extern char igc_driver_name[];
>   #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
>   #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
>   
> +/* RX-desc Write-Back format RSS Type's */
> +enum igc_rss_type_num {
> +	IGC_RSS_TYPE_NO_HASH		= 0,
> +	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
> +	IGC_RSS_TYPE_HASH_IPV4		= 2,
> +	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
> +	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
> +	IGC_RSS_TYPE_HASH_IPV6		= 5,
> +	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
> +	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
> +	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
> +	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
> +	IGC_RSS_TYPE_MAX		= 10,
> +};
> +#define IGC_RSS_TYPE_MAX_TABLE		16
> +#define IGC_RSS_TYPE_MASK		0xF
> +
> +/* igc_rss_type - Rx descriptor RSS type field */
> +static inline u8 igc_rss_type(union igc_adv_rx_desc *rx_desc)
> +{
> +	/* RSS Type 4-bit number: 0-9 (above 9 is reserved) */
> +	return rx_desc->wb.lower.lo_dword.hs_rss.pkt_info & IGC_RSS_TYPE_MASK;
> +}

Is it necessary to specficy the length of the return value, or could it 
be `unsigned int`. Using “native” types is normally more performant [1]. 
`scripts/bloat-o-meter` might help to verify that.

[…]

>   static inline void igc_rx_hash(struct igc_ring *ring,
>   			       union igc_adv_rx_desc *rx_desc,
>   			       struct sk_buff *skb)
>   {
> -	if (ring->netdev->features & NETIF_F_RXHASH)
> -		skb_set_hash(skb,
> -			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
> -			     PKT_HASH_TYPE_L3);
> +	if (ring->netdev->features & NETIF_F_RXHASH) {
> +		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
> +		u8  rss_type = igc_rss_type(rx_desc);

Amongst others, also here.

> +		enum pkt_hash_types hash_type;
> +
> +		hash_type = igc_rss_type_table[rss_type].hash_type;
> +		skb_set_hash(skb, rss_hash, hash_type);
> +	}
>   }
>   
>   static void igc_rx_vlan(struct igc_ring *rx_ring,
> @@ -6501,6 +6527,7 @@ static int igc_probe(struct pci_dev *pdev,
>   	netdev->features |= NETIF_F_TSO;
>   	netdev->features |= NETIF_F_TSO6;
>   	netdev->features |= NETIF_F_TSO_ECN;
> +	netdev->features |= NETIF_F_RXHASH;
>   	netdev->features |= NETIF_F_RXCSUM;
>   	netdev->features |= NETIF_F_HW_CSUM;
>   	netdev->features |= NETIF_F_SCTP_CRC;


Kind regards,

Paul


[1]: https://notabs.org/coding/smallIntsBigPenalty.htm
Alexander Lobakin Feb. 14, 2023, 3:13 p.m. UTC | #4
From: Paul Menzel <pmenzel@molgen.mpg.de>
Date: Tue, 14 Feb 2023 16:00:52 +0100

> Dear Jesper,
> 
> 
> Thank you very much for your patch.
> 
> Am 10.02.23 um 16:07 schrieb Jesper Dangaard Brouer:
>> When function igc_rx_hash() was introduced in v4.20 via commit
>> 0507ef8a0372
>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>> hardware wasn't configured to provide RSS hash, thus it made sense to not
>> enable net_device NETIF_F_RXHASH feature bit.
>>
>> The NIC hardware was configured to enable RSS hash info in v5.2 via
>> commit
>> 2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
>> forgot to set the NETIF_F_RXHASH feature bit.
>>
>> The original implementation of igc_rx_hash() didn't extract the
>> associated
>> pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest
>> portions of
>> this patch are about extracting the RSS Type from the hardware and
>> mapping
>> this to enum pkt_hash_types. This were based on Foxville i225 software
>> user
> 
> s/This were/This was/
> 
>> manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev
>> 03).
>>
>> For UDP it's worth noting that RSS (type) hashing have been disabled
>> both for
>> IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP +
>> IGC_MRQC_RSS_FIELD_IPV6_UDP)
>> because hardware RSS doesn't handle fragmented pkts well when enabled
>> (can
>> cause out-of-order). This result in PKT_HASH_TYPE_L3 for UDP packets, and
> 
> result*s*
> 
>> hash value doesn't include UDP port numbers. Not being
>> PKT_HASH_TYPE_L4, have
>> the effect that netstack will do a software based hash calc calling into
>> flow_dissect, but only when code calls skb_get_hash(), which doesn't
>> necessary happen for local delivery.
> 
> Excuse my ignorance, but is that bug visible in practice by users
> (performance?) or is that fix needed for future work?

Hash calculation always happens when RPS or RFS is enabled. So having no
hash in skb before hitting the netstack slows down their performance.
Also, no hash in skb passed from the driver results in worse NAPI bucket
distribution when there are more traffic flows than Rx queues / CPUs.
+ Netfilter needs hashes on some configurations.

On default configurations and workloads like browsing the Internet this
usually is not the case, but only then I'd say.

> 
>> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control
>> supporting")
>> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

[...]

Nice to see that you also care about (not) using short types on the stack :)

> Kind regards,
> 
> Paul
> 
> 
> [1]: https://notabs.org/coding/smallIntsBigPenalty.htm

Thanks,
Olek
Jesper Dangaard Brouer Feb. 16, 2023, 1:29 p.m. UTC | #5
On 14/02/2023 16.00, Paul Menzel wrote:
> 
> Thank you very much for your patch.

Thanks for your review :-)

> Am 10.02.23 um 16:07 schrieb Jesper Dangaard Brouer:
>> When function igc_rx_hash() was introduced in v4.20 via commit 
>> 0507ef8a0372
>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>> hardware wasn't configured to provide RSS hash, thus it made sense to not
>> enable net_device NETIF_F_RXHASH feature bit.
>>
>> The NIC hardware was configured to enable RSS hash info in v5.2 via 
>> commit
>> 2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
>> forgot to set the NETIF_F_RXHASH feature bit.
>>
>> The original implementation of igc_rx_hash() didn't extract the associated
>> pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
>> this patch are about extracting the RSS Type from the hardware and mapping
>> this to enum pkt_hash_types. This were based on Foxville i225 software 
>> user
> 
> s/This were/This was/

Fixed for V2

>> manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 
>> 03).
>>
>> For UDP it's worth noting that RSS (type) hashing have been disabled both for
>> IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
>> because hardware RSS doesn't handle fragmented pkts well when enabled 
>> (can cause out-of-order). This result in PKT_HASH_TYPE_L3 for UDP packets, and
> 
> result*s*

Fixed for V2

> 
>> hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
>> the effect that netstack will do a software based hash calc calling into
>> flow_dissect, but only when code calls skb_get_hash(), which doesn't
>> necessary happen for local delivery.
> 
> Excuse my ignorance, but is that bug visible in practice by users 
> (performance?) or is that fix needed for future work?
> 
>> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
>> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> ---
>>   drivers/net/ethernet/intel/igc/igc.h      |   52 
>> +++++++++++++++++++++++++++++
>>   drivers/net/ethernet/intel/igc/igc_main.c |   35 +++++++++++++++++---
>>   2 files changed, 83 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/igc/igc.h 
>> b/drivers/net/ethernet/intel/igc/igc.h
>> index df3e26c0cf01..a112eeb59525 100644
>> --- a/drivers/net/ethernet/intel/igc/igc.h
>> +++ b/drivers/net/ethernet/intel/igc/igc.h
>> @@ -311,6 +311,58 @@ extern char igc_driver_name[];
>>   #define IGC_MRQC_RSS_FIELD_IPV4_UDP    0x00400000
>>   #define IGC_MRQC_RSS_FIELD_IPV6_UDP    0x00800000
>> +/* RX-desc Write-Back format RSS Type's */
>> +enum igc_rss_type_num {
>> +    IGC_RSS_TYPE_NO_HASH        = 0,
>> +    IGC_RSS_TYPE_HASH_TCP_IPV4    = 1,
>> +    IGC_RSS_TYPE_HASH_IPV4        = 2,
>> +    IGC_RSS_TYPE_HASH_TCP_IPV6    = 3,
>> +    IGC_RSS_TYPE_HASH_IPV6_EX    = 4,
>> +    IGC_RSS_TYPE_HASH_IPV6        = 5,
>> +    IGC_RSS_TYPE_HASH_TCP_IPV6_EX    = 6,
>> +    IGC_RSS_TYPE_HASH_UDP_IPV4    = 7,
>> +    IGC_RSS_TYPE_HASH_UDP_IPV6    = 8,
>> +    IGC_RSS_TYPE_HASH_UDP_IPV6_EX    = 9,
>> +    IGC_RSS_TYPE_MAX        = 10,
>> +};
>> +#define IGC_RSS_TYPE_MAX_TABLE        16
>> +#define IGC_RSS_TYPE_MASK        0xF
>> +
>> +/* igc_rss_type - Rx descriptor RSS type field */
>> +static inline u8 igc_rss_type(union igc_adv_rx_desc *rx_desc)
>> +{
>> +    /* RSS Type 4-bit number: 0-9 (above 9 is reserved) */
>> +    return rx_desc->wb.lower.lo_dword.hs_rss.pkt_info & IGC_RSS_TYPE_MASK;
>> +}
> 
> Is it necessary to specficy the length of the return value, or could it 
> be `unsigned int`. Using “native” types is normally more performant [1]. 
> `scripts/bloat-o-meter` might help to verify that.
> 

Thanks for the link[1].
Alex/Olek also pointed this out.

The Agner's instruction latency tables[2] do indicate the latency is
slightly higher for r8 and r16 (and m8/m16).  And we likely need to look 
at the zero-extend variants movzx.

I think we should investigate this with "tool" godbolt.org as
scripts/bloat-o-meter will only tell us about code size.
I will experiment a bit and report back :-)

[2] https://www.agner.org/optimize/instruction_tables.pdf

> […]
> 
>>   static inline void igc_rx_hash(struct igc_ring *ring,
>>                      union igc_adv_rx_desc *rx_desc,
>>                      struct sk_buff *skb)
>>   {
>> -    if (ring->netdev->features & NETIF_F_RXHASH)
>> -        skb_set_hash(skb,
>> -                 le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>> -                 PKT_HASH_TYPE_L3);
>> +    if (ring->netdev->features & NETIF_F_RXHASH) {
>> +        u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
>> +        u8  rss_type = igc_rss_type(rx_desc);
> 
> Amongst others, also here.

Do notice I expect compiler to optimize this, such that is doesn't place 
this variable on the stack.

>> +        enum pkt_hash_types hash_type;
>> +
>> +        hash_type = igc_rss_type_table[rss_type].hash_type;
>> +        skb_set_hash(skb, rss_hash, hash_type);
>> +    }
>>   }
>>   static void igc_rx_vlan(struct igc_ring *rx_ring,
>> @@ -6501,6 +6527,7 @@ static int igc_probe(struct pci_dev *pdev,
>>       netdev->features |= NETIF_F_TSO;
>>       netdev->features |= NETIF_F_TSO6;
>>       netdev->features |= NETIF_F_TSO_ECN;
>> +    netdev->features |= NETIF_F_RXHASH;
>>       netdev->features |= NETIF_F_RXCSUM;
>>       netdev->features |= NETIF_F_HW_CSUM;
>>       netdev->features |= NETIF_F_SCTP_CRC;
> 
> 
> Kind regards,
> 
> Paul
> 
> 
> [1]: https://notabs.org/coding/smallIntsBigPenalty.htm
>
Jesper Dangaard Brouer Feb. 16, 2023, 3:17 p.m. UTC | #6
On 14/02/2023 16.13, Alexander Lobakin wrote:
> From: Paul Menzel <pmenzel@molgen.mpg.de>
> Date: Tue, 14 Feb 2023 16:00:52 +0100
>>
>> Am 10.02.23 um 16:07 schrieb Jesper Dangaard Brouer:
>>> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
>>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>>> hardware wasn't configured to provide RSS hash, thus it made sense to not
>>> enable net_device NETIF_F_RXHASH feature bit.
>>>
[...]
>>
>>> hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
>>> the effect that netstack will do a software based hash calc calling into
>>> flow_dissect, but only when code calls skb_get_hash(), which doesn't
>>> necessary happen for local delivery.
>>
>> Excuse my ignorance, but is that bug visible in practice by users
>> (performance?) or is that fix needed for future work?
> 
> Hash calculation always happens when RPS or RFS is enabled. So having no
> hash in skb before hitting the netstack slows down their performance.
> Also, no hash in skb passed from the driver results in worse NAPI bucket
> distribution when there are more traffic flows than Rx queues / CPUs.
> + Netfilter needs hashes on some configurations.
> 

Thanks Olek for explaining that.

My perf measurements show that the expensive part is that netstack will
call the flow_dissector code, when the hardware RX-hash is missing.

>>
>>> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control
>>> supporting")
>>> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> 
> [...]
> 
> Nice to see that you also care about (not) using short types on the stack :)

As can be seen by godbolt.org exploration[0] I have done, the stack
isn't used for storing the values.

  [0] 
https://github.com/xdp-project/xdp-project/tree/master/areas/hints/godbolt/

I have created three files[2] with C-code that can be compiled via 
https://godbolt.org/.  The C-code contains a comment with the ASM code 
that was generated with -02 with compiler x86-64 gcc 12.2.

The first file[01] corresponds to this patch.

  [01] 
https://github.com/xdp-project/xdp-project/blob/master/areas/hints/godbolt/igc_godbolt01.c
  [G01] https://godbolt.org/z/j79M9aTsn

The second file igc_godbolt02.c [02] have changes in [diff02]

  [02] 
https://github.com/xdp-project/xdp-project/blob/master/areas/hints/godbolt/igc_godbolt02.c
  [G02] https://godbolt.org/z/sErqe4qd5
  [diff02] https://github.com/xdp-project/xdp-project/commit/1f3488a932767

The third file igc_godbolt03.c [03] have changes in [diff03]

  [03] 
https://github.com/xdp-project/xdp-project/blob/master/areas/hints/godbolt/igc_godbolt03.c
  [G03] https://godbolt.org/z/5K3vE1Wsv
  [diff03] https://github.com/xdp-project/xdp-project/commit/aa9298f68705

Summary, the only thing we can save is replacing some movzx
(zero-extend) with mov instructions.

>>
>> [1]: https://notabs.org/coding/smallIntsBigPenalty.htm
> 
> Thanks,
> Olek
>
Alexander Lobakin Feb. 16, 2023, 3:34 p.m. UTC | #7
From: Jesper Dangaard Brouer <jbrouer@redhat.com>
Date: Thu, 16 Feb 2023 14:29:43 +0100

> 
> On 14/02/2023 16.00, Paul Menzel wrote:
>>
>> Thank you very much for your patch.
> 
> Thanks for your review :-)

[...]

>>>   static inline void igc_rx_hash(struct igc_ring *ring,
>>>                      union igc_adv_rx_desc *rx_desc,
>>>                      struct sk_buff *skb)
>>>   {
>>> -    if (ring->netdev->features & NETIF_F_RXHASH)
>>> -        skb_set_hash(skb,
>>> -                 le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>>> -                 PKT_HASH_TYPE_L3);
>>> +    if (ring->netdev->features & NETIF_F_RXHASH) {
>>> +        u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
>>> +        u8  rss_type = igc_rss_type(rx_desc);
>>
>> Amongst others, also here.
> 
> Do notice I expect compiler to optimize this, such that is doesn't place
> this variable on the stack.

That's complicated I'd say. I mean, never trust the compilers.
Yesterday I had a problem when the compiler (Clang 16) wasn't inlining
oneliners (called only once) into one big function and was making
19-byte tiny functions from each of them, resulting in 10+ functions
consisting of one call only => 2x jumps between functions :clownface:
Especially when you use non-native words (u8/u16, also u64 for 32-bit
arches) like this.

> 
>>> +        enum pkt_hash_types hash_type;
>>> +
>>> +        hash_type = igc_rss_type_table[rss_type].hash_type;
>>> +        skb_set_hash(skb, rss_hash, hash_type);
>>> +    }
>>>   }
>>>   static void igc_rx_vlan(struct igc_ring *rx_ring,
>>> @@ -6501,6 +6527,7 @@ static int igc_probe(struct pci_dev *pdev,
>>>       netdev->features |= NETIF_F_TSO;
>>>       netdev->features |= NETIF_F_TSO6;
>>>       netdev->features |= NETIF_F_TSO_ECN;
>>> +    netdev->features |= NETIF_F_RXHASH;
>>>       netdev->features |= NETIF_F_RXCSUM;
>>>       netdev->features |= NETIF_F_HW_CSUM;
>>>       netdev->features |= NETIF_F_SCTP_CRC;
>>
>>
>> Kind regards,
>>
>> Paul
>>
>>
>> [1]: https://notabs.org/coding/smallIntsBigPenalty.htm
>>
> 
Thanks,
Olek
Alexander Lobakin Feb. 16, 2023, 3:43 p.m. UTC | #8
From: Jesper Dangaard Brouer <jbrouer@redhat.com>
Date: Thu, 16 Feb 2023 16:17:46 +0100

> 
> On 14/02/2023 16.13, Alexander Lobakin wrote:
>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>> Date: Tue, 14 Feb 2023 16:00:52 +0100
>>>
>>> Am 10.02.23 um 16:07 schrieb Jesper Dangaard Brouer:
>>>> When function igc_rx_hash() was introduced in v4.20 via commit
>>>> 0507ef8a0372
>>>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>>>> hardware wasn't configured to provide RSS hash, thus it made sense
>>>> to not
>>>> enable net_device NETIF_F_RXHASH feature bit.
>>>>
> [...]
>>>
>>>> hash value doesn't include UDP port numbers. Not being
>>>> PKT_HASH_TYPE_L4, have
>>>> the effect that netstack will do a software based hash calc calling
>>>> into
>>>> flow_dissect, but only when code calls skb_get_hash(), which doesn't
>>>> necessary happen for local delivery.
>>>
>>> Excuse my ignorance, but is that bug visible in practice by users
>>> (performance?) or is that fix needed for future work?
>>
>> Hash calculation always happens when RPS or RFS is enabled. So having no
>> hash in skb before hitting the netstack slows down their performance.
>> Also, no hash in skb passed from the driver results in worse NAPI bucket
>> distribution when there are more traffic flows than Rx queues / CPUs.
>> + Netfilter needs hashes on some configurations.
>>
> 
> Thanks Olek for explaining that.

<O

> 
> My perf measurements show that the expensive part is that netstack will
> call the flow_dissector code, when the hardware RX-hash is missing.

Well, not always, but right, the skb_get_hash() family is used widely
across the netstack, so it's highly recommended to have hardware hash
filled in skbs, same as with checksums, to avoid wasting CPU on
computing them in software.
And the Flow Dissector is expensive by its nature, a bunch faster when
you attach a BPF prog to it, but still (not that I support P4, I don't
at all).

> 
>>>
>>>> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control
>>>> supporting")
>>>> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
>>
>> [...]
>>
>> Nice to see that you also care about (not) using short types on the
>> stack :)
> 
> As can be seen by godbolt.org exploration[0] I have done, the stack
> isn't used for storing the values.
> 
>  [0]
> https://github.com/xdp-project/xdp-project/tree/master/areas/hints/godbolt/
> 
> I have created three files[2] with C-code that can be compiled via
> https://godbolt.org/.  The C-code contains a comment with the ASM code
> that was generated with -02 with compiler x86-64 gcc 12.2.
> 
> The first file[01] corresponds to this patch.
> 
>  [01]
> https://github.com/xdp-project/xdp-project/blob/master/areas/hints/godbolt/igc_godbolt01.c
>  [G01] https://godbolt.org/z/j79M9aTsn
> 
> The second file igc_godbolt02.c [02] have changes in [diff02]
> 
>  [02]
> https://github.com/xdp-project/xdp-project/blob/master/areas/hints/godbolt/igc_godbolt02.c
>  [G02] https://godbolt.org/z/sErqe4qd5
>  [diff02] https://github.com/xdp-project/xdp-project/commit/1f3488a932767
> 
> The third file igc_godbolt03.c [03] have changes in [diff03]
> 
>  [03]
> https://github.com/xdp-project/xdp-project/blob/master/areas/hints/godbolt/igc_godbolt03.c
>  [G03] https://godbolt.org/z/5K3vE1Wsv
>  [diff03] https://github.com/xdp-project/xdp-project/commit/aa9298f68705
> 
> Summary, the only thing we can save is replacing some movzx
> (zero-extend) with mov instructions.

Good stuff, thanks! When I call to not use short types on the stack, the
only thing I care about is the resulting object code, not simple "just
don't use it, I said so". So when a developer inspects the results from
using one or another type, he's free in picking whatever he wants if it
doesn't hurt optimization.

[...]

Olek
Jesper Dangaard Brouer Feb. 16, 2023, 4:46 p.m. UTC | #9
On 14/02/2023 14.21, Alexander Lobakin wrote:
> From: Jesper Dangaard Brouer <brouer@redhat.com>
> Date: Fri, 10 Feb 2023 16:07:59 +0100
> 
>> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>> hardware wasn't configured to provide RSS hash, thus it made sense to not
>> enable net_device NETIF_F_RXHASH feature bit.
> 
> [...]
> 
>> @@ -311,6 +311,58 @@ extern char igc_driver_name[];
>>   #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
>>   #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
>>   
>> +/* RX-desc Write-Back format RSS Type's */
>> +enum igc_rss_type_num {
>> +	IGC_RSS_TYPE_NO_HASH		= 0,
>> +	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
>> +	IGC_RSS_TYPE_HASH_IPV4		= 2,
>> +	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
>> +	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
>> +	IGC_RSS_TYPE_HASH_IPV6		= 5,
>> +	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
>> +	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
>> +	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
>> +	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
>> +	IGC_RSS_TYPE_MAX		= 10,
>> +};
>> +#define IGC_RSS_TYPE_MAX_TABLE		16
>> +#define IGC_RSS_TYPE_MASK		0xF
> 
> GENMASK()?
> 

hmm... GENMASK(3,0) looks more confusing to me. The mask we need here is
so simple that I prefer not to complicate this with GENMASK.

>> +
>> +/* igc_rss_type - Rx descriptor RSS type field */
>> +static inline u8 igc_rss_type(union igc_adv_rx_desc *rx_desc)
> 
> Why use types shorter than u32 on the stack?

Changing to u32 in V2

> Why this union is not const here, since there are no modifications?

Sure

>> +{
>> +	/* RSS Type 4-bit number: 0-9 (above 9 is reserved) */
>> +	return rx_desc->wb.lower.lo_dword.hs_rss.pkt_info & IGC_RSS_TYPE_MASK;
> 
> The most important I wanted to mention: doesn't this function make the
> CPU read the uncached field again, while you could just read it once
> onto the stack and then extract all such data from there?

I really don't think this is an issues here. The igc_adv_rx_desc is only
16 bytes and it should be hot in CPU cache by now.

To avoid the movzx I have changed this to do a u32 read instead.

>> +}
>> +
>> +/* Packet header type identified by hardware (when BIT(11) is zero).
>> + * Even when UDP ports are not part of RSS hash HW still parse and mark UDP bits
>> + */
>> +enum igc_pkt_type_bits {
>> +	IGC_PKT_TYPE_HDR_IPV4	=	BIT(0),
>> +	IGC_PKT_TYPE_HDR_IPV4_WITH_OPT=	BIT(1), /* IPv4 Hdr includes IP options */
>> +	IGC_PKT_TYPE_HDR_IPV6	=	BIT(2),
>> +	IGC_PKT_TYPE_HDR_IPV6_WITH_EXT=	BIT(3), /* IPv6 Hdr includes extensions */
>> +	IGC_PKT_TYPE_HDR_L4_TCP	=	BIT(4),
>> +	IGC_PKT_TYPE_HDR_L4_UDP	=	BIT(5),
>> +	IGC_PKT_TYPE_HDR_L4_SCTP=	BIT(6),
>> +	IGC_PKT_TYPE_HDR_NFS	=	BIT(7),
>> +	/* Above only valid when BIT(11) is zero */
>> +	IGC_PKT_TYPE_L2		=	BIT(11),
>> +	IGC_PKT_TYPE_VLAN	=	BIT(12),
>> +	IGC_PKT_TYPE_MASK	=	0x1FFF, /* 13-bits */
> 
> Also GENMASK().

GENMASK would make more sense here.

>> +};
>> +
>> +/* igc_pkt_type - Rx descriptor Packet type field */
>> +static inline u16 igc_pkt_type(union igc_adv_rx_desc *rx_desc)
> 
> Also short types and consts.
> 

Fixed in V2

>> +{
>> +	u32 data = le32_to_cpu(rx_desc->wb.lower.lo_dword.data);
>> +	/* Packet type is 13-bits - as bits (16:4) in lower.lo_dword*/
>> +	u16 pkt_type = (data >> 4) & IGC_PKT_TYPE_MASK;
> 
> Perfect candidate for FIELD_GET(). No, even for le32_get_bits().

I adjusted this, but I could not find a central define for FIELD_GET 
(but many drivers open code this).

> Also my note above about excessive expensive reads.
> 
>> +
>> +	return pkt_type;
>> +}
>> +
>>   /* Interrupt defines */
>>   #define IGC_START_ITR			648 /* ~6000 ints/sec */
>>   #define IGC_4K_ITR			980
>> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
>> index 8b572cd2c350..42a072509d2a 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_main.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
>> @@ -1677,14 +1677,40 @@ static void igc_rx_checksum(struct igc_ring *ring,
>>   		   le32_to_cpu(rx_desc->wb.upper.status_error));
>>   }
>>   
>> +/* Mapping HW RSS Type to enum pkt_hash_types */
>> +struct igc_rss_type {
>> +	u8 hash_type; /* can contain enum pkt_hash_types */
> 
> Why make a struct for one field? + short type note
> 
>> +} igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
>> +	[IGC_RSS_TYPE_NO_HASH].hash_type	  = PKT_HASH_TYPE_L2,
>> +	[IGC_RSS_TYPE_HASH_TCP_IPV4].hash_type	  = PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_IPV4].hash_type	  = PKT_HASH_TYPE_L3,
>> +	[IGC_RSS_TYPE_HASH_TCP_IPV6].hash_type	  = PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_IPV6_EX].hash_type	  = PKT_HASH_TYPE_L3,
>> +	[IGC_RSS_TYPE_HASH_IPV6].hash_type	  = PKT_HASH_TYPE_L3,
>> +	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_UDP_IPV4].hash_type	  = PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_UDP_IPV6].hash_type	  = PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
>> +	[10].hash_type = PKT_HASH_TYPE_L2, /* RSS Type above 9 "Reserved" by HW */
>> +	[11].hash_type = PKT_HASH_TYPE_L2,
>> +	[12].hash_type = PKT_HASH_TYPE_L2,
>> +	[13].hash_type = PKT_HASH_TYPE_L2,
>> +	[14].hash_type = PKT_HASH_TYPE_L2,
>> +	[15].hash_type = PKT_HASH_TYPE_L2,
> 
> Why define those empty if you could do a bound check in the code
> instead? E.g. `if (unlikely(bigger_than_9)) return PKT_HASH_TYPE_L2`.

Having a branch for this is likely slower.  On godbolt I see that this 
generates suboptimal and larger code.


>> +};
>> +
>>   static inline void igc_rx_hash(struct igc_ring *ring,
>>   			       union igc_adv_rx_desc *rx_desc,
>>   			       struct sk_buff *skb)
>>   {
>> -	if (ring->netdev->features & NETIF_F_RXHASH)
>> -		skb_set_hash(skb,
>> -			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>> -			     PKT_HASH_TYPE_L3);
>> +	if (ring->netdev->features & NETIF_F_RXHASH) {
> 
> 	if (!(feature & HASH))
> 		return;
> 
> and -1 indent level?

Usually, yes, I also prefer early return style code.
For one I just followed the existing style.

Second, I tried to code it up, but it looks ugly in this case, as the
variable defines need to get moved outside the if statement.

>> +		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
>> +		u8  rss_type = igc_rss_type(rx_desc);
>> +		enum pkt_hash_types hash_type;
>> +
>> +		hash_type = igc_rss_type_table[rss_type].hash_type;
>> +		skb_set_hash(skb, rss_hash, hash_type);
>> +	}
>>   }
> 
> [...]
> 
> Thanks,
> Olek
>
Alexander Lobakin Feb. 20, 2023, 3:39 p.m. UTC | #10
From: Jesper Dangaard Brouer <jbrouer@redhat.com>
Date: Thu, 16 Feb 2023 17:46:53 +0100

> 
> On 14/02/2023 14.21, Alexander Lobakin wrote:
>> From: Jesper Dangaard Brouer <brouer@redhat.com>
>> Date: Fri, 10 Feb 2023 16:07:59 +0100
>>
>>> When function igc_rx_hash() was introduced in v4.20 via commit
>>> 0507ef8a0372
>>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>>> hardware wasn't configured to provide RSS hash, thus it made sense to
>>> not
>>> enable net_device NETIF_F_RXHASH feature bit.
>>
>> [...]
>>
>>> @@ -311,6 +311,58 @@ extern char igc_driver_name[];
>>>   #define IGC_MRQC_RSS_FIELD_IPV4_UDP    0x00400000
>>>   #define IGC_MRQC_RSS_FIELD_IPV6_UDP    0x00800000
>>>   +/* RX-desc Write-Back format RSS Type's */
>>> +enum igc_rss_type_num {
>>> +    IGC_RSS_TYPE_NO_HASH        = 0,
>>> +    IGC_RSS_TYPE_HASH_TCP_IPV4    = 1,
>>> +    IGC_RSS_TYPE_HASH_IPV4        = 2,
>>> +    IGC_RSS_TYPE_HASH_TCP_IPV6    = 3,
>>> +    IGC_RSS_TYPE_HASH_IPV6_EX    = 4,
>>> +    IGC_RSS_TYPE_HASH_IPV6        = 5,
>>> +    IGC_RSS_TYPE_HASH_TCP_IPV6_EX    = 6,
>>> +    IGC_RSS_TYPE_HASH_UDP_IPV4    = 7,
>>> +    IGC_RSS_TYPE_HASH_UDP_IPV6    = 8,
>>> +    IGC_RSS_TYPE_HASH_UDP_IPV6_EX    = 9,
>>> +    IGC_RSS_TYPE_MAX        = 10,
>>> +};
>>> +#define IGC_RSS_TYPE_MAX_TABLE        16
>>> +#define IGC_RSS_TYPE_MASK        0xF
>>
>> GENMASK()?
>>
> 
> hmm... GENMASK(3,0) looks more confusing to me. The mask we need here is
> so simple that I prefer not to complicate this with GENMASK.
> 
>>> +
>>> +/* igc_rss_type - Rx descriptor RSS type field */
>>> +static inline u8 igc_rss_type(union igc_adv_rx_desc *rx_desc)
>>
>> Why use types shorter than u32 on the stack?
> 
> Changing to u32 in V2
> 
>> Why this union is not const here, since there are no modifications?
> 
> Sure
> 
>>> +{
>>> +    /* RSS Type 4-bit number: 0-9 (above 9 is reserved) */
>>> +    return rx_desc->wb.lower.lo_dword.hs_rss.pkt_info &
>>> IGC_RSS_TYPE_MASK;
>>
>> The most important I wanted to mention: doesn't this function make the
>> CPU read the uncached field again, while you could just read it once
>> onto the stack and then extract all such data from there?
> 
> I really don't think this is an issues here. The igc_adv_rx_desc is only
> 16 bytes and it should be hot in CPU cache by now.

Rx descriptors are located in the DMA coherent zone (allocated via
dma_alloc_coherent()), I am missing something? Because I was (I am) sure
CPU doesn't cache anything from it (and doesn't reorder reads/writes
from/to). I thought that's the point of coherent zones -- you may talk
to hardware without needing for syncing...

> 
> To avoid the movzx I have changed this to do a u32 read instead.
> 
>>> +}
>>> +
>>> +/* Packet header type identified by hardware (when BIT(11) is zero).
>>> + * Even when UDP ports are not part of RSS hash HW still parse and
>>> mark UDP bits
>>> + */
>>> +enum igc_pkt_type_bits {
>>> +    IGC_PKT_TYPE_HDR_IPV4    =    BIT(0),
>>> +    IGC_PKT_TYPE_HDR_IPV4_WITH_OPT=    BIT(1), /* IPv4 Hdr includes
>>> IP options */
>>> +    IGC_PKT_TYPE_HDR_IPV6    =    BIT(2),
>>> +    IGC_PKT_TYPE_HDR_IPV6_WITH_EXT=    BIT(3), /* IPv6 Hdr includes
>>> extensions */
>>> +    IGC_PKT_TYPE_HDR_L4_TCP    =    BIT(4),
>>> +    IGC_PKT_TYPE_HDR_L4_UDP    =    BIT(5),
>>> +    IGC_PKT_TYPE_HDR_L4_SCTP=    BIT(6),
>>> +    IGC_PKT_TYPE_HDR_NFS    =    BIT(7),
>>> +    /* Above only valid when BIT(11) is zero */
>>> +    IGC_PKT_TYPE_L2        =    BIT(11),
>>> +    IGC_PKT_TYPE_VLAN    =    BIT(12),
>>> +    IGC_PKT_TYPE_MASK    =    0x1FFF, /* 13-bits */
>>
>> Also GENMASK().
> 
> GENMASK would make more sense here.
> 
>>> +};
>>> +
>>> +/* igc_pkt_type - Rx descriptor Packet type field */
>>> +static inline u16 igc_pkt_type(union igc_adv_rx_desc *rx_desc)
>>
>> Also short types and consts.
>>
> 
> Fixed in V2
> 
>>> +{
>>> +    u32 data = le32_to_cpu(rx_desc->wb.lower.lo_dword.data);
>>> +    /* Packet type is 13-bits - as bits (16:4) in lower.lo_dword*/
>>> +    u16 pkt_type = (data >> 4) & IGC_PKT_TYPE_MASK;
>>
>> Perfect candidate for FIELD_GET(). No, even for le32_get_bits().
> 
> I adjusted this, but I could not find a central define for FIELD_GET
> (but many drivers open code this).

<linux/bitfield.h>. It has FIELD_{GET,PREP}() and also builds
{u,__le,__be}{8,16,32}_{encode,get,replace}_bits() via macro (the latter
doesn't get indexed by Elixir, as it doesn't parse functions built via
macros).

> 
>> Also my note above about excessive expensive reads.
>>
>>> +
>>> +    return pkt_type;
>>> +}
>>> +
>>>   /* Interrupt defines */
>>>   #define IGC_START_ITR            648 /* ~6000 ints/sec */
>>>   #define IGC_4K_ITR            980
>>> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c
>>> b/drivers/net/ethernet/intel/igc/igc_main.c
>>> index 8b572cd2c350..42a072509d2a 100644
>>> --- a/drivers/net/ethernet/intel/igc/igc_main.c
>>> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
>>> @@ -1677,14 +1677,40 @@ static void igc_rx_checksum(struct igc_ring
>>> *ring,
>>>              le32_to_cpu(rx_desc->wb.upper.status_error));
>>>   }
>>>   +/* Mapping HW RSS Type to enum pkt_hash_types */
>>> +struct igc_rss_type {
>>> +    u8 hash_type; /* can contain enum pkt_hash_types */
>>
>> Why make a struct for one field? + short type note
>>
>>> +} igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
>>> +    [IGC_RSS_TYPE_NO_HASH].hash_type      = PKT_HASH_TYPE_L2,
>>> +    [IGC_RSS_TYPE_HASH_TCP_IPV4].hash_type      = PKT_HASH_TYPE_L4,
>>> +    [IGC_RSS_TYPE_HASH_IPV4].hash_type      = PKT_HASH_TYPE_L3,
>>> +    [IGC_RSS_TYPE_HASH_TCP_IPV6].hash_type      = PKT_HASH_TYPE_L4,
>>> +    [IGC_RSS_TYPE_HASH_IPV6_EX].hash_type      = PKT_HASH_TYPE_L3,
>>> +    [IGC_RSS_TYPE_HASH_IPV6].hash_type      = PKT_HASH_TYPE_L3,
>>> +    [IGC_RSS_TYPE_HASH_TCP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
>>> +    [IGC_RSS_TYPE_HASH_UDP_IPV4].hash_type      = PKT_HASH_TYPE_L4,
>>> +    [IGC_RSS_TYPE_HASH_UDP_IPV6].hash_type      = PKT_HASH_TYPE_L4,
>>> +    [IGC_RSS_TYPE_HASH_UDP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
>>> +    [10].hash_type = PKT_HASH_TYPE_L2, /* RSS Type above 9
>>> "Reserved" by HW */
>>> +    [11].hash_type = PKT_HASH_TYPE_L2,
>>> +    [12].hash_type = PKT_HASH_TYPE_L2,
>>> +    [13].hash_type = PKT_HASH_TYPE_L2,
>>> +    [14].hash_type = PKT_HASH_TYPE_L2,
>>> +    [15].hash_type = PKT_HASH_TYPE_L2,
>>
>> Why define those empty if you could do a bound check in the code
>> instead? E.g. `if (unlikely(bigger_than_9)) return PKT_HASH_TYPE_L2`.
> 
> Having a branch for this is likely slower.  On godbolt I see that this
> generates suboptimal and larger code.

But you have to verify HW output anyway, right? Or would like to rely on
that on some weird revision it won't spit BIT(69) on you?

> 
> 
>>> +};
>>> +
>>>   static inline void igc_rx_hash(struct igc_ring *ring,
>>>                      union igc_adv_rx_desc *rx_desc,
>>>                      struct sk_buff *skb)
>>>   {
>>> -    if (ring->netdev->features & NETIF_F_RXHASH)
>>> -        skb_set_hash(skb,
>>> -                 le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>>> -                 PKT_HASH_TYPE_L3);
>>> +    if (ring->netdev->features & NETIF_F_RXHASH) {
>>
>>     if (!(feature & HASH))
>>         return;
>>
>> and -1 indent level?
> 
> Usually, yes, I also prefer early return style code.
> For one I just followed the existing style.

I'd not recommend "keep the existing style" of Intel drivers -- not
something I'd like to keep as is :D

> 
> Second, I tried to code it up, but it looks ugly in this case, as the
> variable defines need to get moved outside the if statement.
> 
>>> +        u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
>>> +        u8  rss_type = igc_rss_type(rx_desc);
>>> +        enum pkt_hash_types hash_type;
>>> +
>>> +        hash_type = igc_rss_type_table[rss_type].hash_type;
>>> +        skb_set_hash(skb, rss_hash, hash_type);
>>> +    }
>>>   }
>>
>> [...]
>>
>> Thanks,
>> Olek
>>
> 

Thanks,
Olek
Jesper Dangaard Brouer Feb. 22, 2023, 3 p.m. UTC | #11
On 20/02/2023 16.39, Alexander Lobakin wrote:
> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Date: Thu, 16 Feb 2023 17:46:53 +0100
> 
>>
>> On 14/02/2023 14.21, Alexander Lobakin wrote:
>>> From: Jesper Dangaard Brouer <brouer@redhat.com>
>>> Date: Fri, 10 Feb 2023 16:07:59 +0100
>>>
>>>> When function igc_rx_hash() was introduced in v4.20 via commit
>>>> 0507ef8a0372
>>>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>>>> hardware wasn't configured to provide RSS hash, thus it made sense to
>>>> not
>>>> enable net_device NETIF_F_RXHASH feature bit.
>>>
>>> [...]
>>>
>>>> @@ -311,6 +311,58 @@ extern char igc_driver_name[];
>>>>    #define IGC_MRQC_RSS_FIELD_IPV4_UDP    0x00400000
>>>>    #define IGC_MRQC_RSS_FIELD_IPV6_UDP    0x00800000
>>>>    +/* RX-desc Write-Back format RSS Type's */
>>>> +enum igc_rss_type_num {
>>>> +    IGC_RSS_TYPE_NO_HASH        = 0,
>>>> +    IGC_RSS_TYPE_HASH_TCP_IPV4    = 1,
>>>> +    IGC_RSS_TYPE_HASH_IPV4        = 2,
>>>> +    IGC_RSS_TYPE_HASH_TCP_IPV6    = 3,
>>>> +    IGC_RSS_TYPE_HASH_IPV6_EX    = 4,
>>>> +    IGC_RSS_TYPE_HASH_IPV6        = 5,
>>>> +    IGC_RSS_TYPE_HASH_TCP_IPV6_EX    = 6,
>>>> +    IGC_RSS_TYPE_HASH_UDP_IPV4    = 7,
>>>> +    IGC_RSS_TYPE_HASH_UDP_IPV6    = 8,
>>>> +    IGC_RSS_TYPE_HASH_UDP_IPV6_EX    = 9,
>>>> +    IGC_RSS_TYPE_MAX        = 10,
>>>> +};
>>>> +#define IGC_RSS_TYPE_MAX_TABLE        16
>>>> +#define IGC_RSS_TYPE_MASK        0xF
>>>
>>> GENMASK()?
>>>
>>
>> hmm... GENMASK(3,0) looks more confusing to me. The mask we need here is
>> so simple that I prefer not to complicate this with GENMASK.
>>

Changed my mind, I'm going with GENMASK(3,0) in V3.

>>>> +
>>>> +/* igc_rss_type - Rx descriptor RSS type field */
>>>> +static inline u8 igc_rss_type(union igc_adv_rx_desc *rx_desc)
>>>
>>> Why use types shorter than u32 on the stack?
>>
>> Changing to u32 in V2
>>
>>> Why this union is not const here, since there are no modifications?
>>
>> Sure
>>
>>>> +{
>>>> +    /* RSS Type 4-bit number: 0-9 (above 9 is reserved) */
>>>> +    return rx_desc->wb.lower.lo_dword.hs_rss.pkt_info &
>>>> IGC_RSS_TYPE_MASK;
>>>
>>> The most important I wanted to mention: doesn't this function make the
>>> CPU read the uncached field again, while you could just read it once
>>> onto the stack and then extract all such data from there?
>>
>> I really don't think this is an issues here. The igc_adv_rx_desc is only
>> 16 bytes and it should be hot in CPU cache by now.
> 
> Rx descriptors are located in the DMA coherent zone (allocated via
> dma_alloc_coherent()), I am missing something? Because I was (I am) sure
> CPU doesn't cache anything from it (and doesn't reorder reads/writes
> from/to). I thought that's the point of coherent zones -- you may talk
> to hardware without needing for syncing...
> 

That is a good point and you are (likely) right.

I do want to remind you that this is a "fixes" patch that dates back to
v5.2.  This driver is from the very beginning coded to access descriptor
this way via union igc_adv_rx_desc.  For a fixes patch, I'm not going to
code up a new and more effecient way of accessing the descriptor memory.

If you truely believe this matters for a 2.5 Gbit/s device, then someone 
(e.g you) can go through this driver and change this pattern in the code.


>>
>> To avoid the movzx I have changed this to do a u32 read instead.
>>
>>>> +}
>>>> +
>>>> +/* Packet header type identified by hardware (when BIT(11) is zero).
>>>> + * Even when UDP ports are not part of RSS hash HW still parse and
>>>> mark UDP bits
>>>> + */
>>>> +enum igc_pkt_type_bits {
>>>> +    IGC_PKT_TYPE_HDR_IPV4    =    BIT(0),
>>>> +    IGC_PKT_TYPE_HDR_IPV4_WITH_OPT=    BIT(1), /* IPv4 Hdr includes
>>>> IP options */
>>>> +    IGC_PKT_TYPE_HDR_IPV6    =    BIT(2),
>>>> +    IGC_PKT_TYPE_HDR_IPV6_WITH_EXT=    BIT(3), /* IPv6 Hdr includes
>>>> extensions */
>>>> +    IGC_PKT_TYPE_HDR_L4_TCP    =    BIT(4),
>>>> +    IGC_PKT_TYPE_HDR_L4_UDP    =    BIT(5),
>>>> +    IGC_PKT_TYPE_HDR_L4_SCTP=    BIT(6),
>>>> +    IGC_PKT_TYPE_HDR_NFS    =    BIT(7),
>>>> +    /* Above only valid when BIT(11) is zero */
>>>> +    IGC_PKT_TYPE_L2        =    BIT(11),
>>>> +    IGC_PKT_TYPE_VLAN    =    BIT(12),
>>>> +    IGC_PKT_TYPE_MASK    =    0x1FFF, /* 13-bits */
>>>
>>> Also GENMASK().
>>
>> GENMASK would make more sense here.
>>
>>>> +};
>>>> +
>>>> +/* igc_pkt_type - Rx descriptor Packet type field */
>>>> +static inline u16 igc_pkt_type(union igc_adv_rx_desc *rx_desc)
>>>
>>> Also short types and consts.
>>>
>>
>> Fixed in V2
>>
>>>> +{
>>>> +    u32 data = le32_to_cpu(rx_desc->wb.lower.lo_dword.data);
>>>> +    /* Packet type is 13-bits - as bits (16:4) in lower.lo_dword*/
>>>> +    u16 pkt_type = (data >> 4) & IGC_PKT_TYPE_MASK;
>>>
>>> Perfect candidate for FIELD_GET(). No, even for le32_get_bits().
>>

Dropping all the code and defines for "igc_pkt_type", as the code ended
up not being used.  I simply kept this around to document what was in
the programmers datasheet (to help others understand the hardware).


>> I adjusted this, but I could not find a central define for FIELD_GET
>> (but many drivers open code this).
> 
> <linux/bitfield.h>. It has FIELD_{GET,PREP}() and also builds
> {u,__le,__be}{8,16,32}_{encode,get,replace}_bits() via macro (the latter
> doesn't get indexed by Elixir, as it doesn't parse functions built via
> macros).

Thanks for the pointer to <linux/bitfield.h>, I'll be using that in V3.

>>
>>> Also my note above about excessive expensive reads.
>>>
>>>> +
>>>> +    return pkt_type;
>>>> +}
>>>> +
>>>>    /* Interrupt defines */
>>>>    #define IGC_START_ITR            648 /* ~6000 ints/sec */
>>>>    #define IGC_4K_ITR            980
>>>> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c
>>>> b/drivers/net/ethernet/intel/igc/igc_main.c
>>>> index 8b572cd2c350..42a072509d2a 100644
>>>> --- a/drivers/net/ethernet/intel/igc/igc_main.c
>>>> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
>>>> @@ -1677,14 +1677,40 @@ static void igc_rx_checksum(struct igc_ring
>>>> *ring,
>>>>               le32_to_cpu(rx_desc->wb.upper.status_error));
>>>>    }
>>>>    +/* Mapping HW RSS Type to enum pkt_hash_types */
>>>> +struct igc_rss_type {
>>>> +    u8 hash_type; /* can contain enum pkt_hash_types */
>>>
>>> Why make a struct for one field? + short type note

Also fixing this in V3.

>>>> +} igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
>>>> +    [IGC_RSS_TYPE_NO_HASH].hash_type      = PKT_HASH_TYPE_L2,
>>>> +    [IGC_RSS_TYPE_HASH_TCP_IPV4].hash_type      = PKT_HASH_TYPE_L4,
>>>> +    [IGC_RSS_TYPE_HASH_IPV4].hash_type      = PKT_HASH_TYPE_L3,
>>>> +    [IGC_RSS_TYPE_HASH_TCP_IPV6].hash_type      = PKT_HASH_TYPE_L4,
>>>> +    [IGC_RSS_TYPE_HASH_IPV6_EX].hash_type      = PKT_HASH_TYPE_L3,
>>>> +    [IGC_RSS_TYPE_HASH_IPV6].hash_type      = PKT_HASH_TYPE_L3,
>>>> +    [IGC_RSS_TYPE_HASH_TCP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
>>>> +    [IGC_RSS_TYPE_HASH_UDP_IPV4].hash_type      = PKT_HASH_TYPE_L4,
>>>> +    [IGC_RSS_TYPE_HASH_UDP_IPV6].hash_type      = PKT_HASH_TYPE_L4,
>>>> +    [IGC_RSS_TYPE_HASH_UDP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
>>>> +    [10].hash_type = PKT_HASH_TYPE_L2, /* RSS Type above 9
>>>> "Reserved" by HW */
>>>> +    [11].hash_type = PKT_HASH_TYPE_L2,
>>>> +    [12].hash_type = PKT_HASH_TYPE_L2,
>>>> +    [13].hash_type = PKT_HASH_TYPE_L2,
>>>> +    [14].hash_type = PKT_HASH_TYPE_L2,
>>>> +    [15].hash_type = PKT_HASH_TYPE_L2,

Changing these 10-15 to PKT_HASH_TYPE_NONE, which is zero.
The ASM generated table is smaller code size with zero padded content.

>>>
>>> Why define those empty if you could do a bound check in the code
>>> instead? E.g. `if (unlikely(bigger_than_9)) return PKT_HASH_TYPE_L2`.
>>
>> Having a branch for this is likely slower.  On godbolt I see that this
>> generates suboptimal and larger code.
> 
> But you have to verify HW output anyway, right? Or would like to rely on
> that on some weird revision it won't spit BIT(69) on you?
> 

The table is constructed such that the lookup takes care of "verifying"
the HW output.  Notice that software will bit mask the last 4 bits, thus
the number will max be 15.  No matter what hardware outputs it is safe
to do a lookup in the table.  IMHO it is a simple way to avoid an
unnecessary verification branch and still be able to handle buggy/weird
HW revs.


>>>> +};
>>>> +
>>>>    static inline void igc_rx_hash(struct igc_ring *ring,
>>>>                       union igc_adv_rx_desc *rx_desc,
>>>>                       struct sk_buff *skb)
>>>>    {
>>>> -    if (ring->netdev->features & NETIF_F_RXHASH)
>>>> -        skb_set_hash(skb,
>>>> -                 le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>>>> -                 PKT_HASH_TYPE_L3);
>>>> +    if (ring->netdev->features & NETIF_F_RXHASH) {
>>>
>>>      if (!(feature & HASH))
>>>          return;
>>>
>>> and -1 indent level?
>>
>> Usually, yes, I also prefer early return style code.
>> For one I just followed the existing style.
> 
> I'd not recommend "keep the existing style" of Intel drivers -- not
> something I'd like to keep as is :D
> 
>>
>> Second, I tried to code it up, but it looks ugly in this case, as the
>> variable defines need to get moved outside the if statement.
>>
>>>> +        u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
>>>> +        u8  rss_type = igc_rss_type(rx_desc);
>>>> +        enum pkt_hash_types hash_type;
>>>> +
>>>> +        hash_type = igc_rss_type_table[rss_type].hash_type;
>>>> +        skb_set_hash(skb, rss_hash, hash_type);
>>>> +    }
>>>>    }
>>>
>>> [...]

--Jesper
Alexander Lobakin Feb. 24, 2023, 4:41 p.m. UTC | #12
From: Jesper Dangaard Brouer <jbrouer@redhat.com>
Date: Wed, 22 Feb 2023 16:00:30 +0100

> 
> On 20/02/2023 16.39, Alexander Lobakin wrote:
>> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
>> Date: Thu, 16 Feb 2023 17:46:53 +0100

[...]

>> Rx descriptors are located in the DMA coherent zone (allocated via
>> dma_alloc_coherent()), I am missing something? Because I was (I am) sure
>> CPU doesn't cache anything from it (and doesn't reorder reads/writes
>> from/to). I thought that's the point of coherent zones -- you may talk
>> to hardware without needing for syncing...
>>
> 
> That is a good point and you are (likely) right.
> 
> I do want to remind you that this is a "fixes" patch that dates back to
> v5.2.  This driver is from the very beginning coded to access descriptor
> this way via union igc_adv_rx_desc.  For a fixes patch, I'm not going to
> code up a new and more effecient way of accessing the descriptor memory.

Sure, not for fixes definitely. +

> 
> If you truely believe this matters for a 2.5 Gbit/s device, then someone
> (e.g you) can go through this driver and change this pattern in the code.

[...]

>>>>> +    [10].hash_type = PKT_HASH_TYPE_L2, /* RSS Type above 9
>>>>> "Reserved" by HW */
>>>>> +    [11].hash_type = PKT_HASH_TYPE_L2,
>>>>> +    [12].hash_type = PKT_HASH_TYPE_L2,
>>>>> +    [13].hash_type = PKT_HASH_TYPE_L2,
>>>>> +    [14].hash_type = PKT_HASH_TYPE_L2,
>>>>> +    [15].hash_type = PKT_HASH_TYPE_L2,
> 
> Changing these 10-15 to PKT_HASH_TYPE_NONE, which is zero.
> The ASM generated table is smaller code size with zero padded content.

Yeah, and _L2 is applicable only when there's actual hash (but it's
hashed by MAC addresses, for example). Sorry I didn't notice this :s

> 
>>>>
>>>> Why define those empty if you could do a bound check in the code
>>>> instead? E.g. `if (unlikely(bigger_than_9)) return PKT_HASH_TYPE_L2`.
>>>
>>> Having a branch for this is likely slower.  On godbolt I see that this
>>> generates suboptimal and larger code.
>>
>> But you have to verify HW output anyway, right? Or would like to rely on
>> that on some weird revision it won't spit BIT(69) on you?
>>
> 
> The table is constructed such that the lookup takes care of "verifying"
> the HW output.  Notice that software will bit mask the last 4 bits, thus
> the number will max be 15.  No matter what hardware outputs it is safe
> to do a lookup in the table.  IMHO it is a simple way to avoid an
> unnecessary verification branch and still be able to handle buggy/weird
> HW revs.

Ah, didn't notice the field is of 4 bits. Ack then.

[...]

Thanks,
Olek
Alexander Lobakin Feb. 27, 2023, 2:24 p.m. UTC | #13
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Fri, 24 Feb 2023 17:41:58 +0100

> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Date: Wed, 22 Feb 2023 16:00:30 +0100

[...]

>>>>> Why define those empty if you could do a bound check in the code
>>>>> instead? E.g. `if (unlikely(bigger_than_9)) return PKT_HASH_TYPE_L2`.
>>>>
>>>> Having a branch for this is likely slower.  On godbolt I see that this
>>>> generates suboptimal and larger code.

BTW, it's funny that when I proposed an optimization, you said "it makes
no sense on 2.5G NICs", but when you omit bounds checking and just
extend the array with zero fields, it suddenly starts making sense to
save a couple instructions :D

(just an observation)

>>>
>>> But you have to verify HW output anyway, right? Or would like to rely on
>>> that on some weird revision it won't spit BIT(69) on you?
[...]

Thanks,
Olek
Alexander Lobakin Feb. 27, 2023, 2:53 p.m. UTC | #14
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 16 Feb 2023 16:43:04 +0100

> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Date: Thu, 16 Feb 2023 16:17:46 +0100

[...]

>> Summary, the only thing we can save is replacing some movzx
>> (zero-extend) with mov instructions.
> 
> Good stuff, thanks! When I call to not use short types on the stack, the
> only thing I care about is the resulting object code, not simple "just
> don't use it, I said so". So when a developer inspects the results from
> using one or another type, he's free in picking whatever he wants if it
> doesn't hurt optimization.

Oh, forgot to mention: I just don't want to give people "bad" example. I
mean, someone may look at some code where u8-u16s are actively used and
start thinking that it's a regular/standard practice, while it's not.
I've seen a lot of places in the kernel where the short types were used
only because "the queue index can't be bigger than 255", "we won't
handle more than 64Kb of data in one loop anyway" and then the stack is
full of u8s and u16s and the object code (I'm talking more about
hotpaths, but generally is applicable to any code) is full of masking
and other completely avoidable operations.

> 
> [...]
Thanks,
Olek
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index df3e26c0cf01..a112eeb59525 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -311,6 +311,58 @@  extern char igc_driver_name[];
 #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
 #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
 
+/* RX-desc Write-Back format RSS Type's */
+enum igc_rss_type_num {
+	IGC_RSS_TYPE_NO_HASH		= 0,
+	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
+	IGC_RSS_TYPE_HASH_IPV4		= 2,
+	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
+	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
+	IGC_RSS_TYPE_HASH_IPV6		= 5,
+	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
+	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
+	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
+	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
+	IGC_RSS_TYPE_MAX		= 10,
+};
+#define IGC_RSS_TYPE_MAX_TABLE		16
+#define IGC_RSS_TYPE_MASK		0xF
+
+/* igc_rss_type - Rx descriptor RSS type field */
+static inline u8 igc_rss_type(union igc_adv_rx_desc *rx_desc)
+{
+	/* RSS Type 4-bit number: 0-9 (above 9 is reserved) */
+	return rx_desc->wb.lower.lo_dword.hs_rss.pkt_info & IGC_RSS_TYPE_MASK;
+}
+
+/* Packet header type identified by hardware (when BIT(11) is zero).
+ * Even when UDP ports are not part of RSS hash HW still parse and mark UDP bits
+ */
+enum igc_pkt_type_bits {
+	IGC_PKT_TYPE_HDR_IPV4	=	BIT(0),
+	IGC_PKT_TYPE_HDR_IPV4_WITH_OPT=	BIT(1), /* IPv4 Hdr includes IP options */
+	IGC_PKT_TYPE_HDR_IPV6	=	BIT(2),
+	IGC_PKT_TYPE_HDR_IPV6_WITH_EXT=	BIT(3), /* IPv6 Hdr includes extensions */
+	IGC_PKT_TYPE_HDR_L4_TCP	=	BIT(4),
+	IGC_PKT_TYPE_HDR_L4_UDP	=	BIT(5),
+	IGC_PKT_TYPE_HDR_L4_SCTP=	BIT(6),
+	IGC_PKT_TYPE_HDR_NFS	=	BIT(7),
+	/* Above only valid when BIT(11) is zero */
+	IGC_PKT_TYPE_L2		=	BIT(11),
+	IGC_PKT_TYPE_VLAN	=	BIT(12),
+	IGC_PKT_TYPE_MASK	=	0x1FFF, /* 13-bits */
+};
+
+/* igc_pkt_type - Rx descriptor Packet type field */
+static inline u16 igc_pkt_type(union igc_adv_rx_desc *rx_desc)
+{
+	u32 data = le32_to_cpu(rx_desc->wb.lower.lo_dword.data);
+	/* Packet type is 13-bits - as bits (16:4) in lower.lo_dword*/
+	u16 pkt_type = (data >> 4) & IGC_PKT_TYPE_MASK;
+
+	return pkt_type;
+}
+
 /* Interrupt defines */
 #define IGC_START_ITR			648 /* ~6000 ints/sec */
 #define IGC_4K_ITR			980
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 8b572cd2c350..42a072509d2a 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1677,14 +1677,40 @@  static void igc_rx_checksum(struct igc_ring *ring,
 		   le32_to_cpu(rx_desc->wb.upper.status_error));
 }
 
+/* Mapping HW RSS Type to enum pkt_hash_types */
+struct igc_rss_type {
+	u8 hash_type; /* can contain enum pkt_hash_types */
+} igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
+	[IGC_RSS_TYPE_NO_HASH].hash_type	  = PKT_HASH_TYPE_L2,
+	[IGC_RSS_TYPE_HASH_TCP_IPV4].hash_type	  = PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_IPV4].hash_type	  = PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6].hash_type	  = PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_IPV6_EX].hash_type	  = PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_IPV6].hash_type	  = PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV4].hash_type	  = PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6].hash_type	  = PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX].hash_type = PKT_HASH_TYPE_L4,
+	[10].hash_type = PKT_HASH_TYPE_L2, /* RSS Type above 9 "Reserved" by HW */
+	[11].hash_type = PKT_HASH_TYPE_L2,
+	[12].hash_type = PKT_HASH_TYPE_L2,
+	[13].hash_type = PKT_HASH_TYPE_L2,
+	[14].hash_type = PKT_HASH_TYPE_L2,
+	[15].hash_type = PKT_HASH_TYPE_L2,
+};
+
 static inline void igc_rx_hash(struct igc_ring *ring,
 			       union igc_adv_rx_desc *rx_desc,
 			       struct sk_buff *skb)
 {
-	if (ring->netdev->features & NETIF_F_RXHASH)
-		skb_set_hash(skb,
-			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
-			     PKT_HASH_TYPE_L3);
+	if (ring->netdev->features & NETIF_F_RXHASH) {
+		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
+		u8  rss_type = igc_rss_type(rx_desc);
+		enum pkt_hash_types hash_type;
+
+		hash_type = igc_rss_type_table[rss_type].hash_type;
+		skb_set_hash(skb, rss_hash, hash_type);
+	}
 }
 
 static void igc_rx_vlan(struct igc_ring *rx_ring,
@@ -6501,6 +6527,7 @@  static int igc_probe(struct pci_dev *pdev,
 	netdev->features |= NETIF_F_TSO;
 	netdev->features |= NETIF_F_TSO6;
 	netdev->features |= NETIF_F_TSO_ECN;
+	netdev->features |= NETIF_F_RXHASH;
 	netdev->features |= NETIF_F_RXCSUM;
 	netdev->features |= NETIF_F_HW_CSUM;
 	netdev->features |= NETIF_F_SCTP_CRC;