diff mbox series

[bpf-next,V2,1/5] igc: enable and fix RX hash usage by netstack

Message ID 168182464270.616355.11391652654430626584.stgit@firesoul (mailing list archive)
State Accepted
Commit 84214ab4689f962b4bfc47fc9a5838d25ac4274d
Delegated to: BPF
Headers show
Series XDP-hints: XDP kfunc metadata for driver igc | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 18 this patch: 18
netdev/cc_maintainers fail 1 blamed authors not CCed: sasha.neftin@intel.com; 2 maintainers not CCed: sasha.neftin@intel.com anthony.l.nguyen@intel.com
netdev/build_clang success Errors and warnings before: 20 this patch: 20
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 18 this patch: 18
netdev/checkpatch fail CHECK: Please use a blank line after function/struct/union/enum declarations ERROR: space required after that ',' (ctx:VxV) WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-36 success Logs for veristat

Commit Message

Jesper Dangaard Brouer April 18, 2023, 1:30 p.m. UTC
When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
("igc: Add transmit and receive fastpath and interrupt handlers"), the
hardware wasn't configured to provide RSS hash, thus it made sense to not
enable net_device NETIF_F_RXHASH feature bit.

The NIC hardware was configured to enable RSS hash info in v5.2 via commit
2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
forgot to set the NETIF_F_RXHASH feature bit.

The original implementation of igc_rx_hash() didn't extract the associated
pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
this patch are about extracting the RSS Type from the hardware and mapping
this to enum pkt_hash_types. This was based on Foxville i225 software user
manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).

For UDP it's worth noting that RSS (type) hashing have been disabled both for
IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
because hardware RSS doesn't handle fragmented pkts well when enabled (can
cause out-of-order). This results in PKT_HASH_TYPE_L3 for UDP packets, and
hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
the effect that netstack will do a software based hash calc calling into
flow_dissect, but only when code calls skb_get_hash(), which doesn't
necessary happen for local delivery.

For QA verification testing I wrote a small bpftrace prog:
 [0] https://github.com/xdp-project/xdp-project/blob/master/areas/hints/monitor_skb_hash_on_dev.bt

Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/igc/igc.h      |   28 ++++++++++++++++++++++++++
 drivers/net/ethernet/intel/igc/igc_main.c |   31 +++++++++++++++++++++++++----
 2 files changed, 55 insertions(+), 4 deletions(-)

Comments

John Fastabend April 23, 2023, 2:46 p.m. UTC | #1
Jesper Dangaard Brouer wrote:
> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
> hardware wasn't configured to provide RSS hash, thus it made sense to not
> enable net_device NETIF_F_RXHASH feature bit.
> 
> The NIC hardware was configured to enable RSS hash info in v5.2 via commit
> 2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
> forgot to set the NETIF_F_RXHASH feature bit.
> 
> The original implementation of igc_rx_hash() didn't extract the associated
> pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
> this patch are about extracting the RSS Type from the hardware and mapping
> this to enum pkt_hash_types. This was based on Foxville i225 software user
> manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).
> 
> For UDP it's worth noting that RSS (type) hashing have been disabled both for
> IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
> because hardware RSS doesn't handle fragmented pkts well when enabled (can
> cause out-of-order). This results in PKT_HASH_TYPE_L3 for UDP packets, and
> hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
> the effect that netstack will do a software based hash calc calling into
> flow_dissect, but only when code calls skb_get_hash(), which doesn't
> necessary happen for local delivery.
> 
> For QA verification testing I wrote a small bpftrace prog:
>  [0] https://github.com/xdp-project/xdp-project/blob/master/areas/hints/monitor_skb_hash_on_dev.bt
> 
> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  drivers/net/ethernet/intel/igc/igc.h      |   28 ++++++++++++++++++++++++++
>  drivers/net/ethernet/intel/igc/igc_main.c |   31 +++++++++++++++++++++++++----
>  2 files changed, 55 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> index 34aebf00a512..f7f9e217e7b4 100644
> --- a/drivers/net/ethernet/intel/igc/igc.h
> +++ b/drivers/net/ethernet/intel/igc/igc.h
> @@ -13,6 +13,7 @@
>  #include <linux/ptp_clock_kernel.h>
>  #include <linux/timecounter.h>
>  #include <linux/net_tstamp.h>
> +#include <linux/bitfield.h>
>  
>  #include "igc_hw.h"
>  
> @@ -311,6 +312,33 @@ extern char igc_driver_name[];
>  #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
>  #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
>  
> +/* RX-desc Write-Back format RSS Type's */
> +enum igc_rss_type_num {
> +	IGC_RSS_TYPE_NO_HASH		= 0,
> +	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
> +	IGC_RSS_TYPE_HASH_IPV4		= 2,
> +	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
> +	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
> +	IGC_RSS_TYPE_HASH_IPV6		= 5,
> +	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
> +	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
> +	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
> +	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
> +	IGC_RSS_TYPE_MAX		= 10,
> +};
> +#define IGC_RSS_TYPE_MAX_TABLE		16
> +#define IGC_RSS_TYPE_MASK		GENMASK(3,0) /* 4-bits (3:0) = mask 0x0F */
> +
> +/* igc_rss_type - Rx descriptor RSS type field */
> +static inline u32 igc_rss_type(const union igc_adv_rx_desc *rx_desc)
> +{
> +	/* RSS Type 4-bits (3:0) number: 0-9 (above 9 is reserved)
> +	 * Accessing the same bits via u16 (wb.lower.lo_dword.hs_rss.pkt_info)
> +	 * is slightly slower than via u32 (wb.lower.lo_dword.data)
> +	 */
> +	return le32_get_bits(rx_desc->wb.lower.lo_dword.data, IGC_RSS_TYPE_MASK);
> +}
> +
>  /* Interrupt defines */
>  #define IGC_START_ITR			648 /* ~6000 ints/sec */
>  #define IGC_4K_ITR			980
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index 1c4676882082..bfa9768d447f 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -1690,14 +1690,36 @@ static void igc_rx_checksum(struct igc_ring *ring,
>  		   le32_to_cpu(rx_desc->wb.upper.status_error));
>  }
>  
> +/* Mapping HW RSS Type to enum pkt_hash_types */
> +static const enum pkt_hash_types igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
> +	[IGC_RSS_TYPE_NO_HASH]		= PKT_HASH_TYPE_L2,
> +	[IGC_RSS_TYPE_HASH_TCP_IPV4]	= PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_IPV4]	= PKT_HASH_TYPE_L3,
> +	[IGC_RSS_TYPE_HASH_TCP_IPV6]	= PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_IPV6_EX]	= PKT_HASH_TYPE_L3,
> +	[IGC_RSS_TYPE_HASH_IPV6]	= PKT_HASH_TYPE_L3,
> +	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX] = PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_UDP_IPV4]	= PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_UDP_IPV6]	= PKT_HASH_TYPE_L4,
> +	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX] = PKT_HASH_TYPE_L4,
> +	[10] = PKT_HASH_TYPE_NONE, /* RSS Type above 9 "Reserved" by HW  */
> +	[11] = PKT_HASH_TYPE_NONE, /* keep array sized for SW bit-mask   */
> +	[12] = PKT_HASH_TYPE_NONE, /* to handle future HW revisons       */
> +	[13] = PKT_HASH_TYPE_NONE,
> +	[14] = PKT_HASH_TYPE_NONE,
> +	[15] = PKT_HASH_TYPE_NONE,
> +};
> +
>  static inline void igc_rx_hash(struct igc_ring *ring,
>  			       union igc_adv_rx_desc *rx_desc,
>  			       struct sk_buff *skb)
>  {
> -	if (ring->netdev->features & NETIF_F_RXHASH)
> -		skb_set_hash(skb,
> -			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
> -			     PKT_HASH_TYPE_L3);
> +	if (ring->netdev->features & NETIF_F_RXHASH) {
> +		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
> +		u32 rss_type = igc_rss_type(rx_desc);
> +
> +		skb_set_hash(skb, rss_hash, igc_rss_type_table[rss_type]);

Just curious why not copy the logic from the other driver fms10k, ice, ect.

	skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
		     (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ?
		     PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);

avoiding the table logic. Do the driver folks care?
Jesper Dangaard Brouer April 24, 2023, 2:20 p.m. UTC | #2
On 23/04/2023 16.46, John Fastabend wrote:
> Jesper Dangaard Brouer wrote:
>> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
>> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
>> hardware wasn't configured to provide RSS hash, thus it made sense to not
>> enable net_device NETIF_F_RXHASH feature bit.
>>
>> The NIC hardware was configured to enable RSS hash info in v5.2 via commit
>> 2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
>> forgot to set the NETIF_F_RXHASH feature bit.
>>
>> The original implementation of igc_rx_hash() didn't extract the associated
>> pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
>> this patch are about extracting the RSS Type from the hardware and mapping
>> this to enum pkt_hash_types. This was based on Foxville i225 software user
>> manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).
>>
>> For UDP it's worth noting that RSS (type) hashing have been disabled both for
>> IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
>> because hardware RSS doesn't handle fragmented pkts well when enabled (can
>> cause out-of-order). This results in PKT_HASH_TYPE_L3 for UDP packets, and
>> hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
>> the effect that netstack will do a software based hash calc calling into
>> flow_dissect, but only when code calls skb_get_hash(), which doesn't
>> necessary happen for local delivery.
>>
>> For QA verification testing I wrote a small bpftrace prog:
>>   [0] https://github.com/xdp-project/xdp-project/blob/master/areas/hints/monitor_skb_hash_on_dev.bt
>>
>> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
>> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> ---
>>   drivers/net/ethernet/intel/igc/igc.h      |   28 ++++++++++++++++++++++++++
>>   drivers/net/ethernet/intel/igc/igc_main.c |   31 +++++++++++++++++++++++++----
>>   2 files changed, 55 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
>> index 34aebf00a512..f7f9e217e7b4 100644
>> --- a/drivers/net/ethernet/intel/igc/igc.h
>> +++ b/drivers/net/ethernet/intel/igc/igc.h
>> @@ -13,6 +13,7 @@
>>   #include <linux/ptp_clock_kernel.h>
>>   #include <linux/timecounter.h>
>>   #include <linux/net_tstamp.h>
>> +#include <linux/bitfield.h>
>>   
>>   #include "igc_hw.h"
>>   
>> @@ -311,6 +312,33 @@ extern char igc_driver_name[];
>>   #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
>>   #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
>>   
>> +/* RX-desc Write-Back format RSS Type's */
>> +enum igc_rss_type_num {
>> +	IGC_RSS_TYPE_NO_HASH		= 0,
>> +	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
>> +	IGC_RSS_TYPE_HASH_IPV4		= 2,
>> +	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
>> +	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
>> +	IGC_RSS_TYPE_HASH_IPV6		= 5,
>> +	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
>> +	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
>> +	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
>> +	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
>> +	IGC_RSS_TYPE_MAX		= 10,
>> +};
>> +#define IGC_RSS_TYPE_MAX_TABLE		16
>> +#define IGC_RSS_TYPE_MASK		GENMASK(3,0) /* 4-bits (3:0) = mask 0x0F */
>> +
>> +/* igc_rss_type - Rx descriptor RSS type field */
>> +static inline u32 igc_rss_type(const union igc_adv_rx_desc *rx_desc)
>> +{
>> +	/* RSS Type 4-bits (3:0) number: 0-9 (above 9 is reserved)
>> +	 * Accessing the same bits via u16 (wb.lower.lo_dword.hs_rss.pkt_info)
>> +	 * is slightly slower than via u32 (wb.lower.lo_dword.data)
>> +	 */
>> +	return le32_get_bits(rx_desc->wb.lower.lo_dword.data, IGC_RSS_TYPE_MASK);
>> +}
>> +
>>   /* Interrupt defines */
>>   #define IGC_START_ITR			648 /* ~6000 ints/sec */
>>   #define IGC_4K_ITR			980
>> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
>> index 1c4676882082..bfa9768d447f 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_main.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
>> @@ -1690,14 +1690,36 @@ static void igc_rx_checksum(struct igc_ring *ring,
>>   		   le32_to_cpu(rx_desc->wb.upper.status_error));
>>   }
>>   
>> +/* Mapping HW RSS Type to enum pkt_hash_types */
>> +static const enum pkt_hash_types igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
>> +	[IGC_RSS_TYPE_NO_HASH]		= PKT_HASH_TYPE_L2,
>> +	[IGC_RSS_TYPE_HASH_TCP_IPV4]	= PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_IPV4]	= PKT_HASH_TYPE_L3,
>> +	[IGC_RSS_TYPE_HASH_TCP_IPV6]	= PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_IPV6_EX]	= PKT_HASH_TYPE_L3,
>> +	[IGC_RSS_TYPE_HASH_IPV6]	= PKT_HASH_TYPE_L3,
>> +	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX] = PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_UDP_IPV4]	= PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_UDP_IPV6]	= PKT_HASH_TYPE_L4,
>> +	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX] = PKT_HASH_TYPE_L4,
>> +	[10] = PKT_HASH_TYPE_NONE, /* RSS Type above 9 "Reserved" by HW  */
>> +	[11] = PKT_HASH_TYPE_NONE, /* keep array sized for SW bit-mask   */
>> +	[12] = PKT_HASH_TYPE_NONE, /* to handle future HW revisons       */
>> +	[13] = PKT_HASH_TYPE_NONE,
>> +	[14] = PKT_HASH_TYPE_NONE,
>> +	[15] = PKT_HASH_TYPE_NONE,
>> +};
>> +
>>   static inline void igc_rx_hash(struct igc_ring *ring,
>>   			       union igc_adv_rx_desc *rx_desc,
>>   			       struct sk_buff *skb)
>>   {
>> -	if (ring->netdev->features & NETIF_F_RXHASH)
>> -		skb_set_hash(skb,
>> -			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>> -			     PKT_HASH_TYPE_L3);
>> +	if (ring->netdev->features & NETIF_F_RXHASH) {
>> +		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
>> +		u32 rss_type = igc_rss_type(rx_desc);
>> +
>> +		skb_set_hash(skb, rss_hash, igc_rss_type_table[rss_type]);
> 
> Just curious why not copy the logic from the other driver fms10k, ice, ect.
> 
> 	skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
> 		     (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ?
> 		     PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);

Detail: This code mis-categorize (e.g. ARP) PKT_HASH_TYPE_L2 as
PKT_HASH_TYPE_L3, but as core reduces this further to one SKB bit, it
doesn't really matter.

> avoiding the table logic. Do the driver folks care?

The define IXGBE_RSS_L4_TYPES_MASK becomes the "table" logic as a 1-bit
true/false table.  It is a more compact table, let me know if this is
preferred.

Yes, it is really upto driver maintainer people to decide, what code is
preferred ?

--Jesper
John Fastabend April 24, 2023, 7:17 p.m. UTC | #3
Jesper Dangaard Brouer wrote:
> 
> 
> On 23/04/2023 16.46, John Fastabend wrote:
> > Jesper Dangaard Brouer wrote:
> >> When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
> >> ("igc: Add transmit and receive fastpath and interrupt handlers"), the
> >> hardware wasn't configured to provide RSS hash, thus it made sense to not
> >> enable net_device NETIF_F_RXHASH feature bit.
> >>
> >> The NIC hardware was configured to enable RSS hash info in v5.2 via commit
> >> 2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
> >> forgot to set the NETIF_F_RXHASH feature bit.
> >>
> >> The original implementation of igc_rx_hash() didn't extract the associated
> >> pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
> >> this patch are about extracting the RSS Type from the hardware and mapping
> >> this to enum pkt_hash_types. This was based on Foxville i225 software user
> >> manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).
> >>
> >> For UDP it's worth noting that RSS (type) hashing have been disabled both for
> >> IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
> >> because hardware RSS doesn't handle fragmented pkts well when enabled (can
> >> cause out-of-order). This results in PKT_HASH_TYPE_L3 for UDP packets, and
> >> hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
> >> the effect that netstack will do a software based hash calc calling into
> >> flow_dissect, but only when code calls skb_get_hash(), which doesn't
> >> necessary happen for local delivery.
> >>
> >> For QA verification testing I wrote a small bpftrace prog:
> >>   [0] https://github.com/xdp-project/xdp-project/blob/master/areas/hints/monitor_skb_hash_on_dev.bt
> >>
> >> Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
> >> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> >> ---
> >>   drivers/net/ethernet/intel/igc/igc.h      |   28 ++++++++++++++++++++++++++
> >>   drivers/net/ethernet/intel/igc/igc_main.c |   31 +++++++++++++++++++++++++----
> >>   2 files changed, 55 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> >> index 34aebf00a512..f7f9e217e7b4 100644
> >> --- a/drivers/net/ethernet/intel/igc/igc.h
> >> +++ b/drivers/net/ethernet/intel/igc/igc.h
> >> @@ -13,6 +13,7 @@
> >>   #include <linux/ptp_clock_kernel.h>
> >>   #include <linux/timecounter.h>
> >>   #include <linux/net_tstamp.h>
> >> +#include <linux/bitfield.h>
> >>   
> >>   #include "igc_hw.h"
> >>   
> >> @@ -311,6 +312,33 @@ extern char igc_driver_name[];
> >>   #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
> >>   #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
> >>   
> >> +/* RX-desc Write-Back format RSS Type's */
> >> +enum igc_rss_type_num {
> >> +	IGC_RSS_TYPE_NO_HASH		= 0,
> >> +	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
> >> +	IGC_RSS_TYPE_HASH_IPV4		= 2,
> >> +	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
> >> +	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
> >> +	IGC_RSS_TYPE_HASH_IPV6		= 5,
> >> +	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
> >> +	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
> >> +	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
> >> +	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
> >> +	IGC_RSS_TYPE_MAX		= 10,
> >> +};
> >> +#define IGC_RSS_TYPE_MAX_TABLE		16
> >> +#define IGC_RSS_TYPE_MASK		GENMASK(3,0) /* 4-bits (3:0) = mask 0x0F */
> >> +
> >> +/* igc_rss_type - Rx descriptor RSS type field */
> >> +static inline u32 igc_rss_type(const union igc_adv_rx_desc *rx_desc)
> >> +{
> >> +	/* RSS Type 4-bits (3:0) number: 0-9 (above 9 is reserved)
> >> +	 * Accessing the same bits via u16 (wb.lower.lo_dword.hs_rss.pkt_info)
> >> +	 * is slightly slower than via u32 (wb.lower.lo_dword.data)
> >> +	 */
> >> +	return le32_get_bits(rx_desc->wb.lower.lo_dword.data, IGC_RSS_TYPE_MASK);
> >> +}
> >> +
> >>   /* Interrupt defines */
> >>   #define IGC_START_ITR			648 /* ~6000 ints/sec */
> >>   #define IGC_4K_ITR			980
> >> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> >> index 1c4676882082..bfa9768d447f 100644
> >> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> >> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> >> @@ -1690,14 +1690,36 @@ static void igc_rx_checksum(struct igc_ring *ring,
> >>   		   le32_to_cpu(rx_desc->wb.upper.status_error));
> >>   }
> >>   
> >> +/* Mapping HW RSS Type to enum pkt_hash_types */
> >> +static const enum pkt_hash_types igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
> >> +	[IGC_RSS_TYPE_NO_HASH]		= PKT_HASH_TYPE_L2,
> >> +	[IGC_RSS_TYPE_HASH_TCP_IPV4]	= PKT_HASH_TYPE_L4,
> >> +	[IGC_RSS_TYPE_HASH_IPV4]	= PKT_HASH_TYPE_L3,
> >> +	[IGC_RSS_TYPE_HASH_TCP_IPV6]	= PKT_HASH_TYPE_L4,
> >> +	[IGC_RSS_TYPE_HASH_IPV6_EX]	= PKT_HASH_TYPE_L3,
> >> +	[IGC_RSS_TYPE_HASH_IPV6]	= PKT_HASH_TYPE_L3,
> >> +	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX] = PKT_HASH_TYPE_L4,
> >> +	[IGC_RSS_TYPE_HASH_UDP_IPV4]	= PKT_HASH_TYPE_L4,
> >> +	[IGC_RSS_TYPE_HASH_UDP_IPV6]	= PKT_HASH_TYPE_L4,
> >> +	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX] = PKT_HASH_TYPE_L4,
> >> +	[10] = PKT_HASH_TYPE_NONE, /* RSS Type above 9 "Reserved" by HW  */
> >> +	[11] = PKT_HASH_TYPE_NONE, /* keep array sized for SW bit-mask   */
> >> +	[12] = PKT_HASH_TYPE_NONE, /* to handle future HW revisons       */
> >> +	[13] = PKT_HASH_TYPE_NONE,
> >> +	[14] = PKT_HASH_TYPE_NONE,
> >> +	[15] = PKT_HASH_TYPE_NONE,
> >> +};
> >> +
> >>   static inline void igc_rx_hash(struct igc_ring *ring,
> >>   			       union igc_adv_rx_desc *rx_desc,
> >>   			       struct sk_buff *skb)
> >>   {
> >> -	if (ring->netdev->features & NETIF_F_RXHASH)
> >> -		skb_set_hash(skb,
> >> -			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
> >> -			     PKT_HASH_TYPE_L3);
> >> +	if (ring->netdev->features & NETIF_F_RXHASH) {
> >> +		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
> >> +		u32 rss_type = igc_rss_type(rx_desc);
> >> +
> >> +		skb_set_hash(skb, rss_hash, igc_rss_type_table[rss_type]);
> > 
> > Just curious why not copy the logic from the other driver fms10k, ice, ect.
> > 
> > 	skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
> > 		     (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ?
> > 		     PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);
> 
> Detail: This code mis-categorize (e.g. ARP) PKT_HASH_TYPE_L2 as
> PKT_HASH_TYPE_L3, but as core reduces this further to one SKB bit, it
> doesn't really matter.
> 
> > avoiding the table logic. Do the driver folks care?
> 
> The define IXGBE_RSS_L4_TYPES_MASK becomes the "table" logic as a 1-bit
> true/false table.  It is a more compact table, let me know if this is
> preferred.
> 
> Yes, it is really upto driver maintainer people to decide, what code is
> preferred ?

Yeah doesn't matter much to me either way. I was just looking at code
compared to ice driver while reviewing.

> 
> --Jesper
>
Jesper Dangaard Brouer April 25, 2023, 8:43 a.m. UTC | #4
On 24/04/2023 21.17, John Fastabend wrote:
>>> Just curious why not copy the logic from the other driver fms10k, ice, ect.
>>>
>>> 	skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>>> 		     (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ?
>>> 		     PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);
>> Detail: This code mis-categorize (e.g. ARP) PKT_HASH_TYPE_L2 as
>> PKT_HASH_TYPE_L3, but as core reduces this further to one SKB bit, it
>> doesn't really matter.
>>
>>> avoiding the table logic. Do the driver folks care?
>> The define IXGBE_RSS_L4_TYPES_MASK becomes the "table" logic as a 1-bit
>> true/false table.  It is a more compact table, let me know if this is
>> preferred.
>>
>> Yes, it is really upto driver maintainer people to decide, what code is
>> preferred ?
 >
> Yeah doesn't matter much to me either way. I was just looking at code
> compared to ice driver while reviewing.

My preference is to apply this patchset. We/I can easily followup and
change this to use the more compact approach later (if someone prefers).

I know net-next is "closed", but this patchset was posted prior to the
close.  Plus, a number of companies are waiting for the XDP-hint for HW
RX timestamp.  The support for driver stmmac is already in net-next
(commit e3f9c3e34840 ("net: stmmac: add Rx HWTS metadata to XDP receive
pkt")). Thus, it would be a help if both igc+stmmac changes land in same
kernel version, as both drivers are being evaluated by these companies.

Pretty please,
--Jesper
Daniel Borkmann April 27, 2023, 5 p.m. UTC | #5
On 4/25/23 10:43 AM, Jesper Dangaard Brouer wrote:
> On 24/04/2023 21.17, John Fastabend wrote:
>>>> Just curious why not copy the logic from the other driver fms10k, ice, ect.
>>>>
>>>>     skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>>>>              (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ?
>>>>              PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);
>>> Detail: This code mis-categorize (e.g. ARP) PKT_HASH_TYPE_L2 as
>>> PKT_HASH_TYPE_L3, but as core reduces this further to one SKB bit, it
>>> doesn't really matter.
>>>
>>>> avoiding the table logic. Do the driver folks care?
>>> The define IXGBE_RSS_L4_TYPES_MASK becomes the "table" logic as a 1-bit
>>> true/false table.  It is a more compact table, let me know if this is
>>> preferred.
>>>
>>> Yes, it is really upto driver maintainer people to decide, what code is
>>> preferred ?
>  >
>> Yeah doesn't matter much to me either way. I was just looking at code
>> compared to ice driver while reviewing.
> 
> My preference is to apply this patchset. We/I can easily followup and
> change this to use the more compact approach later (if someone prefers).

Consistency might help imo and would avoid questions/confusion on /why/
doing it differently for igc vs some of the others.

> I know net-next is "closed", but this patchset was posted prior to the
> close.  Plus, a number of companies are waiting for the XDP-hint for HW
> RX timestamp.  The support for driver stmmac is already in net-next
> (commit e3f9c3e34840 ("net: stmmac: add Rx HWTS metadata to XDP receive
> pkt")). Thus, it would be a help if both igc+stmmac changes land in same
> kernel version, as both drivers are being evaluated by these companies.

Given merge window is open now and net-next closed, it's too late to land
(unless Dave/Jakub thinks otherwise given it touches also driver bits).
I've applied the series to bpf-next right now.

Thanks,
Daniel
Jesper Dangaard Brouer April 28, 2023, 10:13 a.m. UTC | #6
On 27/04/2023 19.00, Daniel Borkmann wrote:
> On 4/25/23 10:43 AM, Jesper Dangaard Brouer wrote:
>> On 24/04/2023 21.17, John Fastabend wrote:
>>>>> Just curious why not copy the logic from the other driver fms10k, 
>>>>> ice, ect.
>>>>>
>>>>>     skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
>>>>>              (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ?
>>>>>              PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);
>>>> Detail: This code mis-categorize (e.g. ARP) PKT_HASH_TYPE_L2 as
>>>> PKT_HASH_TYPE_L3, but as core reduces this further to one SKB bit, it
>>>> doesn't really matter.
>>>>
>>>>> avoiding the table logic. Do the driver folks care?
>>>> The define IXGBE_RSS_L4_TYPES_MASK becomes the "table" logic as a 1-bit
>>>> true/false table.  It is a more compact table, let me know if this is
>>>> preferred.
>>>>
>>>> Yes, it is really upto driver maintainer people to decide, what code is
>>>> preferred ?
>>  >
>>> Yeah doesn't matter much to me either way. I was just looking at code
>>> compared to ice driver while reviewing.
>>
>> My preference is to apply this patchset. We/I can easily followup and
>> change this to use the more compact approach later (if someone prefers).
> 
> Consistency might help imo and would avoid questions/confusion on /why/
> doing it differently for igc vs some of the others.
>

Well, drivers often do things differently, so that not something new. I
found the other approach less readable (and theoretically wrong for the
L2 case).  For igc this approach makes it easier to read (IMHO I'm
biased of cause) and easier to compare with kfunc metadata hint type
(that doesn't have RSS type information loss).

>> I know net-next is "closed", but this patchset was posted prior to the
>> close.  Plus, a number of companies are waiting for the XDP-hint for HW
>> RX timestamp.  The support for driver stmmac is already in net-next
>> (commit e3f9c3e34840 ("net: stmmac: add Rx HWTS metadata to XDP receive
>> pkt")). Thus, it would be a help if both igc+stmmac changes land in same
>> kernel version, as both drivers are being evaluated by these companies.
> 
> Given merge window is open now and net-next closed, it's too late to land
> (unless Dave/Jakub thinks otherwise given it touches also driver bits).
> I've applied the series to bpf-next right now.

It's not a big deal that it didn't reached net-next, end-users will just
have to wait for another kernel release to see this feature, or backport
the feature themselves.

Thanks for applying it.

--Jesper
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 34aebf00a512..f7f9e217e7b4 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -13,6 +13,7 @@ 
 #include <linux/ptp_clock_kernel.h>
 #include <linux/timecounter.h>
 #include <linux/net_tstamp.h>
+#include <linux/bitfield.h>
 
 #include "igc_hw.h"
 
@@ -311,6 +312,33 @@  extern char igc_driver_name[];
 #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
 #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
 
+/* RX-desc Write-Back format RSS Type's */
+enum igc_rss_type_num {
+	IGC_RSS_TYPE_NO_HASH		= 0,
+	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
+	IGC_RSS_TYPE_HASH_IPV4		= 2,
+	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
+	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
+	IGC_RSS_TYPE_HASH_IPV6		= 5,
+	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
+	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
+	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
+	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
+	IGC_RSS_TYPE_MAX		= 10,
+};
+#define IGC_RSS_TYPE_MAX_TABLE		16
+#define IGC_RSS_TYPE_MASK		GENMASK(3,0) /* 4-bits (3:0) = mask 0x0F */
+
+/* igc_rss_type - Rx descriptor RSS type field */
+static inline u32 igc_rss_type(const union igc_adv_rx_desc *rx_desc)
+{
+	/* RSS Type 4-bits (3:0) number: 0-9 (above 9 is reserved)
+	 * Accessing the same bits via u16 (wb.lower.lo_dword.hs_rss.pkt_info)
+	 * is slightly slower than via u32 (wb.lower.lo_dword.data)
+	 */
+	return le32_get_bits(rx_desc->wb.lower.lo_dword.data, IGC_RSS_TYPE_MASK);
+}
+
 /* Interrupt defines */
 #define IGC_START_ITR			648 /* ~6000 ints/sec */
 #define IGC_4K_ITR			980
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 1c4676882082..bfa9768d447f 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1690,14 +1690,36 @@  static void igc_rx_checksum(struct igc_ring *ring,
 		   le32_to_cpu(rx_desc->wb.upper.status_error));
 }
 
+/* Mapping HW RSS Type to enum pkt_hash_types */
+static const enum pkt_hash_types igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
+	[IGC_RSS_TYPE_NO_HASH]		= PKT_HASH_TYPE_L2,
+	[IGC_RSS_TYPE_HASH_TCP_IPV4]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_IPV4]	= PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_IPV6_EX]	= PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_IPV6]	= PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX] = PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV4]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX] = PKT_HASH_TYPE_L4,
+	[10] = PKT_HASH_TYPE_NONE, /* RSS Type above 9 "Reserved" by HW  */
+	[11] = PKT_HASH_TYPE_NONE, /* keep array sized for SW bit-mask   */
+	[12] = PKT_HASH_TYPE_NONE, /* to handle future HW revisons       */
+	[13] = PKT_HASH_TYPE_NONE,
+	[14] = PKT_HASH_TYPE_NONE,
+	[15] = PKT_HASH_TYPE_NONE,
+};
+
 static inline void igc_rx_hash(struct igc_ring *ring,
 			       union igc_adv_rx_desc *rx_desc,
 			       struct sk_buff *skb)
 {
-	if (ring->netdev->features & NETIF_F_RXHASH)
-		skb_set_hash(skb,
-			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
-			     PKT_HASH_TYPE_L3);
+	if (ring->netdev->features & NETIF_F_RXHASH) {
+		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
+		u32 rss_type = igc_rss_type(rx_desc);
+
+		skb_set_hash(skb, rss_hash, igc_rss_type_table[rss_type]);
+	}
 }
 
 static void igc_rx_vlan(struct igc_ring *rx_ring,
@@ -6554,6 +6576,7 @@  static int igc_probe(struct pci_dev *pdev,
 	netdev->features |= NETIF_F_TSO;
 	netdev->features |= NETIF_F_TSO6;
 	netdev->features |= NETIF_F_TSO_ECN;
+	netdev->features |= NETIF_F_RXHASH;
 	netdev->features |= NETIF_F_RXCSUM;
 	netdev->features |= NETIF_F_HW_CSUM;
 	netdev->features |= NETIF_F_SCTP_CRC;