diff mbox series

[bpf-next,v2,12/20] xdp: Add checksum level hint

Message ID 20230703181226.19380-13-larysa.zaremba@intel.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series XDP metadata via kfuncs for ice | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-8 success Logs for veristat
netdev/series_format fail Series longer than 15 patches (and no cover letter)
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 4173 this patch: 4174
netdev/cc_maintainers warning 6 maintainers not CCed: hawk@kernel.org corbet@lwn.net davem@davemloft.net pabeni@redhat.com edumazet@google.com linux-doc@vger.kernel.org
netdev/build_clang fail Errors and warnings before: 897 this patch: 897
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 4390 this patch: 4391
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Larysa Zaremba July 3, 2023, 6:12 p.m. UTC
Implement functionality that enables drivers to expose to XDP code
whether checksums were checked and at what level.

Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
---
 Documentation/networking/xdp-rx-metadata.rst |  3 +++
 include/linux/netdevice.h                    |  1 +
 include/net/xdp.h                            |  2 ++
 kernel/bpf/offload.c                         |  2 ++
 net/core/xdp.c                               | 21 ++++++++++++++++++++
 5 files changed, 29 insertions(+)

Comments

John Fastabend July 3, 2023, 8:38 p.m. UTC | #1
Larysa Zaremba wrote:
> Implement functionality that enables drivers to expose to XDP code,
> whether checksums was checked and on what level.
> 
> Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> ---
>  Documentation/networking/xdp-rx-metadata.rst |  3 +++
>  include/linux/netdevice.h                    |  1 +
>  include/net/xdp.h                            |  2 ++
>  kernel/bpf/offload.c                         |  2 ++
>  net/core/xdp.c                               | 21 ++++++++++++++++++++
>  5 files changed, 29 insertions(+)
> 
> diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
> index ea6dd79a21d3..4ec6ddfd2a52 100644
> --- a/Documentation/networking/xdp-rx-metadata.rst
> +++ b/Documentation/networking/xdp-rx-metadata.rst
> @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
>  .. kernel-doc:: net/core/xdp.c
>     :identifiers: bpf_xdp_metadata_rx_vlan_tag
>  
> +.. kernel-doc:: net/core/xdp.c
> +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
> +
>  An XDP program can use these kfuncs to read the metadata into stack
>  variables for its own consumption. Or, to pass the metadata on to other
>  consumers, an XDP program can store it into the metadata area carried
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 4fa4380e6d89..569563687172 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
>  			       enum xdp_rss_hash_type *rss_type);
>  	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
>  				   __be16 *vlan_proto);
> +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
>  };
>  
>  /**
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 89c58f56ffc6..61ed38fa79d1 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
>  			   bpf_xdp_metadata_rx_hash) \
>  	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
>  			   bpf_xdp_metadata_rx_vlan_tag) \
> +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
> +			   bpf_xdp_metadata_rx_csum_lvl) \
>  
>  enum {
>  #define XDP_METADATA_KFUNC(name, _) name,
> diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> index 986e7becfd42..a133fb775f49 100644
> --- a/kernel/bpf/offload.c
> +++ b/kernel/bpf/offload.c
> @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
>  		p = ops->xmo_rx_hash;
>  	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
>  		p = ops->xmo_rx_vlan_tag;
> +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
> +		p = ops->xmo_rx_csum_lvl;
>  out:
>  	up_read(&bpf_devs_lock);
>  
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index f6262c90e45f..c666d3e0a26c 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
>  	return -EOPNOTSUPP;
>  }
>  
> +/**
> + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
> + * @ctx: XDP context pointer.
> + * @csum_level: Return value pointer.
> + *
> + * In case of success, csum_level contains depth of the last verified checksum.
> + * If only the outermost checksum was verified, csum_level is 0, if both
> + * encapsulation and inner transport checksums were verified, csum_level is 1,
> + * and so on.
> + * For more details, refer to csum_level field in sk_buff.
> + *
> + * Return:
> + * * Returns 0 on success or ``-errno`` on error.
> + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
> + * * ``-ENODATA``    : Checksum was not validated
> + */
> +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)

Instead of ENODATA, should we return what would be put in the ip_summed field,
CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then the signature would be,

 bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);

or something like that? Or is the thought that it's not really necessary?
I don't have a strong preference but figured it was worth asking.

> +{
> +	return -EOPNOTSUPP;
> +}
> +
>  __diag_pop();
>  
>  BTF_SET8_START(xdp_metadata_kfunc_ids)
> -- 
> 2.41.0
>
Larysa Zaremba July 4, 2023, 9:24 a.m. UTC | #2
On Mon, Jul 03, 2023 at 01:38:27PM -0700, John Fastabend wrote:
> Larysa Zaremba wrote:
> > Implement functionality that enables drivers to expose to XDP code,
> > whether checksums was checked and on what level.
> > 
> > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > ---
> >  Documentation/networking/xdp-rx-metadata.rst |  3 +++
> >  include/linux/netdevice.h                    |  1 +
> >  include/net/xdp.h                            |  2 ++
> >  kernel/bpf/offload.c                         |  2 ++
> >  net/core/xdp.c                               | 21 ++++++++++++++++++++
> >  5 files changed, 29 insertions(+)
> > 
> > diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
> > index ea6dd79a21d3..4ec6ddfd2a52 100644
> > --- a/Documentation/networking/xdp-rx-metadata.rst
> > +++ b/Documentation/networking/xdp-rx-metadata.rst
> > @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
> >  .. kernel-doc:: net/core/xdp.c
> >     :identifiers: bpf_xdp_metadata_rx_vlan_tag
> >  
> > +.. kernel-doc:: net/core/xdp.c
> > +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
> > +
> >  An XDP program can use these kfuncs to read the metadata into stack
> >  variables for its own consumption. Or, to pass the metadata on to other
> >  consumers, an XDP program can store it into the metadata area carried
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 4fa4380e6d89..569563687172 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
> >  			       enum xdp_rss_hash_type *rss_type);
> >  	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
> >  				   __be16 *vlan_proto);
> > +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
> >  };
> >  
> >  /**
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index 89c58f56ffc6..61ed38fa79d1 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> >  			   bpf_xdp_metadata_rx_hash) \
> >  	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
> >  			   bpf_xdp_metadata_rx_vlan_tag) \
> > +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
> > +			   bpf_xdp_metadata_rx_csum_lvl) \
> >  
> >  enum {
> >  #define XDP_METADATA_KFUNC(name, _) name,
> > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> > index 986e7becfd42..a133fb775f49 100644
> > --- a/kernel/bpf/offload.c
> > +++ b/kernel/bpf/offload.c
> > @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
> >  		p = ops->xmo_rx_hash;
> >  	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
> >  		p = ops->xmo_rx_vlan_tag;
> > +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
> > +		p = ops->xmo_rx_csum_lvl;
> >  out:
> >  	up_read(&bpf_devs_lock);
> >  
> > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > index f6262c90e45f..c666d3e0a26c 100644
> > --- a/net/core/xdp.c
> > +++ b/net/core/xdp.c
> > @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
> >  	return -EOPNOTSUPP;
> >  }
> >  
> > +/**
> > + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
> > + * @ctx: XDP context pointer.
> > + * @csum_level: Return value pointer.
> > + *
> > + * In case of success, csum_level contains depth of the last verified checksum.
> > + * If only the outermost checksum was verified, csum_level is 0, if both
> > + * encapsulation and inner transport checksums were verified, csum_level is 1,
> > + * and so on.
> > + * For more details, refer to csum_level field in sk_buff.
> > + *
> > + * Return:
> > + * * Returns 0 on success or ``-errno`` on error.
> > + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
> > + * * ``-ENODATA``    : Checksum was not validated
> > + */
> > +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
> 
> Istead of ENODATA should we return what would be put in the ip_summed field
> CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,
> 
>  bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
> 
> or something like that? Or is the thought that its not really necessary?
> I don't have a strong preference but figured it was worth asking.
>

I see no value in returning CHECKSUM_COMPLETE without the actual checksum value. 
Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would 
overcomplicate the function signature.
 
> > +{
> > +	return -EOPNOTSUPP;
> > +}
> > +
> >  __diag_pop();
> >  
> >  BTF_SET8_START(xdp_metadata_kfunc_ids)
> > -- 
> > 2.41.0
> >
Jesper Dangaard Brouer July 4, 2023, 10:39 a.m. UTC | #3
Cc. DaveM+Alex Duyck, as I value your insights on checksums.

On 04/07/2023 11.24, Larysa Zaremba wrote:
> On Mon, Jul 03, 2023 at 01:38:27PM -0700, John Fastabend wrote:
>> Larysa Zaremba wrote:
>>> Implement functionality that enables drivers to expose to XDP code,
>>> whether checksums was checked and on what level.
>>>
>>> Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
>>> ---
>>>   Documentation/networking/xdp-rx-metadata.rst |  3 +++
>>>   include/linux/netdevice.h                    |  1 +
>>>   include/net/xdp.h                            |  2 ++
>>>   kernel/bpf/offload.c                         |  2 ++
>>>   net/core/xdp.c                               | 21 ++++++++++++++++++++
>>>   5 files changed, 29 insertions(+)
>>>
>>> diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
>>> index ea6dd79a21d3..4ec6ddfd2a52 100644
>>> --- a/Documentation/networking/xdp-rx-metadata.rst
>>> +++ b/Documentation/networking/xdp-rx-metadata.rst
>>> @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
>>>   .. kernel-doc:: net/core/xdp.c
>>>      :identifiers: bpf_xdp_metadata_rx_vlan_tag
>>>   
>>> +.. kernel-doc:: net/core/xdp.c
>>> +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
>>> +
>>>   An XDP program can use these kfuncs to read the metadata into stack
>>>   variables for its own consumption. Or, to pass the metadata on to other
>>>   consumers, an XDP program can store it into the metadata area carried
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index 4fa4380e6d89..569563687172 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
>>>   			       enum xdp_rss_hash_type *rss_type);
>>>   	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
>>>   				   __be16 *vlan_proto);
>>> +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
>>>   };
>>>   
>>>   /**
>>> diff --git a/include/net/xdp.h b/include/net/xdp.h
>>> index 89c58f56ffc6..61ed38fa79d1 100644
>>> --- a/include/net/xdp.h
>>> +++ b/include/net/xdp.h
>>> @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
>>>   			   bpf_xdp_metadata_rx_hash) \
>>>   	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
>>>   			   bpf_xdp_metadata_rx_vlan_tag) \
>>> +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
>>> +			   bpf_xdp_metadata_rx_csum_lvl) \
>>>   
>>>   enum {
>>>   #define XDP_METADATA_KFUNC(name, _) name,
>>> diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
>>> index 986e7becfd42..a133fb775f49 100644
>>> --- a/kernel/bpf/offload.c
>>> +++ b/kernel/bpf/offload.c
>>> @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
>>>   		p = ops->xmo_rx_hash;
>>>   	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
>>>   		p = ops->xmo_rx_vlan_tag;
>>> +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
>>> +		p = ops->xmo_rx_csum_lvl;
>>>   out:
>>>   	up_read(&bpf_devs_lock);
>>>   
>>> diff --git a/net/core/xdp.c b/net/core/xdp.c
>>> index f6262c90e45f..c666d3e0a26c 100644
>>> --- a/net/core/xdp.c
>>> +++ b/net/core/xdp.c
>>> @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
>>>   	return -EOPNOTSUPP;
>>>   }
>>>   
>>> +/**
>>> + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
>>> + * @ctx: XDP context pointer.
>>> + * @csum_level: Return value pointer.
>>> + *
>>> + * In case of success, csum_level contains depth of the last verified checksum.
>>> + * If only the outermost checksum was verified, csum_level is 0, if both
>>> + * encapsulation and inner transport checksums were verified, csum_level is 1,
>>> + * and so on.
>>> + * For more details, refer to csum_level field in sk_buff.
>>> + *
>>> + * Return:
>>> + * * Returns 0 on success or ``-errno`` on error.
>>> + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
>>> + * * ``-ENODATA``    : Checksum was not validated
>>> + */
>>> +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
>>
>> Istead of ENODATA should we return what would be put in the ip_summed field
>> CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,

I was thinking the same, what about checksum "type".

>>
>>   bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
>>
>> or something like that? Or is the thought that its not really necessary?
>> I don't have a strong preference but figured it was worth asking.
>>
> 
> I see no value in returning CHECKSUM_COMPLETE without the actual checksum value.
> Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would
> overcomplicate the function signature.
>   

So, is success of this kfunc, bpf_xdp_metadata_rx_csum_lvl(), equivalent
to CHECKSUM_UNNECESSARY?

Looking at documentation[1] (generated from skbuff.h):
  [1] 
https://kernel.org/doc/html/latest/networking/skbuff.html#checksumming-of-received-packets-by-device

Is the idea that we can add another kfunc (new signature) that can deal
with the other types of checksums (in a later kernel release)?


>>> +{
>>> +	return -EOPNOTSUPP;
>>> +}
>>> +
>>>   __diag_pop();
>
Larysa Zaremba July 4, 2023, 11:19 a.m. UTC | #4
On Tue, Jul 04, 2023 at 12:39:06PM +0200, Jesper Dangaard Brouer wrote:
> Cc. DaveM+Alex Duyck, as I value your insights on checksums.
> 
> On 04/07/2023 11.24, Larysa Zaremba wrote:
> > On Mon, Jul 03, 2023 at 01:38:27PM -0700, John Fastabend wrote:
> > > Larysa Zaremba wrote:
> > > > Implement functionality that enables drivers to expose to XDP code,
> > > > whether checksums was checked and on what level.
> > > > 
> > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > ---
> > > >   Documentation/networking/xdp-rx-metadata.rst |  3 +++
> > > >   include/linux/netdevice.h                    |  1 +
> > > >   include/net/xdp.h                            |  2 ++
> > > >   kernel/bpf/offload.c                         |  2 ++
> > > >   net/core/xdp.c                               | 21 ++++++++++++++++++++
> > > >   5 files changed, 29 insertions(+)
> > > > 
> > > > diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
> > > > index ea6dd79a21d3..4ec6ddfd2a52 100644
> > > > --- a/Documentation/networking/xdp-rx-metadata.rst
> > > > +++ b/Documentation/networking/xdp-rx-metadata.rst
> > > > @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
> > > >   .. kernel-doc:: net/core/xdp.c
> > > >      :identifiers: bpf_xdp_metadata_rx_vlan_tag
> > > > +.. kernel-doc:: net/core/xdp.c
> > > > +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
> > > > +
> > > >   An XDP program can use these kfuncs to read the metadata into stack
> > > >   variables for its own consumption. Or, to pass the metadata on to other
> > > >   consumers, an XDP program can store it into the metadata area carried
> > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > > index 4fa4380e6d89..569563687172 100644
> > > > --- a/include/linux/netdevice.h
> > > > +++ b/include/linux/netdevice.h
> > > > @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
> > > >   			       enum xdp_rss_hash_type *rss_type);
> > > >   	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
> > > >   				   __be16 *vlan_proto);
> > > > +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
> > > >   };
> > > >   /**
> > > > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > > > index 89c58f56ffc6..61ed38fa79d1 100644
> > > > --- a/include/net/xdp.h
> > > > +++ b/include/net/xdp.h
> > > > @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> > > >   			   bpf_xdp_metadata_rx_hash) \
> > > >   	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
> > > >   			   bpf_xdp_metadata_rx_vlan_tag) \
> > > > +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
> > > > +			   bpf_xdp_metadata_rx_csum_lvl) \
> > > >   enum {
> > > >   #define XDP_METADATA_KFUNC(name, _) name,
> > > > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> > > > index 986e7becfd42..a133fb775f49 100644
> > > > --- a/kernel/bpf/offload.c
> > > > +++ b/kernel/bpf/offload.c
> > > > @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
> > > >   		p = ops->xmo_rx_hash;
> > > >   	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
> > > >   		p = ops->xmo_rx_vlan_tag;
> > > > +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
> > > > +		p = ops->xmo_rx_csum_lvl;
> > > >   out:
> > > >   	up_read(&bpf_devs_lock);
> > > > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > > > index f6262c90e45f..c666d3e0a26c 100644
> > > > --- a/net/core/xdp.c
> > > > +++ b/net/core/xdp.c
> > > > @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
> > > >   	return -EOPNOTSUPP;
> > > >   }
> > > > +/**
> > > > + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
> > > > + * @ctx: XDP context pointer.
> > > > + * @csum_level: Return value pointer.
> > > > + *
> > > > + * In case of success, csum_level contains depth of the last verified checksum.
> > > > + * If only the outermost checksum was verified, csum_level is 0, if both
> > > > + * encapsulation and inner transport checksums were verified, csum_level is 1,
> > > > + * and so on.
> > > > + * For more details, refer to csum_level field in sk_buff.
> > > > + *
> > > > + * Return:
> > > > + * * Returns 0 on success or ``-errno`` on error.
> > > > + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
> > > > + * * ``-ENODATA``    : Checksum was not validated
> > > > + */
> > > > +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
> > > 
> > > Istead of ENODATA should we return what would be put in the ip_summed field
> > > CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,
> 
> I was thinking the same, what about checksum "type".
> 
> > > 
> > >   bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
> > > 
> > > or something like that? Or is the thought that its not really necessary?
> > > I don't have a strong preference but figured it was worth asking.
> > > 
> > 
> > I see no value in returning CHECKSUM_COMPLETE without the actual checksum value.
> > Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would
> > overcomplicate the function signature.
> 
> So, this kfunc bpf_xdp_metadata_rx_csum_lvl() success is it equivilent to
> CHECKSUM_UNNECESSARY?

This is 100% true for physical NICs; it's more complicated for veth, because it
often receives CHECKSUM_PARTIAL, which shouldn't normally appear on RX, but is
treated by the network stack as a validated checksum, because there is no way an
internally generated packet could be messed up. I would be grateful if you could
look at the veth patch and share your opinion about this.

> 
> Looking at documentation[1] (generated from skbuff.h):
>  [1] https://kernel.org/doc/html/latest/networking/skbuff.html#checksumming-of-received-packets-by-device
> 
> Is the idea that we can add another kfunc (new signature) than can deal
> with the other types of checksums (in a later kernel release)?
>

Yes, that is the idea.
 
> 
> > > > +{
> > > > +	return -EOPNOTSUPP;
> > > > +}
> > > > +
> > > >   __diag_pop();
> > 
>
John Fastabend July 6, 2023, 5:50 a.m. UTC | #5
Larysa Zaremba wrote:
> On Tue, Jul 04, 2023 at 12:39:06PM +0200, Jesper Dangaard Brouer wrote:
> > Cc. DaveM+Alex Duyck, as I value your insights on checksums.
> > 
> > On 04/07/2023 11.24, Larysa Zaremba wrote:
> > > On Mon, Jul 03, 2023 at 01:38:27PM -0700, John Fastabend wrote:
> > > > Larysa Zaremba wrote:
> > > > > Implement functionality that enables drivers to expose to XDP code,
> > > > > whether checksums was checked and on what level.
> > > > > 
> > > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > > ---
> > > > >   Documentation/networking/xdp-rx-metadata.rst |  3 +++
> > > > >   include/linux/netdevice.h                    |  1 +
> > > > >   include/net/xdp.h                            |  2 ++
> > > > >   kernel/bpf/offload.c                         |  2 ++
> > > > >   net/core/xdp.c                               | 21 ++++++++++++++++++++
> > > > >   5 files changed, 29 insertions(+)
> > > > > 
> > > > > diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
> > > > > index ea6dd79a21d3..4ec6ddfd2a52 100644
> > > > > --- a/Documentation/networking/xdp-rx-metadata.rst
> > > > > +++ b/Documentation/networking/xdp-rx-metadata.rst
> > > > > @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
> > > > >   .. kernel-doc:: net/core/xdp.c
> > > > >      :identifiers: bpf_xdp_metadata_rx_vlan_tag
> > > > > +.. kernel-doc:: net/core/xdp.c
> > > > > +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
> > > > > +
> > > > >   An XDP program can use these kfuncs to read the metadata into stack
> > > > >   variables for its own consumption. Or, to pass the metadata on to other
> > > > >   consumers, an XDP program can store it into the metadata area carried
> > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > > > index 4fa4380e6d89..569563687172 100644
> > > > > --- a/include/linux/netdevice.h
> > > > > +++ b/include/linux/netdevice.h
> > > > > @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
> > > > >   			       enum xdp_rss_hash_type *rss_type);
> > > > >   	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
> > > > >   				   __be16 *vlan_proto);
> > > > > +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
> > > > >   };
> > > > >   /**
> > > > > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > > > > index 89c58f56ffc6..61ed38fa79d1 100644
> > > > > --- a/include/net/xdp.h
> > > > > +++ b/include/net/xdp.h
> > > > > @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> > > > >   			   bpf_xdp_metadata_rx_hash) \
> > > > >   	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
> > > > >   			   bpf_xdp_metadata_rx_vlan_tag) \
> > > > > +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
> > > > > +			   bpf_xdp_metadata_rx_csum_lvl) \
> > > > >   enum {
> > > > >   #define XDP_METADATA_KFUNC(name, _) name,
> > > > > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> > > > > index 986e7becfd42..a133fb775f49 100644
> > > > > --- a/kernel/bpf/offload.c
> > > > > +++ b/kernel/bpf/offload.c
> > > > > @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
> > > > >   		p = ops->xmo_rx_hash;
> > > > >   	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
> > > > >   		p = ops->xmo_rx_vlan_tag;
> > > > > +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
> > > > > +		p = ops->xmo_rx_csum_lvl;
> > > > >   out:
> > > > >   	up_read(&bpf_devs_lock);
> > > > > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > > > > index f6262c90e45f..c666d3e0a26c 100644
> > > > > --- a/net/core/xdp.c
> > > > > +++ b/net/core/xdp.c
> > > > > @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
> > > > >   	return -EOPNOTSUPP;
> > > > >   }
> > > > > +/**
> > > > > + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
> > > > > + * @ctx: XDP context pointer.
> > > > > + * @csum_level: Return value pointer.
> > > > > + *
> > > > > + * In case of success, csum_level contains depth of the last verified checksum.
> > > > > + * If only the outermost checksum was verified, csum_level is 0, if both
> > > > > + * encapsulation and inner transport checksums were verified, csum_level is 1,
> > > > > + * and so on.
> > > > > + * For more details, refer to csum_level field in sk_buff.
> > > > > + *
> > > > > + * Return:
> > > > > + * * Returns 0 on success or ``-errno`` on error.
> > > > > + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
> > > > > + * * ``-ENODATA``    : Checksum was not validated
> > > > > + */
> > > > > +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
> > > > 
> > > > Istead of ENODATA should we return what would be put in the ip_summed field
> > > > CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,
> > 
> > I was thinking the same, what about checksum "type".
> > 
> > > > 
> > > >   bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
> > > > 
> > > > or something like that? Or is the thought that its not really necessary?
> > > > I don't have a strong preference but figured it was worth asking.
> > > > 
> > > 
> > > I see no value in returning CHECKSUM_COMPLETE without the actual checksum value.
> > > Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would
> > > overcomplicate the function signature.
> > 
> > So, this kfunc bpf_xdp_metadata_rx_csum_lvl() success is it equivilent to
> > CHECKSUM_UNNECESSARY?
> 
> This is 100% true for physical NICs, it's more complicated for veth, bacause it 
> often receives CHECKSUM_PARTIAL, which shouldn't normally apprear on RX, but is 
> treated by the network stack as a validated checksum, because there is no way 
> internally generated packet could be messed up. I would be grateful if you could 
> look at the veth patch and share your opinion about this.
> 
> > 
> > Looking at documentation[1] (generated from skbuff.h):
> >  [1] https://kernel.org/doc/html/latest/networking/skbuff.html#checksumming-of-received-packets-by-device
> > 
> > Is the idea that we can add another kfunc (new signature) than can deal
> > with the other types of checksums (in a later kernel release)?
> >
> 
> Yes, that is the idea.

If we think there is a chance we might need another kfunc, we should fold that
functionality into this same kfunc. It would be unfortunate to have to do two kfuncs when
one would work. It shouldn't cost much/anything(?) to hardcode the type for
most cases? I think if we need it later I would advocate for updating this
kfunc to support it. Of course then userspace will have to swivel on the
kfunc signature.
Jesper Dangaard Brouer July 6, 2023, 9:04 a.m. UTC | #6
On 06/07/2023 07.50, John Fastabend wrote:
> Larysa Zaremba wrote:
>> On Tue, Jul 04, 2023 at 12:39:06PM +0200, Jesper Dangaard Brouer wrote:
>>> Cc. DaveM+Alex Duyck, as I value your insights on checksums.
>>>
>>> On 04/07/2023 11.24, Larysa Zaremba wrote:
>>>> On Mon, Jul 03, 2023 at 01:38:27PM -0700, John Fastabend wrote:
>>>>> Larysa Zaremba wrote:
>>>>>> Implement functionality that enables drivers to expose to XDP code,
>>>>>> whether checksums was checked and on what level.
>>>>>>
>>>>>> Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
>>>>>> ---
>>>>>>    Documentation/networking/xdp-rx-metadata.rst |  3 +++
>>>>>>    include/linux/netdevice.h                    |  1 +
>>>>>>    include/net/xdp.h                            |  2 ++
>>>>>>    kernel/bpf/offload.c                         |  2 ++
>>>>>>    net/core/xdp.c                               | 21 ++++++++++++++++++++
>>>>>>    5 files changed, 29 insertions(+)
>>>>>>
>>>>>> diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
>>>>>> index ea6dd79a21d3..4ec6ddfd2a52 100644
>>>>>> --- a/Documentation/networking/xdp-rx-metadata.rst
>>>>>> +++ b/Documentation/networking/xdp-rx-metadata.rst
>>>>>> @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
>>>>>>    .. kernel-doc:: net/core/xdp.c
>>>>>>       :identifiers: bpf_xdp_metadata_rx_vlan_tag
>>>>>> +.. kernel-doc:: net/core/xdp.c
>>>>>> +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
>>>>>> +
>>>>>>    An XDP program can use these kfuncs to read the metadata into stack
>>>>>>    variables for its own consumption. Or, to pass the metadata on to other
>>>>>>    consumers, an XDP program can store it into the metadata area carried
>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>>> index 4fa4380e6d89..569563687172 100644
>>>>>> --- a/include/linux/netdevice.h
>>>>>> +++ b/include/linux/netdevice.h
>>>>>> @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
>>>>>>    			       enum xdp_rss_hash_type *rss_type);
>>>>>>    	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
>>>>>>    				   __be16 *vlan_proto);
>>>>>> +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
>>>>>>    };
>>>>>>    /**
>>>>>> diff --git a/include/net/xdp.h b/include/net/xdp.h
>>>>>> index 89c58f56ffc6..61ed38fa79d1 100644
>>>>>> --- a/include/net/xdp.h
>>>>>> +++ b/include/net/xdp.h
>>>>>> @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
>>>>>>    			   bpf_xdp_metadata_rx_hash) \
>>>>>>    	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
>>>>>>    			   bpf_xdp_metadata_rx_vlan_tag) \
>>>>>> +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
>>>>>> +			   bpf_xdp_metadata_rx_csum_lvl) \
>>>>>>    enum {
>>>>>>    #define XDP_METADATA_KFUNC(name, _) name,
>>>>>> diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
>>>>>> index 986e7becfd42..a133fb775f49 100644
>>>>>> --- a/kernel/bpf/offload.c
>>>>>> +++ b/kernel/bpf/offload.c
>>>>>> @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
>>>>>>    		p = ops->xmo_rx_hash;
>>>>>>    	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
>>>>>>    		p = ops->xmo_rx_vlan_tag;
>>>>>> +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
>>>>>> +		p = ops->xmo_rx_csum_lvl;
>>>>>>    out:
>>>>>>    	up_read(&bpf_devs_lock);
>>>>>> diff --git a/net/core/xdp.c b/net/core/xdp.c
>>>>>> index f6262c90e45f..c666d3e0a26c 100644
>>>>>> --- a/net/core/xdp.c
>>>>>> +++ b/net/core/xdp.c
>>>>>> @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
>>>>>>    	return -EOPNOTSUPP;
>>>>>>    }
>>>>>> +/**
>>>>>> + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
>>>>>> + * @ctx: XDP context pointer.
>>>>>> + * @csum_level: Return value pointer.
>>>>>> + *
>>>>>> + * In case of success, csum_level contains depth of the last verified checksum.
>>>>>> + * If only the outermost checksum was verified, csum_level is 0, if both
>>>>>> + * encapsulation and inner transport checksums were verified, csum_level is 1,
>>>>>> + * and so on.
>>>>>> + * For more details, refer to csum_level field in sk_buff.
>>>>>> + *
>>>>>> + * Return:
>>>>>> + * * Returns 0 on success or ``-errno`` on error.
>>>>>> + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
>>>>>> + * * ``-ENODATA``    : Checksum was not validated
>>>>>> + */
>>>>>> +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
>>>>>
>>>>> Istead of ENODATA should we return what would be put in the ip_summed field
>>>>> CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,
>>>
>>> I was thinking the same, what about checksum "type".
>>>
>>>>>
>>>>>    bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
>>>>>
>>>>> or something like that? Or is the thought that its not really necessary?
>>>>> I don't have a strong preference but figured it was worth asking.
>>>>>
>>>>
>>>> I see no value in returning CHECKSUM_COMPLETE without the actual checksum value.
>>>> Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would
>>>> overcomplicate the function signature.
>>>
>>> So, this kfunc bpf_xdp_metadata_rx_csum_lvl() success is it equivilent to
>>> CHECKSUM_UNNECESSARY?
>>
>> This is 100% true for physical NICs, it's more complicated for veth, bacause it
>> often receives CHECKSUM_PARTIAL, which shouldn't normally apprear on RX, but is
>> treated by the network stack as a validated checksum, because there is no way
>> internally generated packet could be messed up. I would be grateful if you could
>> look at the veth patch and share your opinion about this.
>>
>>>
>>> Looking at documentation[1] (generated from skbuff.h):
>>>   [1] https://kernel.org/doc/html/latest/networking/skbuff.html#checksumming-of-received-packets-by-device
>>>
>>> Is the idea that we can add another kfunc (new signature) than can deal
>>> with the other types of checksums (in a later kernel release)?
>>>
>>
>> Yes, that is the idea.
> 
> If we think there is a chance we might need another kfunc we should add it
> in the same kfunc. It would be unfortunate to have to do two kfuncs when
> one would work. It shouldn't cost much/anything(?) to hardcode the type for
> most cases? I think if we need it later I would advocate for updating this
> kfunc to support it. Of course then userspace will have to swivel on the
> kfunc signature.
> 

I think it might make sense to have 3 kfuncs for checksumming,
as this would allow the BPF-prog to focus on CHECKSUM_UNNECESSARY, and then
only call an additional kfunc for extracting e.g. csum_start + csum_offset
when the type is CHECKSUM_PARTIAL.

We could extend bpf_xdp_metadata_rx_csum_lvl() to give the csum_type
CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}.

  int bpf_xdp_metadata_rx_csum_lvl(*ctx, u8 *csum_level, u8 *csum_type)

And then add two kfuncs, e.g.:
  (1) bpf_xdp_metadata_rx_csum_partial(ctx, start, offset)
  (2) bpf_xdp_metadata_rx_csum_complete(ctx, csum)

Pseudo BPF-prog code:

  err = bpf_xdp_metadata_rx_csum_lvl(ctx, level, type);
  if (!err && type != CHECKSUM_UNNECESSARY) {
      if (type == CHECKSUM_PARTIAL)
          err = bpf_xdp_metadata_rx_csum_partial(ctx, start, offset);
      if (type == CHECKSUM_COMPLETE)
          err = bpf_xdp_metadata_rx_csum_complete(ctx, csum);
  }

Looking at code, I feel we could rename [...]_csum_lvl to csum_type.
E.g. bpf_xdp_metadata_rx_csum_type.

Feel free to disagree,
--Jesper
Larysa Zaremba July 6, 2023, 12:38 p.m. UTC | #7
On Thu, Jul 06, 2023 at 11:04:49AM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 06/07/2023 07.50, John Fastabend wrote:
> > Larysa Zaremba wrote:
> > > On Tue, Jul 04, 2023 at 12:39:06PM +0200, Jesper Dangaard Brouer wrote:
> > > > Cc. DaveM+Alex Duyck, as I value your insights on checksums.
> > > > 
> > > > On 04/07/2023 11.24, Larysa Zaremba wrote:
> > > > > On Mon, Jul 03, 2023 at 01:38:27PM -0700, John Fastabend wrote:
> > > > > > Larysa Zaremba wrote:
> > > > > > > Implement functionality that enables drivers to expose to XDP code,
> > > > > > > whether checksums was checked and on what level.
> > > > > > > 
> > > > > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > > > > ---
> > > > > > >    Documentation/networking/xdp-rx-metadata.rst |  3 +++
> > > > > > >    include/linux/netdevice.h                    |  1 +
> > > > > > >    include/net/xdp.h                            |  2 ++
> > > > > > >    kernel/bpf/offload.c                         |  2 ++
> > > > > > >    net/core/xdp.c                               | 21 ++++++++++++++++++++
> > > > > > >    5 files changed, 29 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
> > > > > > > index ea6dd79a21d3..4ec6ddfd2a52 100644
> > > > > > > --- a/Documentation/networking/xdp-rx-metadata.rst
> > > > > > > +++ b/Documentation/networking/xdp-rx-metadata.rst
> > > > > > > @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
> > > > > > >    .. kernel-doc:: net/core/xdp.c
> > > > > > >       :identifiers: bpf_xdp_metadata_rx_vlan_tag
> > > > > > > +.. kernel-doc:: net/core/xdp.c
> > > > > > > +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
> > > > > > > +
> > > > > > >    An XDP program can use these kfuncs to read the metadata into stack
> > > > > > >    variables for its own consumption. Or, to pass the metadata on to other
> > > > > > >    consumers, an XDP program can store it into the metadata area carried
> > > > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > > > > > index 4fa4380e6d89..569563687172 100644
> > > > > > > --- a/include/linux/netdevice.h
> > > > > > > +++ b/include/linux/netdevice.h
> > > > > > > @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
> > > > > > >    			       enum xdp_rss_hash_type *rss_type);
> > > > > > >    	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
> > > > > > >    				   __be16 *vlan_proto);
> > > > > > > +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
> > > > > > >    };
> > > > > > >    /**
> > > > > > > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > > > > > > index 89c58f56ffc6..61ed38fa79d1 100644
> > > > > > > --- a/include/net/xdp.h
> > > > > > > +++ b/include/net/xdp.h
> > > > > > > @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> > > > > > >    			   bpf_xdp_metadata_rx_hash) \
> > > > > > >    	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
> > > > > > >    			   bpf_xdp_metadata_rx_vlan_tag) \
> > > > > > > +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
> > > > > > > +			   bpf_xdp_metadata_rx_csum_lvl) \
> > > > > > >    enum {
> > > > > > >    #define XDP_METADATA_KFUNC(name, _) name,
> > > > > > > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> > > > > > > index 986e7becfd42..a133fb775f49 100644
> > > > > > > --- a/kernel/bpf/offload.c
> > > > > > > +++ b/kernel/bpf/offload.c
> > > > > > > @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
> > > > > > >    		p = ops->xmo_rx_hash;
> > > > > > >    	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
> > > > > > >    		p = ops->xmo_rx_vlan_tag;
> > > > > > > +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
> > > > > > > +		p = ops->xmo_rx_csum_lvl;
> > > > > > >    out:
> > > > > > >    	up_read(&bpf_devs_lock);
> > > > > > > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > > > > > > index f6262c90e45f..c666d3e0a26c 100644
> > > > > > > --- a/net/core/xdp.c
> > > > > > > +++ b/net/core/xdp.c
> > > > > > > @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
> > > > > > >    	return -EOPNOTSUPP;
> > > > > > >    }
> > > > > > > +/**
> > > > > > > + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
> > > > > > > + * @ctx: XDP context pointer.
> > > > > > > + * @csum_level: Return value pointer.
> > > > > > > + *
> > > > > > > + * In case of success, csum_level contains depth of the last verified checksum.
> > > > > > > + * If only the outermost checksum was verified, csum_level is 0, if both
> > > > > > > + * encapsulation and inner transport checksums were verified, csum_level is 1,
> > > > > > > + * and so on.
> > > > > > > + * For more details, refer to csum_level field in sk_buff.
> > > > > > > + *
> > > > > > > + * Return:
> > > > > > > + * * Returns 0 on success or ``-errno`` on error.
> > > > > > > + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
> > > > > > > + * * ``-ENODATA``    : Checksum was not validated
> > > > > > > + */
> > > > > > > +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
> > > > > > 
> > > > > > Istead of ENODATA should we return what would be put in the ip_summed field
> > > > > > CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,
> > > > 
> > > > I was thinking the same, what about checksum "type".
> > > > 
> > > > > > 
> > > > > >    bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
> > > > > > 
> > > > > > or something like that? Or is the thought that its not really necessary?
> > > > > > I don't have a strong preference but figured it was worth asking.
> > > > > > 
> > > > > 
> > > > > I see no value in returning CHECKSUM_COMPLETE without the actual checksum value.
> > > > > Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would
> > > > > overcomplicate the function signature.
> > > > 
> > > > So, this kfunc bpf_xdp_metadata_rx_csum_lvl() success is it equivilent to
> > > > CHECKSUM_UNNECESSARY?
> > > 
> > > This is 100% true for physical NICs, it's more complicated for veth, bacause it
> > > often receives CHECKSUM_PARTIAL, which shouldn't normally apprear on RX, but is
> > > treated by the network stack as a validated checksum, because there is no way
> > > internally generated packet could be messed up. I would be grateful if you could
> > > look at the veth patch and share your opinion about this.
> > > 
> > > > 
> > > > Looking at documentation[1] (generated from skbuff.h):
> > > >   [1] https://kernel.org/doc/html/latest/networking/skbuff.html#checksumming-of-received-packets-by-device
> > > > 
> > > > Is the idea that we can add another kfunc (new signature) than can deal
> > > > with the other types of checksums (in a later kernel release)?
> > > > 
> > > 
> > > Yes, that is the idea.
> > 
> > If we think there is a chance we might need another kfunc we should add it
> > in the same kfunc. It would be unfortunate to have to do two kfuncs when
> > one would work. It shouldn't cost much/anything(?) to hardcode the type for
> > most cases? I think if we need it later I would advocate for updating this
> > kfunc to support it. Of course then userspace will have to swivel on the
> > kfunc signature.
> > 
> 
> I think it might make sense to have 3 kfuncs for checksumming.
> As this would allow BPF-prog to focus on CHECKSUM_UNNECESSARY, and then
> only call additional kfunc for extracting e.g csum_start  + csum_offset
> when type is CHECKSUM_PARTIAL.
> 
> We could extend bpf_xdp_metadata_rx_csum_lvl() to give the csum_type
> CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}.
> 
>  int bpf_xdp_metadata_rx_csum_lvl(*ctx, u8 *csum_level, u8 *csum_type)
> 
> And then add two kfunc e.g.
>  (1) bpf_xdp_metadata_rx_csum_partial(ctx, start, offset)
>  (2) bpf_xdp_metadata_rx_csum_complete(ctx, csum)
> 
> Pseudo BPF-prog code:
> 
>  err = bpf_xdp_metadata_rx_csum_lvl(ctx, level, type);
>  if (!err && type != CHECKSUM_UNNECESSARY) {
>      if (type == CHECKSUM_PARTIAL)
>          err = bpf_xdp_metadata_rx_csum_partial(ctx, start, offset);
>      if (type == CHECKSUM_COMPLETE)
>          err = bpf_xdp_metadata_rx_csum_complete(ctx, csum);
>  }
> 
> Looking at code, I feel we could rename [...]_csum_lvl to csum_type.
> E.g. bpf_xdp_metadata_rx_csum_type.
>

What about:

union csum_info {
	struct {
		u16 csum_start;
		u16 csum_offset;
	};
	u32 checksum;
	u8 checksum_level;
};

bpf_xdp_metadata_rx_csum(*ctx, u8 *csum_status, union csum_info *info);

One thing that is worth considering, in my opinion, is whether some hardware can
provide both CHECKSUM_UNNECESSARY and CHECKSUM_COMPLETE. Judging by [0], this
does occur. In such cases, using an enum to represent the checksum status would
artificially limit the capabilities. Now, imagine the situation:

- You want to use your XDP program with 2 different NICs

[...]

err = bpf_xdp_metadata_rx_csum(*ctx, &status, &info);
if (!err && status == CHECKSUM_UNNECESSARY)
	/* Do stuff */

[...]
- One NIC can both calculate CHECKSUM_COMPLETE and parse headers, another one
  is only able to parse headers. Those can be very similar NICs from different
  generations.
- You test your program on the simpler NIC, and the program works fine.
- You test your program on the more advanced one and suddenly you need an
  'else if' case with some additional calculations.

Please let me know whether this makes sense :D and if so, we can work out a solution.

> Feel free to disagree,
> --Jesper
> 
>
Larysa Zaremba July 6, 2023, 12:49 p.m. UTC | #8
On Thu, Jul 06, 2023 at 02:38:33PM +0200, Larysa Zaremba wrote:
> On Thu, Jul 06, 2023 at 11:04:49AM +0200, Jesper Dangaard Brouer wrote:
> > 
> > 
> > On 06/07/2023 07.50, John Fastabend wrote:
> > > Larysa Zaremba wrote:
> > > > On Tue, Jul 04, 2023 at 12:39:06PM +0200, Jesper Dangaard Brouer wrote:
> > > > > Cc. DaveM+Alex Duyck, as I value your insights on checksums.
> > > > > 
> > > > > On 04/07/2023 11.24, Larysa Zaremba wrote:
> > > > > > On Mon, Jul 03, 2023 at 01:38:27PM -0700, John Fastabend wrote:
> > > > > > > Larysa Zaremba wrote:
> > > > > > > > Implement functionality that enables drivers to expose to XDP code,
> > > > > > > > whether checksums was checked and on what level.
> > > > > > > > 
> > > > > > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > > > > > ---
> > > > > > > >    Documentation/networking/xdp-rx-metadata.rst |  3 +++
> > > > > > > >    include/linux/netdevice.h                    |  1 +
> > > > > > > >    include/net/xdp.h                            |  2 ++
> > > > > > > >    kernel/bpf/offload.c                         |  2 ++
> > > > > > > >    net/core/xdp.c                               | 21 ++++++++++++++++++++
> > > > > > > >    5 files changed, 29 insertions(+)
> > > > > > > > 
> > > > > > > > diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
> > > > > > > > index ea6dd79a21d3..4ec6ddfd2a52 100644
> > > > > > > > --- a/Documentation/networking/xdp-rx-metadata.rst
> > > > > > > > +++ b/Documentation/networking/xdp-rx-metadata.rst
> > > > > > > > @@ -26,6 +26,9 @@ metadata is supported, this set will grow:
> > > > > > > >    .. kernel-doc:: net/core/xdp.c
> > > > > > > >       :identifiers: bpf_xdp_metadata_rx_vlan_tag
> > > > > > > > +.. kernel-doc:: net/core/xdp.c
> > > > > > > > +   :identifiers: bpf_xdp_metadata_rx_csum_lvl
> > > > > > > > +
> > > > > > > >    An XDP program can use these kfuncs to read the metadata into stack
> > > > > > > >    variables for its own consumption. Or, to pass the metadata on to other
> > > > > > > >    consumers, an XDP program can store it into the metadata area carried
> > > > > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > > > > > > index 4fa4380e6d89..569563687172 100644
> > > > > > > > --- a/include/linux/netdevice.h
> > > > > > > > +++ b/include/linux/netdevice.h
> > > > > > > > @@ -1660,6 +1660,7 @@ struct xdp_metadata_ops {
> > > > > > > >    			       enum xdp_rss_hash_type *rss_type);
> > > > > > > >    	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
> > > > > > > >    				   __be16 *vlan_proto);
> > > > > > > > +	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
> > > > > > > >    };
> > > > > > > >    /**
> > > > > > > > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > > > > > > > index 89c58f56ffc6..61ed38fa79d1 100644
> > > > > > > > --- a/include/net/xdp.h
> > > > > > > > +++ b/include/net/xdp.h
> > > > > > > > @@ -391,6 +391,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> > > > > > > >    			   bpf_xdp_metadata_rx_hash) \
> > > > > > > >    	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
> > > > > > > >    			   bpf_xdp_metadata_rx_vlan_tag) \
> > > > > > > > +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
> > > > > > > > +			   bpf_xdp_metadata_rx_csum_lvl) \
> > > > > > > >    enum {
> > > > > > > >    #define XDP_METADATA_KFUNC(name, _) name,
> > > > > > > > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> > > > > > > > index 986e7becfd42..a133fb775f49 100644
> > > > > > > > --- a/kernel/bpf/offload.c
> > > > > > > > +++ b/kernel/bpf/offload.c
> > > > > > > > @@ -850,6 +850,8 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
> > > > > > > >    		p = ops->xmo_rx_hash;
> > > > > > > >    	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
> > > > > > > >    		p = ops->xmo_rx_vlan_tag;
> > > > > > > > +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
> > > > > > > > +		p = ops->xmo_rx_csum_lvl;
> > > > > > > >    out:
> > > > > > > >    	up_read(&bpf_devs_lock);
> > > > > > > > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > > > > > > > index f6262c90e45f..c666d3e0a26c 100644
> > > > > > > > --- a/net/core/xdp.c
> > > > > > > > +++ b/net/core/xdp.c
> > > > > > > > @@ -758,6 +758,27 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
> > > > > > > >    	return -EOPNOTSUPP;
> > > > > > > >    }
> > > > > > > > +/**
> > > > > > > > + * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
> > > > > > > > + * @ctx: XDP context pointer.
> > > > > > > > + * @csum_level: Return value pointer.
> > > > > > > > + *
> > > > > > > > + * In case of success, csum_level contains depth of the last verified checksum.
> > > > > > > > + * If only the outermost checksum was verified, csum_level is 0, if both
> > > > > > > > + * encapsulation and inner transport checksums were verified, csum_level is 1,
> > > > > > > > + * and so on.
> > > > > > > > + * For more details, refer to csum_level field in sk_buff.
> > > > > > > > + *
> > > > > > > > + * Return:
> > > > > > > > + * * Returns 0 on success or ``-errno`` on error.
> > > > > > > > + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
> > > > > > > > + * * ``-ENODATA``    : Checksum was not validated
> > > > > > > > + */
> > > > > > > > +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
> > > > > > > 
> > > > > > > Istead of ENODATA should we return what would be put in the ip_summed field
> > > > > > > CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,
> > > > > 
> > > > > I was thinking the same, what about checksum "type".
> > > > > 
> > > > > > > 
> > > > > > >    bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
> > > > > > > 
> > > > > > > or something like that? Or is the thought that its not really necessary?
> > > > > > > I don't have a strong preference but figured it was worth asking.
> > > > > > > 
> > > > > > 
> > > > > > I see no value in returning CHECKSUM_COMPLETE without the actual checksum value.
> > > > > > Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would
> > > > > > overcomplicate the function signature.
> > > > > 
> > > > > So, this kfunc bpf_xdp_metadata_rx_csum_lvl() success is it equivilent to
> > > > > CHECKSUM_UNNECESSARY?
> > > > 
> > > > This is 100% true for physical NICs, it's more complicated for veth, bacause it
> > > > often receives CHECKSUM_PARTIAL, which shouldn't normally apprear on RX, but is
> > > > treated by the network stack as a validated checksum, because there is no way
> > > > internally generated packet could be messed up. I would be grateful if you could
> > > > look at the veth patch and share your opinion about this.
> > > > 
> > > > > 
> > > > > Looking at documentation[1] (generated from skbuff.h):
> > > > >   [1] https://kernel.org/doc/html/latest/networking/skbuff.html#checksumming-of-received-packets-by-device
> > > > > 
> > > > > Is the idea that we can add another kfunc (new signature) than can deal
> > > > > with the other types of checksums (in a later kernel release)?
> > > > > 
> > > > 
> > > > Yes, that is the idea.
> > > 
> > > If we think there is a chance we might need another kfunc we should add it
> > > in the same kfunc. It would be unfortunate to have to do two kfuncs when
> > > one would work. It shouldn't cost much/anything(?) to hardcode the type for
> > > most cases? I think if we need it later I would advocate for updating this
> > > kfunc to support it. Of course then userspace will have to swivel on the
> > > kfunc signature.
> > > 
> > 
> > I think it might make sense to have 3 kfuncs for checksumming.
> > As this would allow BPF-prog to focus on CHECKSUM_UNNECESSARY, and then
> > only call additional kfunc for extracting e.g csum_start  + csum_offset
> > when type is CHECKSUM_PARTIAL.
> > 
> > We could extend bpf_xdp_metadata_rx_csum_lvl() to give the csum_type
> > CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}.
> > 
> >  int bpf_xdp_metadata_rx_csum_lvl(*ctx, u8 *csum_level, u8 *csum_type)
> > 
> > And then add two kfunc e.g.
> >  (1) bpf_xdp_metadata_rx_csum_partial(ctx, start, offset)
> >  (2) bpf_xdp_metadata_rx_csum_complete(ctx, csum)
> > 
> > Pseudo BPF-prog code:
> > 
> >  err = bpf_xdp_metadata_rx_csum_lvl(ctx, level, type);
> >  if (!err && type != CHECKSUM_UNNECESSARY) {
> >      if (type == CHECKSUM_PARTIAL)
> >          err = bpf_xdp_metadata_rx_csum_partial(ctx, start, offset);
> >      if (type == CHECKSUM_COMPLETE)
> >          err = bpf_xdp_metadata_rx_csum_complete(ctx, csum);
> >  }
> > 
> > Looking at code, I feel we could rename [...]_csum_lvl to csum_type.
> > E.g. bpf_xdp_metadata_rx_csum_type.
> >
> 
> What about:
> 
> union csum_info {
> 	struct {
> 		u16 csum_start;
> 		u16 csum_offset;
> 	};
> 	u32 checksum;
> 	u8 checksum_level;
> };
> 
> bpf_xdp_metadata_rx_csum(*ctx, u8 *csum_status, union csum_info *info);
> 
> One thing that is worth considering in my opinion is whether some hardware can 
> provide both CHECKSUM_UNNECESSARY and CHECKSUM_COMPLETE. Judging by [0], this 
> does occur. I such cases using an enum to represent the checksum status would 
> artificially limit the capabilities. Now, imagine the situation:
> 
> - You want to use your XDP program with 2 different NICs
> 
> [...]
> 
> err = bpf_xdp_metadata_rx_csum(*ctx, &status, &info);
> if (!err && status == CHECKSUM_UNNECESSARY)
> 	/* Do stuff */
> 
> [...]
> - One NIC can both calculate CHECKSUM_COMPLETE and parse headers, another one 
>   is only able to parse headers. Those can be very similar NICs from different 
>   generation.
> - You test your program on the simpler NIC, program works fine.
> - You tests your program on the more advanced one and suddenly you need an 
>   'else if' case with some additional calculations.
> 
> Please write, whether this makes sense :D and if so, we can work out a solution.
>

Forgot the link:
[0] https://elixir.bootlin.com/linux/v6.4.2/source/include/linux/skbuff.h#L143
 
> > Feel free to disagree,
> > --Jesper
> > 
> > 
>
Alexander Lobakin July 10, 2023, 4:58 p.m. UTC | #9
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Thu, 6 Jul 2023 14:49:44 +0200

> On Thu, Jul 06, 2023 at 02:38:33PM +0200, Larysa Zaremba wrote:
>> On Thu, Jul 06, 2023 at 11:04:49AM +0200, Jesper Dangaard Brouer wrote:
>>>
>>>
>>> On 06/07/2023 07.50, John Fastabend wrote:
>>>> Larysa Zaremba wrote:
>>>>> On Tue, Jul 04, 2023 at 12:39:06PM +0200, Jesper Dangaard Brouer wrote:
>>>>>> Cc. DaveM+Alex Duyck, as I value your insights on checksums.

[...]

>>>>>>>>> + * Return:
>>>>>>>>> + * * Returns 0 on success or ``-errno`` on error.
>>>>>>>>> + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
>>>>>>>>> + * * ``-ENODATA``    : Checksum was not validated
>>>>>>>>> + */
>>>>>>>>> +__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
>>>>>>>>
>>>>>>>> Istead of ENODATA should we return what would be put in the ip_summed field
>>>>>>>> CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}? Then sig would be,
>>>>>>
>>>>>> I was thinking the same, what about checksum "type".
>>>>>>
>>>>>>>>
>>>>>>>>    bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *type, u8 *lvl);
>>>>>>>>
>>>>>>>> or something like that? Or is the thought that its not really necessary?
>>>>>>>> I don't have a strong preference but figured it was worth asking.
>>>>>>>>
>>>>>>>
>>>>>>> I see no value in returning CHECKSUM_COMPLETE without the actual checksum value.
>>>>>>> Same with CHECKSUM_PARTIAL and csum_start. Returning those values too would
>>>>>>> overcomplicate the function signature.
>>>>>>
>>>>>> So, this kfunc bpf_xdp_metadata_rx_csum_lvl() success is it equivilent to
>>>>>> CHECKSUM_UNNECESSARY?
>>>>>
>>>>> This is 100% true for physical NICs, it's more complicated for veth, bacause it
>>>>> often receives CHECKSUM_PARTIAL, which shouldn't normally apprear on RX, but is
>>>>> treated by the network stack as a validated checksum, because there is no way
>>>>> internally generated packet could be messed up. I would be grateful if you could
>>>>> look at the veth patch and share your opinion about this.
>>>>>
>>>>>>
>>>>>> Looking at documentation[1] (generated from skbuff.h):
>>>>>>   [1] https://kernel.org/doc/html/latest/networking/skbuff.html#checksumming-of-received-packets-by-device
>>>>>>
>>>>>> Is the idea that we can add another kfunc (new signature) than can deal
>>>>>> with the other types of checksums (in a later kernel release)?
>>>>>>
>>>>>
>>>>> Yes, that is the idea.
>>>>
>>>> If we think there is a chance we might need another kfunc we should add it
>>>> in the same kfunc. It would be unfortunate to have to do two kfuncs when
>>>> one would work. It shouldn't cost much/anything(?) to hardcode the type for
>>>> most cases? I think if we need it later I would advocate for updating this
>>>> kfunc to support it. Of course then userspace will have to swivel on the
>>>> kfunc signature.
>>>>
>>>
>>> I think it might make sense to have 3 kfuncs for checksumming.

Isn't that overcomplicating? 3 callbacks for just one damn thing. IOW I
agree with John.

PARTIAL and COMPLETE are mutually exclusive. Their "additional" output
can be unionized. Level is 2 bits, status is 2 bits. Level makes sense
only with UNNECESSARY (correct me if I'm wrong).
IOW the kfunc could return:

-errno - not implemented or something went wrong
0 - none
1 - complete
2 - partial
3 + lvl - unnecessary

(CHECKSUM_* defs could be shuffled accordingly)

Then `if (ret > 2)` would mean UNNECESSARY and most programs could stop
here already. Programs wanting to extract the level can do `ret - 3`.
One additional pointer to u32 (union) to fetch additional data. I would
even say "BPF prog can pass NULL if it doesn't care", but OTOH I dunno
how to validate PARTIAL then :D (COMPLETE usually assumes it's valid)

>>> As this would allow BPF-prog to focus on CHECKSUM_UNNECESSARY, and then
>>> only call additional kfunc for extracting e.g csum_start  + csum_offset
>>> when type is CHECKSUM_PARTIAL.
>>>
>>> We could extend bpf_xdp_metadata_rx_csum_lvl() to give the csum_type
>>> CHECKSUM_{NONE, UNNECESSARY, COMPLETE, PARTIAL}.
>>>
>>>  int bpf_xdp_metadata_rx_csum_lvl(*ctx, u8 *csum_level, u8 *csum_type)
>>>
>>> And then add two kfunc e.g.
>>>  (1) bpf_xdp_metadata_rx_csum_partial(ctx, start, offset)
>>>  (2) bpf_xdp_metadata_rx_csum_complete(ctx, csum)
>>>
>>> Pseudo BPF-prog code:
>>>
>>>  err = bpf_xdp_metadata_rx_csum_lvl(ctx, level, type);
>>>  if (!err && type != CHECKSUM_UNNECESSARY) {

And hurt cool HW which by default returns COMPLETE? }:>

>>>      if (type == CHECKSUM_PARTIAL)
>>>          err = bpf_xdp_metadata_rx_csum_partial(ctx, start, offset);
>>>      if (type == CHECKSUM_COMPLETE)
>>>          err = bpf_xdp_metadata_rx_csum_complete(ctx, csum);

I don't feel like 1 hotpath `if` is worth multiplying kfuncs.

[...]

Thanks,
Olek
diff mbox series

Patch

diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
index ea6dd79a21d3..4ec6ddfd2a52 100644
--- a/Documentation/networking/xdp-rx-metadata.rst
+++ b/Documentation/networking/xdp-rx-metadata.rst
@@ -26,6 +26,9 @@  metadata is supported, this set will grow:
 .. kernel-doc:: net/core/xdp.c
    :identifiers: bpf_xdp_metadata_rx_vlan_tag
 
+.. kernel-doc:: net/core/xdp.c
+   :identifiers: bpf_xdp_metadata_rx_csum_lvl
+
 An XDP program can use these kfuncs to read the metadata into stack
 variables for its own consumption. Or, to pass the metadata on to other
 consumers, an XDP program can store it into the metadata area carried
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4fa4380e6d89..569563687172 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1660,6 +1660,7 @@  struct xdp_metadata_ops {
 			       enum xdp_rss_hash_type *rss_type);
 	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, u16 *vlan_tag,
 				   __be16 *vlan_proto);
+	int	(*xmo_rx_csum_lvl)(const struct xdp_md *ctx, u8 *csum_level);
 };
 
 /**
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 89c58f56ffc6..61ed38fa79d1 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -391,6 +391,8 @@  void xdp_attachment_setup(struct xdp_attachment_info *info,
 			   bpf_xdp_metadata_rx_hash) \
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
 			   bpf_xdp_metadata_rx_vlan_tag) \
+	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM_LVL, \
+			   bpf_xdp_metadata_rx_csum_lvl) \
 
 enum {
 #define XDP_METADATA_KFUNC(name, _) name,
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 986e7becfd42..a133fb775f49 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -850,6 +850,8 @@  void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
 		p = ops->xmo_rx_hash;
 	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_VLAN_TAG))
 		p = ops->xmo_rx_vlan_tag;
+	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_CSUM_LVL))
+		p = ops->xmo_rx_csum_lvl;
 out:
 	up_read(&bpf_devs_lock);
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index f6262c90e45f..c666d3e0a26c 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -758,6 +758,27 @@  __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, u16 *vlan
 	return -EOPNOTSUPP;
 }
 
+/**
+ * bpf_xdp_metadata_rx_csum_lvl - Get depth at which HW has checked the checksum.
+ * @ctx: XDP context pointer.
+ * @csum_level: Return value pointer.
+ *
+ * In case of success, csum_level contains depth of the last verified checksum.
+ * If only the outermost checksum was verified, csum_level is 0; if both
+ * encapsulation and inner transport checksums were verified, csum_level is 1,
+ * and so on.
+ * For more details, refer to csum_level field in sk_buff.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
+ * * ``-ENODATA``    : Checksum was not validated
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_csum_lvl(const struct xdp_md *ctx, u8 *csum_level)
+{
+	return -EOPNOTSUPP;
+}
+
 __diag_pop();
 
 BTF_SET8_START(xdp_metadata_kfunc_ids)