diff mbox series

[net,v2] xdp, net: fix for construct skb by xdp inside xsk zc rx

Message ID 20210617145534.101458-1-xuanzhuo@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series [net,v2] xdp, net: fix for construct skb by xdp inside xsk zc rx | expand

Commit Message

Xuan Zhuo June 17, 2021, 2:55 p.m. UTC
When each driver supports xsk rx, if the received buff returns XDP_PASS
after run xdp prog, it must construct skb based on xdp. This patch
extracts this logic into a public function xdp_construct_skb().

There is a bug in the original logic. When constructing skb, we should
copy the meta information to skb and then use __skb_pull() to correct
the data.

Fixes: 0a714186d3c0f ("i40e: add AF_XDP zero-copy Rx support")
Fixes: 2d4238f556972 ("ice: Add support for AF_XDP")
Fixes: bba2556efad66 ("net: stmmac: Enable RX via AF_XDP zero-copy")
Fixes: d0bcacd0a1309 ("ixgbe: add AF_XDP zero-copy Rx support")
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 16 +---------
 drivers/net/ethernet/intel/ice/ice_xsk.c      | 12 +-------
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  | 12 +-------
 .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 +-------------
 include/net/xdp.h                             | 30 +++++++++++++++++++
 5 files changed, 34 insertions(+), 59 deletions(-)

Comments

Maciej Fijalkowski June 28, 2021, 10:47 a.m. UTC | #1
On Thu, Jun 17, 2021 at 10:55:34PM +0800, Xuan Zhuo wrote:
> When each driver supports xsk rx, if the received buff returns XDP_PASS
> after run xdp prog, it must construct skb based on xdp. This patch
> extracts this logic into a public function xdp_construct_skb().
> 
> There is a bug in the original logic. When constructing skb, we should
> copy the meta information to skb and then use __skb_pull() to correct
> the data.

Thanks for fixing the bug on Intel drivers, Xuan. However, together with
Magnus we feel that include/net/xdp.h is not a correct place for
introducing xdp_construct_skb. If mlx side could use it, then probably
include/net/xdp_sock_drv.h is a better fit for that.

Once again, CCing Maxim.
Maxim, any chances that mlx driver could be aligned in a way that we could
have a common function for creating skb on ZC path?

Otherwise, maybe we should think about introducing the Intel-specific
common header in tree?

> 
> Fixes: 0a714186d3c0f ("i40e: add AF_XDP zero-copy Rx support")
> Fixes: 2d4238f556972 ("ice: Add support for AF_XDP")
> Fixes: bba2556efad66 ("net: stmmac: Enable RX via AF_XDP zero-copy")
> Fixes: d0bcacd0a1309 ("ixgbe: add AF_XDP zero-copy Rx support")
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 16 +---------
>  drivers/net/ethernet/intel/ice/ice_xsk.c      | 12 +-------
>  drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  | 12 +-------
>  .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 +-------------
>  include/net/xdp.h                             | 30 +++++++++++++++++++
>  5 files changed, 34 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> index 68f177a86403..81b0f44eedda 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> @@ -246,23 +246,9 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
>  static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
>  					     struct xdp_buff *xdp)
>  {
> -	unsigned int metasize = xdp->data - xdp->data_meta;
> -	unsigned int datasize = xdp->data_end - xdp->data;
>  	struct sk_buff *skb;
>  
> -	/* allocate a skb to store the frags */
> -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
> -			       xdp->data_end - xdp->data_hard_start,
> -			       GFP_ATOMIC | __GFP_NOWARN);
> -	if (unlikely(!skb))
> -		goto out;
> -
> -	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> -	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
> -	if (metasize)
> -		skb_metadata_set(skb, metasize);
> -
> -out:
> +	skb = xdp_construct_skb(xdp, &rx_ring->q_vector->napi);
>  	xsk_buff_free(xdp);
>  	return skb;
>  }
> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> index a1f89ea3c2bd..f95e1adcebda 100644
> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> @@ -430,22 +430,12 @@ static void ice_bump_ntc(struct ice_ring *rx_ring)
>  static struct sk_buff *
>  ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
>  {
> -	unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
> -	unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
> -	unsigned int datasize_hard = rx_buf->xdp->data_end -
> -				     rx_buf->xdp->data_hard_start;
>  	struct sk_buff *skb;
>  
> -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
> -			       GFP_ATOMIC | __GFP_NOWARN);
> +	skb = xdp_construct_skb(rx_buf->xdp, &rx_ring->q_vector->napi);
>  	if (unlikely(!skb))
>  		return NULL;
>  
> -	skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
> -	memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
> -	if (metasize)
> -		skb_metadata_set(skb, metasize);
> -
>  	xsk_buff_free(rx_buf->xdp);
>  	rx_buf->xdp = NULL;
>  	return skb;
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> index f72d2978263b..123945832c96 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> @@ -203,22 +203,12 @@ bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
>  static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
>  					      struct ixgbe_rx_buffer *bi)
>  {
> -	unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
> -	unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
>  	struct sk_buff *skb;
>  
> -	/* allocate a skb to store the frags */
> -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
> -			       bi->xdp->data_end - bi->xdp->data_hard_start,
> -			       GFP_ATOMIC | __GFP_NOWARN);
> +	skb = xdp_construct_skb(bi->xdp, &rx_ring->q_vector->napi);
>  	if (unlikely(!skb))
>  		return NULL;
>  
> -	skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
> -	memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
> -	if (metasize)
> -		skb_metadata_set(skb, metasize);
> -
>  	xsk_buff_free(bi->xdp);
>  	bi->xdp = NULL;
>  	return skb;
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index c87202cbd3d6..143ac1edb876 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -4729,27 +4729,6 @@ static void stmmac_finalize_xdp_rx(struct stmmac_priv *priv,
>  		xdp_do_flush();
>  }
>  
> -static struct sk_buff *stmmac_construct_skb_zc(struct stmmac_channel *ch,
> -					       struct xdp_buff *xdp)
> -{
> -	unsigned int metasize = xdp->data - xdp->data_meta;
> -	unsigned int datasize = xdp->data_end - xdp->data;
> -	struct sk_buff *skb;
> -
> -	skb = __napi_alloc_skb(&ch->rxtx_napi,
> -			       xdp->data_end - xdp->data_hard_start,
> -			       GFP_ATOMIC | __GFP_NOWARN);
> -	if (unlikely(!skb))
> -		return NULL;
> -
> -	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> -	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
> -	if (metasize)
> -		skb_metadata_set(skb, metasize);
> -
> -	return skb;
> -}
> -
>  static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
>  				   struct dma_desc *p, struct dma_desc *np,
>  				   struct xdp_buff *xdp)
> @@ -4761,7 +4740,7 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
>  	struct sk_buff *skb;
>  	u32 hash;
>  
> -	skb = stmmac_construct_skb_zc(ch, xdp);
> +	skb = xdp_construct_skb(xdp, &ch->rxtx_napi);
>  	if (!skb) {
>  		priv->dev->stats.rx_dropped++;
>  		return;
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index a5bc214a49d9..561e21eaf718 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -95,6 +95,36 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start,
>  	xdp->data_meta = meta_valid ? data : data + 1;
>  }
>  
> +static __always_inline struct sk_buff *
> +xdp_construct_skb(struct xdp_buff *xdp, struct napi_struct *napi)
> +{
> +	unsigned int metasize;
> +	unsigned int datasize;
> +	unsigned int headroom;
> +	struct sk_buff *skb;
> +	unsigned int len;
> +
> +	/* this include metasize */
> +	datasize = xdp->data_end  - xdp->data_meta;
> +	metasize = xdp->data      - xdp->data_meta;
> +	headroom = xdp->data_meta - xdp->data_hard_start;
> +	len      = xdp->data_end  - xdp->data_hard_start;
> +
> +	/* allocate a skb to store the frags */
> +	skb = __napi_alloc_skb(napi, len, GFP_ATOMIC | __GFP_NOWARN);
> +	if (unlikely(!skb))
> +		return NULL;
> +
> +	skb_reserve(skb, headroom);
> +	memcpy(__skb_put(skb, datasize), xdp->data_meta, datasize);
> +	if (metasize) {
> +		__skb_pull(skb, metasize);
> +		skb_metadata_set(skb, metasize);
> +	}
> +
> +	return skb;
> +}
> +
>  /* Reserve memory area at end-of data area.
>   *
>   * This macro reserves tailroom in the XDP buffer by limiting the
> -- 
> 2.31.0
>
Maxim Mikityanskiy June 29, 2021, 9:50 a.m. UTC | #2
On 2021-06-28 13:47, Maciej Fijalkowski wrote:
> On Thu, Jun 17, 2021 at 10:55:34PM +0800, Xuan Zhuo wrote:
>> When each driver supports xsk rx, if the received buff returns XDP_PASS
>> after run xdp prog, it must construct skb based on xdp. This patch
>> extracts this logic into a public function xdp_construct_skb().
>>
>> There is a bug in the original logic. When constructing skb, we should
>> copy the meta information to skb and then use __skb_pull() to correct
>> the data.
> 
> Thanks for fixing the bug on Intel drivers, Xuan. However, together with
> Magnus we feel that include/net/xdp.h is not a correct place for
> introducing xdp_construct_skb. If mlx side could use it, then probably
> include/net/xdp_sock_drv.h is a better fit for that.
> 
> Once again, CCing Maxim.
> Maxim, any chances that mlx driver could be aligned in a way that we could
> have a common function for creating skb on ZC path?

I'm sorry I missed the v1.

I have reviewed the differences between mlx5e_xsk_construct_skb and 
xdp_construct_skb. I would say it is possible for mlx5 to adapt and use 
the new API, but it may also require changes to xdp_construct_skb. 
Please see the list of differences below.

> 
> Otherwise, maybe we should think about introducing the Intel-specific
> common header in tree?

Sure, you can do it to share Intel-specific stuff between Intel drivers. 
However, in this particular case I think all XSK-enabled drivers would 
benefit from this function, especially after previous efforts that aimed 
to minimize the differences between drivers, the amount of code in the 
drivers and to share as much as possible. So, in my opinion, this stuff 
belongs to xdp_sock_drv.h. (Moreover, I see this patch also changes 
stmmac, so it's no longer Intel-specific.)

Differences between mlx5e_xsk_construct_skb and xdp_construct_skb:

1. __napi_alloc_skb is called with __GFP_NOWARN in xdp_construct_skb, 
but without this flag in mlx5. Why do we need to use non-default flags? 
Why can't we stick with napi_alloc_skb? I see only Intel drivers and XSK 
in stmmac use __napi_alloc_skb instead of napi_alloc_skb, and it looks 
to me as it was just copied from the regular (non-XSK) datapath of i40e 
(i40e_construct_skb) to i40e's XSK, then to stmmac, then to 
xdp_construct_skb, and I don't truly understand the reason of having 
__GFP_NOWARN where it first appeared (i40e_construct_skb). Could someone 
explain the reason for __GFP_NOWARN, so that we could decide whether we 
want it in a generic XSK helper?

2. skb_put in mlx5 vs __skb_put in xdp_construct_skb - shouldn't be a 
big deal, the difference is just an extra overflow check in skb_put.

3. XDP metadata. mlx5 calls xdp_set_data_meta_invalid, while other 
XSK-enabled drivers set metadata size to 0 and allow the XDP program to 
push some metadata. xdp_construct_skb only supports xdp_buffs with 
metadata, it will break if xdp_data_meta_unsupported. There are a few 
possible ways to address it:

3.1. Add a check for xdp_data_meta_unsupported to xdp_construct_skb. It 
will lift the undocumented limitation of this function and allow it to 
handle all valid kinds of xdp_buff.

3.2. Have two versions of xdp_construct_skb: one for xdp_buffs with 
metadata, the other for ones without metadata. Sounds ugly and not 
robust, but could spare a few CPU cycles for drivers that can't have 
metadata.

3.3. Remove xdp_set_data_meta_invalid from mlx5. I think the reason for 
this call was some design decision, rather than a technical limitation. 
On the other hand, even though it will allow mlx5 to work with 
xdp_construct_skb in its current implementation, it would still be nice 
to combine it with 3.1 to avoid having issues with future drivers (if 
not, at least document in a clear way that the xdp_buff parameter must 
have metadata). Tariq/Saeed, could you comment on this point?

Thanks,
Max

>>
>> Fixes: 0a714186d3c0f ("i40e: add AF_XDP zero-copy Rx support")
>> Fixes: 2d4238f556972 ("ice: Add support for AF_XDP")
>> Fixes: bba2556efad66 ("net: stmmac: Enable RX via AF_XDP zero-copy")
>> Fixes: d0bcacd0a1309 ("ixgbe: add AF_XDP zero-copy Rx support")
>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>> ---
>>   drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 16 +---------
>>   drivers/net/ethernet/intel/ice/ice_xsk.c      | 12 +-------
>>   drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  | 12 +-------
>>   .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 +-------------
>>   include/net/xdp.h                             | 30 +++++++++++++++++++
>>   5 files changed, 34 insertions(+), 59 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>> index 68f177a86403..81b0f44eedda 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>> @@ -246,23 +246,9 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
>>   static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
>>   					     struct xdp_buff *xdp)
>>   {
>> -	unsigned int metasize = xdp->data - xdp->data_meta;
>> -	unsigned int datasize = xdp->data_end - xdp->data;
>>   	struct sk_buff *skb;
>>   
>> -	/* allocate a skb to store the frags */
>> -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
>> -			       xdp->data_end - xdp->data_hard_start,
>> -			       GFP_ATOMIC | __GFP_NOWARN);
>> -	if (unlikely(!skb))
>> -		goto out;
>> -
>> -	skb_reserve(skb, xdp->data - xdp->data_hard_start);
>> -	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
>> -	if (metasize)
>> -		skb_metadata_set(skb, metasize);
>> -
>> -out:
>> +	skb = xdp_construct_skb(xdp, &rx_ring->q_vector->napi);
>>   	xsk_buff_free(xdp);
>>   	return skb;
>>   }
>> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
>> index a1f89ea3c2bd..f95e1adcebda 100644
>> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
>> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
>> @@ -430,22 +430,12 @@ static void ice_bump_ntc(struct ice_ring *rx_ring)
>>   static struct sk_buff *
>>   ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
>>   {
>> -	unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
>> -	unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
>> -	unsigned int datasize_hard = rx_buf->xdp->data_end -
>> -				     rx_buf->xdp->data_hard_start;
>>   	struct sk_buff *skb;
>>   
>> -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
>> -			       GFP_ATOMIC | __GFP_NOWARN);
>> +	skb = xdp_construct_skb(rx_buf->xdp, &rx_ring->q_vector->napi);
>>   	if (unlikely(!skb))
>>   		return NULL;
>>   
>> -	skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
>> -	memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
>> -	if (metasize)
>> -		skb_metadata_set(skb, metasize);
>> -
>>   	xsk_buff_free(rx_buf->xdp);
>>   	rx_buf->xdp = NULL;
>>   	return skb;
>> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
>> index f72d2978263b..123945832c96 100644
>> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
>> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
>> @@ -203,22 +203,12 @@ bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
>>   static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
>>   					      struct ixgbe_rx_buffer *bi)
>>   {
>> -	unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
>> -	unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
>>   	struct sk_buff *skb;
>>   
>> -	/* allocate a skb to store the frags */
>> -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
>> -			       bi->xdp->data_end - bi->xdp->data_hard_start,
>> -			       GFP_ATOMIC | __GFP_NOWARN);
>> +	skb = xdp_construct_skb(bi->xdp, &rx_ring->q_vector->napi);
>>   	if (unlikely(!skb))
>>   		return NULL;
>>   
>> -	skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
>> -	memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
>> -	if (metasize)
>> -		skb_metadata_set(skb, metasize);
>> -
>>   	xsk_buff_free(bi->xdp);
>>   	bi->xdp = NULL;
>>   	return skb;
>> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
>> index c87202cbd3d6..143ac1edb876 100644
>> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
>> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
>> @@ -4729,27 +4729,6 @@ static void stmmac_finalize_xdp_rx(struct stmmac_priv *priv,
>>   		xdp_do_flush();
>>   }
>>   
>> -static struct sk_buff *stmmac_construct_skb_zc(struct stmmac_channel *ch,
>> -					       struct xdp_buff *xdp)
>> -{
>> -	unsigned int metasize = xdp->data - xdp->data_meta;
>> -	unsigned int datasize = xdp->data_end - xdp->data;
>> -	struct sk_buff *skb;
>> -
>> -	skb = __napi_alloc_skb(&ch->rxtx_napi,
>> -			       xdp->data_end - xdp->data_hard_start,
>> -			       GFP_ATOMIC | __GFP_NOWARN);
>> -	if (unlikely(!skb))
>> -		return NULL;
>> -
>> -	skb_reserve(skb, xdp->data - xdp->data_hard_start);
>> -	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
>> -	if (metasize)
>> -		skb_metadata_set(skb, metasize);
>> -
>> -	return skb;
>> -}
>> -
>>   static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
>>   				   struct dma_desc *p, struct dma_desc *np,
>>   				   struct xdp_buff *xdp)
>> @@ -4761,7 +4740,7 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
>>   	struct sk_buff *skb;
>>   	u32 hash;
>>   
>> -	skb = stmmac_construct_skb_zc(ch, xdp);
>> +	skb = xdp_construct_skb(xdp, &ch->rxtx_napi);
>>   	if (!skb) {
>>   		priv->dev->stats.rx_dropped++;
>>   		return;
>> diff --git a/include/net/xdp.h b/include/net/xdp.h
>> index a5bc214a49d9..561e21eaf718 100644
>> --- a/include/net/xdp.h
>> +++ b/include/net/xdp.h
>> @@ -95,6 +95,36 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start,
>>   	xdp->data_meta = meta_valid ? data : data + 1;
>>   }
>>   
>> +static __always_inline struct sk_buff *
>> +xdp_construct_skb(struct xdp_buff *xdp, struct napi_struct *napi)
>> +{
>> +	unsigned int metasize;
>> +	unsigned int datasize;
>> +	unsigned int headroom;
>> +	struct sk_buff *skb;
>> +	unsigned int len;
>> +
>> +	/* this include metasize */
>> +	datasize = xdp->data_end  - xdp->data_meta;
>> +	metasize = xdp->data      - xdp->data_meta;
>> +	headroom = xdp->data_meta - xdp->data_hard_start;
>> +	len      = xdp->data_end  - xdp->data_hard_start;
>> +
>> +	/* allocate a skb to store the frags */
>> +	skb = __napi_alloc_skb(napi, len, GFP_ATOMIC | __GFP_NOWARN);
>> +	if (unlikely(!skb))
>> +		return NULL;
>> +
>> +	skb_reserve(skb, headroom);
>> +	memcpy(__skb_put(skb, datasize), xdp->data_meta, datasize);
>> +	if (metasize) {
>> +		__skb_pull(skb, metasize);
>> +		skb_metadata_set(skb, metasize);
>> +	}
>> +
>> +	return skb;
>> +}
>> +
>>   /* Reserve memory area at end-of data area.
>>    *
>>    * This macro reserves tailroom in the XDP buffer by limiting the
>> -- 
>> 2.31.0
>>
Maciej Fijalkowski June 29, 2021, 12:44 p.m. UTC | #3
On Tue, Jun 29, 2021 at 12:50:53PM +0300, Maxim Mikityanskiy wrote:
> On 2021-06-28 13:47, Maciej Fijalkowski wrote:
> > On Thu, Jun 17, 2021 at 10:55:34PM +0800, Xuan Zhuo wrote:
> > > When each driver supports xsk rx, if the received buff returns XDP_PASS
> > > after run xdp prog, it must construct skb based on xdp. This patch
> > > extracts this logic into a public function xdp_construct_skb().
> > > 
> > > There is a bug in the original logic. When constructing skb, we should
> > > copy the meta information to skb and then use __skb_pull() to correct
> > > the data.
> > 
> > Thanks for fixing the bug on Intel drivers, Xuan. However, together with
> > Magnus we feel that include/net/xdp.h is not a correct place for
> > introducing xdp_construct_skb. If mlx side could use it, then probably
> > include/net/xdp_sock_drv.h is a better fit for that.
> > 
> > Once again, CCing Maxim.
> > Maxim, any chances that mlx driver could be aligned in a way that we could
> > have a common function for creating skb on ZC path?
> 
> I'm sorry I missed the v1.
> 
> I have reviewed the differences between mlx5e_xsk_construct_skb and
> xdp_construct_skb. I would say it is possible for mlx5 to adapt and use the
> new API, but it may also require changes to xdp_construct_skb. Please see
> the list of differences below.

Great!

> 
> > 
> > Otherwise, maybe we should think about introducing the Intel-specific
> > common header in tree?
> 
> Sure, you can do it to share Intel-specific stuff between Intel drivers.
> However, in this particular case I think all XSK-enabled drivers would
> benefit from this function, especially after previous efforts that aimed to
> minimize the differences between drivers, the amount of code in the drivers
> and to share as much as possible. So, in my opinion, this stuff belongs to
> xdp_sock_drv.h. (Moreover, I see this patch also changes stmmac, so it's no
> longer Intel-specific.)

Right, stmmac just looked similar to what Intel is doing and AFAICT the
author of that code was from Intel, sorry for assumption.

> 
> Differences between mlx5e_xsk_construct_skb and xdp_construct_skb:
> 
> 1. __napi_alloc_skb is called with __GFP_NOWARN in xdp_construct_skb, but
> without this flag in mlx5. Why do we need to use non-default flags? Why
> can't we stick with napi_alloc_skb? I see only Intel drivers and XSK in
> stmmac use __napi_alloc_skb instead of napi_alloc_skb, and it looks to me as
> it was just copied from the regular (non-XSK) datapath of i40e
> (i40e_construct_skb) to i40e's XSK, then to stmmac, then to
> xdp_construct_skb, and I don't truly understand the reason of having
> __GFP_NOWARN where it first appeared (i40e_construct_skb). Could someone
> explain the reason for __GFP_NOWARN, so that we could decide whether we want
> it in a generic XSK helper?

It's a c&p to me.

> 
> 2. skb_put in mlx5 vs __skb_put in xdp_construct_skb - shouldn't be a big
> deal, the difference is just an extra overflow check in skb_put.
> 
> 3. XDP metadata. mlx5 calls xdp_set_data_meta_invalid, while other
> XSK-enabled drivers set metadata size to 0 and allow the XDP program to push
> some metadata. xdp_construct_skb only supports xdp_buffs with metadata, it
> will break if xdp_data_meta_unsupported. There are a few possible ways to
> address it:
> 
> 3.1. Add a check for xdp_data_meta_unsupported to xdp_construct_skb. It will
> lift the undocumented limitation of this function and allow it to handle all
> valid kinds of xdp_buff.
> 
> 3.2. Have two versions of xdp_construct_skb: one for xdp_buffs with
> metadata, the other for ones without metadata. Sounds ugly and not robust,
> but could spare a few CPU cycles for drivers that can't have metadata.
> 
> 3.3. Remove xdp_set_data_meta_invalid from mlx5. I think the reason for this
> call was some design decision, rather than a technical limitation. On the
> other hand, even though it will allow mlx5 to work with xdp_construct_skb in
> its current implementation, it would still be nice to combine it with 3.1 to
> avoid having issues with future drivers (if not, at least document in a
> clear way that the xdp_buff parameter must have metadata). Tariq/Saeed,
> could you comment on this point?

So this sounds like we need to hear from you guys if you're going to add a
support to metadata in AF_XDP ZC. Then we can decide with steps forward.
Thanks for this breakdown!

> 
> Thanks,
> Max
> 
> > > 
> > > Fixes: 0a714186d3c0f ("i40e: add AF_XDP zero-copy Rx support")
> > > Fixes: 2d4238f556972 ("ice: Add support for AF_XDP")
> > > Fixes: bba2556efad66 ("net: stmmac: Enable RX via AF_XDP zero-copy")
> > > Fixes: d0bcacd0a1309 ("ixgbe: add AF_XDP zero-copy Rx support")
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >   drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 16 +---------
> > >   drivers/net/ethernet/intel/ice/ice_xsk.c      | 12 +-------
> > >   drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  | 12 +-------
> > >   .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 +-------------
> > >   include/net/xdp.h                             | 30 +++++++++++++++++++
> > >   5 files changed, 34 insertions(+), 59 deletions(-)
> > > 
> > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > > index 68f177a86403..81b0f44eedda 100644
> > > --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > > +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > > @@ -246,23 +246,9 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
> > >   static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
> > >   					     struct xdp_buff *xdp)
> > >   {
> > > -	unsigned int metasize = xdp->data - xdp->data_meta;
> > > -	unsigned int datasize = xdp->data_end - xdp->data;
> > >   	struct sk_buff *skb;
> > > -	/* allocate a skb to store the frags */
> > > -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
> > > -			       xdp->data_end - xdp->data_hard_start,
> > > -			       GFP_ATOMIC | __GFP_NOWARN);
> > > -	if (unlikely(!skb))
> > > -		goto out;
> > > -
> > > -	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> > > -	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
> > > -	if (metasize)
> > > -		skb_metadata_set(skb, metasize);
> > > -
> > > -out:
> > > +	skb = xdp_construct_skb(xdp, &rx_ring->q_vector->napi);
> > >   	xsk_buff_free(xdp);
> > >   	return skb;
> > >   }
> > > diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > index a1f89ea3c2bd..f95e1adcebda 100644
> > > --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > @@ -430,22 +430,12 @@ static void ice_bump_ntc(struct ice_ring *rx_ring)
> > >   static struct sk_buff *
> > >   ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
> > >   {
> > > -	unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
> > > -	unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
> > > -	unsigned int datasize_hard = rx_buf->xdp->data_end -
> > > -				     rx_buf->xdp->data_hard_start;
> > >   	struct sk_buff *skb;
> > > -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
> > > -			       GFP_ATOMIC | __GFP_NOWARN);
> > > +	skb = xdp_construct_skb(rx_buf->xdp, &rx_ring->q_vector->napi);
> > >   	if (unlikely(!skb))
> > >   		return NULL;
> > > -	skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
> > > -	memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
> > > -	if (metasize)
> > > -		skb_metadata_set(skb, metasize);
> > > -
> > >   	xsk_buff_free(rx_buf->xdp);
> > >   	rx_buf->xdp = NULL;
> > >   	return skb;
> > > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> > > index f72d2978263b..123945832c96 100644
> > > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> > > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> > > @@ -203,22 +203,12 @@ bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
> > >   static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
> > >   					      struct ixgbe_rx_buffer *bi)
> > >   {
> > > -	unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
> > > -	unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
> > >   	struct sk_buff *skb;
> > > -	/* allocate a skb to store the frags */
> > > -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
> > > -			       bi->xdp->data_end - bi->xdp->data_hard_start,
> > > -			       GFP_ATOMIC | __GFP_NOWARN);
> > > +	skb = xdp_construct_skb(bi->xdp, &rx_ring->q_vector->napi);
> > >   	if (unlikely(!skb))
> > >   		return NULL;
> > > -	skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
> > > -	memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
> > > -	if (metasize)
> > > -		skb_metadata_set(skb, metasize);
> > > -
> > >   	xsk_buff_free(bi->xdp);
> > >   	bi->xdp = NULL;
> > >   	return skb;
> > > diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> > > index c87202cbd3d6..143ac1edb876 100644
> > > --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> > > +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> > > @@ -4729,27 +4729,6 @@ static void stmmac_finalize_xdp_rx(struct stmmac_priv *priv,
> > >   		xdp_do_flush();
> > >   }
> > > -static struct sk_buff *stmmac_construct_skb_zc(struct stmmac_channel *ch,
> > > -					       struct xdp_buff *xdp)
> > > -{
> > > -	unsigned int metasize = xdp->data - xdp->data_meta;
> > > -	unsigned int datasize = xdp->data_end - xdp->data;
> > > -	struct sk_buff *skb;
> > > -
> > > -	skb = __napi_alloc_skb(&ch->rxtx_napi,
> > > -			       xdp->data_end - xdp->data_hard_start,
> > > -			       GFP_ATOMIC | __GFP_NOWARN);
> > > -	if (unlikely(!skb))
> > > -		return NULL;
> > > -
> > > -	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> > > -	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
> > > -	if (metasize)
> > > -		skb_metadata_set(skb, metasize);
> > > -
> > > -	return skb;
> > > -}
> > > -
> > >   static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
> > >   				   struct dma_desc *p, struct dma_desc *np,
> > >   				   struct xdp_buff *xdp)
> > > @@ -4761,7 +4740,7 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
> > >   	struct sk_buff *skb;
> > >   	u32 hash;
> > > -	skb = stmmac_construct_skb_zc(ch, xdp);
> > > +	skb = xdp_construct_skb(xdp, &ch->rxtx_napi);
> > >   	if (!skb) {
> > >   		priv->dev->stats.rx_dropped++;
> > >   		return;
> > > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > > index a5bc214a49d9..561e21eaf718 100644
> > > --- a/include/net/xdp.h
> > > +++ b/include/net/xdp.h
> > > @@ -95,6 +95,36 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start,
> > >   	xdp->data_meta = meta_valid ? data : data + 1;
> > >   }
> > > +static __always_inline struct sk_buff *
> > > +xdp_construct_skb(struct xdp_buff *xdp, struct napi_struct *napi)
> > > +{
> > > +	unsigned int metasize;
> > > +	unsigned int datasize;
> > > +	unsigned int headroom;
> > > +	struct sk_buff *skb;
> > > +	unsigned int len;
> > > +
> > > +	/* this include metasize */
> > > +	datasize = xdp->data_end  - xdp->data_meta;
> > > +	metasize = xdp->data      - xdp->data_meta;
> > > +	headroom = xdp->data_meta - xdp->data_hard_start;
> > > +	len      = xdp->data_end  - xdp->data_hard_start;
> > > +
> > > +	/* allocate a skb to store the frags */
> > > +	skb = __napi_alloc_skb(napi, len, GFP_ATOMIC | __GFP_NOWARN);
> > > +	if (unlikely(!skb))
> > > +		return NULL;
> > > +
> > > +	skb_reserve(skb, headroom);
> > > +	memcpy(__skb_put(skb, datasize), xdp->data_meta, datasize);
> > > +	if (metasize) {
> > > +		__skb_pull(skb, metasize);
> > > +		skb_metadata_set(skb, metasize);
> > > +	}
> > > +
> > > +	return skb;
> > > +}
> > > +
> > >   /* Reserve memory area at end-of data area.
> > >    *
> > >    * This macro reserves tailroom in the XDP buffer by limiting the
> > > -- 
> > > 2.31.0
> > > 
>
Jesper Dangaard Brouer June 29, 2021, 4:55 p.m. UTC | #4
On 28/06/2021 12.47, Maciej Fijalkowski wrote:

> +static __always_inline struct sk_buff *
> +xdp_construct_skb(struct xdp_buff *xdp, struct napi_struct *napi)
> +{

I don't like the generic name "xdp_construct_skb".

What about calling it "xdp_copy_construct_skb", because below is 
memcpy'ing the data.

Functions that use this call free (or recycle) the memory backing the 
packet, after calling this function.

(I'm open to other naming suggestions)


> +	unsigned int metasize;
> +	unsigned int datasize;
> +	unsigned int headroom;
> +	struct sk_buff *skb;
> +	unsigned int len;
> +
> +	/* this include metasize */
> +	datasize = xdp->data_end  - xdp->data_meta;
> +	metasize = xdp->data      - xdp->data_meta;
> +	headroom = xdp->data_meta - xdp->data_hard_start;
> +	len      = xdp->data_end  - xdp->data_hard_start;
> +
> +	/* allocate a skb to store the frags */
> +	skb = __napi_alloc_skb(napi, len, GFP_ATOMIC | __GFP_NOWARN);
> +	if (unlikely(!skb))
> +		return NULL;
> +
> +	skb_reserve(skb, headroom);
> +	memcpy(__skb_put(skb, datasize), xdp->data_meta, datasize);
> +	if (metasize) {
> +		__skb_pull(skb, metasize);
> +		skb_metadata_set(skb, metasize);
> +	}
> +
> +	return skb;
> +}
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 68f177a86403..81b0f44eedda 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -246,23 +246,9 @@  bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
 static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
 					     struct xdp_buff *xdp)
 {
-	unsigned int metasize = xdp->data - xdp->data_meta;
-	unsigned int datasize = xdp->data_end - xdp->data;
 	struct sk_buff *skb;
 
-	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-			       xdp->data_end - xdp->data_hard_start,
-			       GFP_ATOMIC | __GFP_NOWARN);
-	if (unlikely(!skb))
-		goto out;
-
-	skb_reserve(skb, xdp->data - xdp->data_hard_start);
-	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
-	if (metasize)
-		skb_metadata_set(skb, metasize);
-
-out:
+	skb = xdp_construct_skb(xdp, &rx_ring->q_vector->napi);
 	xsk_buff_free(xdp);
 	return skb;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index a1f89ea3c2bd..f95e1adcebda 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -430,22 +430,12 @@  static void ice_bump_ntc(struct ice_ring *rx_ring)
 static struct sk_buff *
 ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
 {
-	unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
-	unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
-	unsigned int datasize_hard = rx_buf->xdp->data_end -
-				     rx_buf->xdp->data_hard_start;
 	struct sk_buff *skb;
 
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = xdp_construct_skb(rx_buf->xdp, &rx_ring->q_vector->napi);
 	if (unlikely(!skb))
 		return NULL;
 
-	skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
-	memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
-	if (metasize)
-		skb_metadata_set(skb, metasize);
-
 	xsk_buff_free(rx_buf->xdp);
 	rx_buf->xdp = NULL;
 	return skb;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index f72d2978263b..123945832c96 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -203,22 +203,12 @@  bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
 static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
 					      struct ixgbe_rx_buffer *bi)
 {
-	unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
-	unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
 	struct sk_buff *skb;
 
-	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-			       bi->xdp->data_end - bi->xdp->data_hard_start,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = xdp_construct_skb(bi->xdp, &rx_ring->q_vector->napi);
 	if (unlikely(!skb))
 		return NULL;
 
-	skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
-	memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
-	if (metasize)
-		skb_metadata_set(skb, metasize);
-
 	xsk_buff_free(bi->xdp);
 	bi->xdp = NULL;
 	return skb;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c87202cbd3d6..143ac1edb876 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4729,27 +4729,6 @@  static void stmmac_finalize_xdp_rx(struct stmmac_priv *priv,
 		xdp_do_flush();
 }
 
-static struct sk_buff *stmmac_construct_skb_zc(struct stmmac_channel *ch,
-					       struct xdp_buff *xdp)
-{
-	unsigned int metasize = xdp->data - xdp->data_meta;
-	unsigned int datasize = xdp->data_end - xdp->data;
-	struct sk_buff *skb;
-
-	skb = __napi_alloc_skb(&ch->rxtx_napi,
-			       xdp->data_end - xdp->data_hard_start,
-			       GFP_ATOMIC | __GFP_NOWARN);
-	if (unlikely(!skb))
-		return NULL;
-
-	skb_reserve(skb, xdp->data - xdp->data_hard_start);
-	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
-	if (metasize)
-		skb_metadata_set(skb, metasize);
-
-	return skb;
-}
-
 static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
 				   struct dma_desc *p, struct dma_desc *np,
 				   struct xdp_buff *xdp)
@@ -4761,7 +4740,7 @@  static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue,
 	struct sk_buff *skb;
 	u32 hash;
 
-	skb = stmmac_construct_skb_zc(ch, xdp);
+	skb = xdp_construct_skb(xdp, &ch->rxtx_napi);
 	if (!skb) {
 		priv->dev->stats.rx_dropped++;
 		return;
diff --git a/include/net/xdp.h b/include/net/xdp.h
index a5bc214a49d9..561e21eaf718 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -95,6 +95,36 @@  xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start,
 	xdp->data_meta = meta_valid ? data : data + 1;
 }
 
+static __always_inline struct sk_buff *
+xdp_construct_skb(struct xdp_buff *xdp, struct napi_struct *napi)
+{
+	unsigned int metasize;
+	unsigned int datasize;
+	unsigned int headroom;
+	struct sk_buff *skb;
+	unsigned int len;
+
+	/* this include metasize */
+	datasize = xdp->data_end  - xdp->data_meta;
+	metasize = xdp->data      - xdp->data_meta;
+	headroom = xdp->data_meta - xdp->data_hard_start;
+	len      = xdp->data_end  - xdp->data_hard_start;
+
+	/* allocate a skb to store the frags */
+	skb = __napi_alloc_skb(napi, len, GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_reserve(skb, headroom);
+	memcpy(__skb_put(skb, datasize), xdp->data_meta, datasize);
+	if (metasize) {
+		__skb_pull(skb, metasize);
+		skb_metadata_set(skb, metasize);
+	}
+
+	return skb;
+}
+
 /* Reserve memory area at end-of data area.
  *
  * This macro reserves tailroom in the XDP buffer by limiting the