diff mbox series

[net-next,v2,3/3] net: add netmem_t to skb_frag_t

Message ID 20231217080913.2025973-4-almasrymina@google.com (mailing list archive)
State New, archived
Headers show
Series Abstract page from net stack | expand

Commit Message

Mina Almasry Dec. 17, 2023, 8:09 a.m. UTC
Use netmem_t instead of page directly in skb_frag_t. Currently netmem_t
is always a struct page underneath, but the abstraction allows efforts
to add support for skb frags not backed by pages.

There is unfortunately 1 instance where the skb_frag_t is assumed to be
a bio_vec in kcm. For this case, add a debug assert that the skb frag is
indeed backed by a page, and do a cast.

Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so
that the API can be used to create netmem skbs.

Signed-off-by: Mina Almasry <almasrymina@google.com>

---

v2:
- Add skb frag filling helpers.
---
 include/linux/skbuff.h | 70 ++++++++++++++++++++++++++++++++----------
 net/core/skbuff.c      | 22 +++++++++----
 net/kcm/kcmsock.c      | 10 ++++--
 3 files changed, 78 insertions(+), 24 deletions(-)

Comments

Yunsheng Lin Dec. 18, 2023, 12:39 p.m. UTC | #1
On 2023/12/17 16:09, Mina Almasry wrote:
> Use netmem_t instead of page directly in skb_frag_t. Currently netmem_t
> is always a struct page underneath, but the abstraction allows efforts
> to add support for skb frags not backed by pages.
> 
> There is unfortunately 1 instance where the skb_frag_t is assumed to be
> a bio_vec in kcm. For this case, add a debug assert that the skb frag is
> indeed backed by a page, and do a cast.
> 
> Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so
> that the API can be used to create netmem skbs.
> 
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> 

...

>  
> -typedef struct bio_vec skb_frag_t;
> +typedef struct skb_frag {
> +	struct netmem *bv_page;

bv_page -> bv_netmem?

> +	unsigned int bv_len;
> +	unsigned int bv_offset;
> +} skb_frag_t;
>  
>  /**
>   * skb_frag_size() - Returns the size of a skb fragment
> @@ -2431,22 +2436,37 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
>  	return skb_headlen(skb) + __skb_pagelen(skb);
>  }
>  

...

>  /**
> @@ -2462,10 +2482,10 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
>  }
>  
>  /**
> - * __skb_fill_page_desc - initialise a paged fragment in an skb
> + * __skb_fill_netmem_desc - initialise a paged fragment in an skb
>   * @skb: buffer containing fragment to be initialised
>   * @i: paged fragment index to initialise
> - * @page: the page to use for this fragment
> + * @netmem: the netmem to use for this fragment
>   * @off: the offset to the data with @page
>   * @size: the length of the data
>   *
> @@ -2474,10 +2494,13 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
>   *
>   * Does not take any additional reference on the fragment.
>   */
> -static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> -					struct page *page, int off, int size)
> +static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
> +					  struct netmem *netmem, int off,
> +					  int size)
>  {
> -	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
> +	struct page *page = netmem_to_page(netmem);
> +
> +	__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
>  
>  	/* Propagate page pfmemalloc to the skb if we can. The problem is
>  	 * that not all callers have unique ownership of the page but rely
> @@ -2485,7 +2508,21 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
>  	 */
>  	page = compound_head(page);
>  	if (page_is_pfmemalloc(page))
> -		skb->pfmemalloc	= true;
> +		skb->pfmemalloc = true;

Is it possible to introduce netmem_is_pfmemalloc() and netmem_compound_head()
for netmem, and have some built-time testing to ensure the implementation
is the same between page_is_pfmemalloc()/compound_head() and
netmem_is_pfmemalloc()/netmem_compound_head()? So that we can avoid the
netmem_to_page() as much as possible, especially in the driver.


> +}
> +
> +static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> +					struct page *page, int off, int size)
> +{
> +	__skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
> +}
> +

...

>   */
>  static inline struct page *skb_frag_page(const skb_frag_t *frag)
>  {
> -	return frag->bv_page;
> +	return netmem_to_page(frag->bv_page);

It seems we are not able to have a safe type protection for the above
function, as the driver may be able to pass a devmem frag as a param here,
and pass the returned page into the mm subsystem, and compiler is not able
to catch it when compiling.

>  }
>  
>  /**
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 83af8aaeb893..053d220aa2f2 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -845,16 +845,24 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>  }
>  EXPORT_SYMBOL(__napi_alloc_skb);
>  

...

> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
> index 65d1f6755f98..5c46db045f4c 100644
> --- a/net/kcm/kcmsock.c
> +++ b/net/kcm/kcmsock.c
> @@ -636,9 +636,15 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
>  		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
>  			msize += skb_shinfo(skb)->frags[i].bv_len;
>  
> +		/* The cast to struct bio_vec* here assumes the frags are
> +		 * struct page based.
> +		 */
> +		DEBUG_NET_WARN_ON_ONCE(
> +			!skb_frag_page(&skb_shinfo(skb)->frags[0]));

It seems skb_frag_page() always return non-NULL in this patch, the above
checking seems unnecessary?
Mina Almasry Dec. 18, 2023, 8:22 p.m. UTC | #2
On Mon, Dec 18, 2023 at 4:39 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/12/17 16:09, Mina Almasry wrote:
> > Use netmem_t instead of page directly in skb_frag_t. Currently netmem_t
> > is always a struct page underneath, but the abstraction allows efforts
> > to add support for skb frags not backed by pages.
> >
> > There is unfortunately 1 instance where the skb_frag_t is assumed to be
> > a bio_vec in kcm. For this case, add a debug assert that the skb frag is
> > indeed backed by a page, and do a cast.
> >
> > Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so
> > that the API can be used to create netmem skbs.
> >
> > Signed-off-by: Mina Almasry <almasrymina@google.com>
> >
>
> ...
>
> >
> > -typedef struct bio_vec skb_frag_t;
> > +typedef struct skb_frag {
> > +     struct netmem *bv_page;
>
> bv_page -> bv_netmem?
>

bv_page, bv_len & bv_offset all are misnomers after this change
indeed, because bv_ refers to bio_vec and skb_frag_t is no longer a
bio_vec. However I'm hoping renaming everything can be done in a
separate series. Maybe I'll just apply the bv_page -> bv_netmem
change, that doesn't seem to be much code churn and it makes things
much less confusing.

> > +     unsigned int bv_len;
> > +     unsigned int bv_offset;
> > +} skb_frag_t;
> >
> >  /**
> >   * skb_frag_size() - Returns the size of a skb fragment
> > @@ -2431,22 +2436,37 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
> >       return skb_headlen(skb) + __skb_pagelen(skb);
> >  }
> >
>
> ...
>
> >  /**
> > @@ -2462,10 +2482,10 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
> >  }
> >
> >  /**
> > - * __skb_fill_page_desc - initialise a paged fragment in an skb
> > + * __skb_fill_netmem_desc - initialise a paged fragment in an skb
> >   * @skb: buffer containing fragment to be initialised
> >   * @i: paged fragment index to initialise
> > - * @page: the page to use for this fragment
> > + * @netmem: the netmem to use for this fragment
> >   * @off: the offset to the data with @page
> >   * @size: the length of the data
> >   *
> > @@ -2474,10 +2494,13 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
> >   *
> >   * Does not take any additional reference on the fragment.
> >   */
> > -static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> > -                                     struct page *page, int off, int size)
> > +static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
> > +                                       struct netmem *netmem, int off,
> > +                                       int size)
> >  {
> > -     __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
> > +     struct page *page = netmem_to_page(netmem);
> > +
> > +     __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
> >
> >       /* Propagate page pfmemalloc to the skb if we can. The problem is
> >        * that not all callers have unique ownership of the page but rely
> > @@ -2485,7 +2508,21 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> >        */
> >       page = compound_head(page);
> >       if (page_is_pfmemalloc(page))
> > -             skb->pfmemalloc = true;
> > +             skb->pfmemalloc = true;
>
> Is it possible to introduce netmem_is_pfmemalloc() and netmem_compound_head()
> for netmem,

That is exactly the plan, and I added these helpers in the follow up
series which introduces devmem support:

https://patchwork.kernel.org/project/netdevbpf/patch/20231218024024.3516870-8-almasrymina@google.com/

> and have some built-time testing to ensure the implementation
> is the same between page_is_pfmemalloc()/compound_head() and
> netmem_is_pfmemalloc()/netmem_compound_head()?

That doesn't seem desirable to me. It's too hacky IMO to duplicate the
implementation details of the MM stack in the net stack and that is
not the implementation you see in the patch that adds these helpers
above.

> So that we can avoid the
> netmem_to_page() as much as possible, especially in the driver.
>

Agreed.

>
> > +}
> > +
> > +static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> > +                                     struct page *page, int off, int size)
> > +{
> > +     __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
> > +}
> > +
>
> ...
>
> >   */
> >  static inline struct page *skb_frag_page(const skb_frag_t *frag)
> >  {
> > -     return frag->bv_page;
> > +     return netmem_to_page(frag->bv_page);
>
> It seems we are not able to have a safe type protection for the above
> function, as the driver may be able to pass a devmem frag as a param here,
> and pass the returned page into the mm subsystem, and compiler is not able
> to catch it when compiling.
>

That depends on the implementation of netmem_to_page(). As I
implemented it in the follow up series, netmem_to_page() always checks
that netmem is actually a page before doing the conversion via
checking the LSB checking. It's of course unacceptable to make an
unconditional cast here. That will get around the type safety as you
point out, and defeat the point. But I'm not doing that.

I can add a comment above netmem_to_page():

/* Returns page* if the netmem is backed by a page, NULL otherwise. Currently
 * netmem can only be backed by a page, so we always return the underlying
 * page.
 */
static inline struct page *netmem_to_page(struct netmem *netmem);

> >  }
> >
> >  /**
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 83af8aaeb893..053d220aa2f2 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -845,16 +845,24 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> >  }
> >  EXPORT_SYMBOL(__napi_alloc_skb);
> >
>
> ...
>
> > diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
> > index 65d1f6755f98..5c46db045f4c 100644
> > --- a/net/kcm/kcmsock.c
> > +++ b/net/kcm/kcmsock.c
> > @@ -636,9 +636,15 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
> >               for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> >                       msize += skb_shinfo(skb)->frags[i].bv_len;
> >
> > +             /* The cast to struct bio_vec* here assumes the frags are
> > +              * struct page based.
> > +              */
> > +             DEBUG_NET_WARN_ON_ONCE(
> > +                     !skb_frag_page(&skb_shinfo(skb)->frags[0]));
>
> It seems skb_frag_page() always return non-NULL in this patch, the above
> checking seems unnecessary?

We're doing a cast below, and the cast is only valid if the frag has a
page underneath. This check makes sure the skb has a page. In the
series that adds devmem support, skb_frag_page() returns NULL as the
frag has no page. Since this patch adds the dangerous cast, I think it
may be reasonable for it to also add the check.

I can add a comment above skb_frag_page() to indicate the intention again:

 * Returns the &struct page associated with @frag. Returns NULL if this frag
 * has no associated page.

But as of this series netmem can only be a page, so I think adding
that information may be premature.
diff mbox series

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7ce38874dbd1..03ab13072962 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -37,6 +37,7 @@ 
 #endif
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
+#include <net/netmem.h>
 
 /**
  * DOC: skb checksums
@@ -359,7 +360,11 @@  extern int sysctl_max_skb_frags;
  */
 #define GSO_BY_FRAGS	0xFFFF
 
-typedef struct bio_vec skb_frag_t;
+typedef struct skb_frag {
+	struct netmem *bv_page;
+	unsigned int bv_len;
+	unsigned int bv_offset;
+} skb_frag_t;
 
 /**
  * skb_frag_size() - Returns the size of a skb fragment
@@ -2431,22 +2436,37 @@  static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 	return skb_headlen(skb) + __skb_pagelen(skb);
 }
 
+static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag,
+					     struct netmem *netmem, int off,
+					     int size)
+{
+	frag->bv_page = netmem;
+	frag->bv_offset = off;
+	skb_frag_size_set(frag, size);
+}
+
 static inline void skb_frag_fill_page_desc(skb_frag_t *frag,
 					   struct page *page,
 					   int off, int size)
 {
-	frag->bv_page = page;
-	frag->bv_offset = off;
-	skb_frag_size_set(frag, size);
+	skb_frag_fill_netmem_desc(frag, page_to_netmem(page), off, size);
+}
+
+static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo,
+						int i, struct netmem *netmem,
+						int off, int size)
+{
+	skb_frag_t *frag = &shinfo->frags[i];
+
+	skb_frag_fill_netmem_desc(frag, netmem, off, size);
 }
 
 static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
 					      int i, struct page *page,
 					      int off, int size)
 {
-	skb_frag_t *frag = &shinfo->frags[i];
-
-	skb_frag_fill_page_desc(frag, page, off, size);
+	__skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off,
+				     size);
 }
 
 /**
@@ -2462,10 +2482,10 @@  static inline void skb_len_add(struct sk_buff *skb, int delta)
 }
 
 /**
- * __skb_fill_page_desc - initialise a paged fragment in an skb
+ * __skb_fill_netmem_desc - initialise a paged fragment in an skb
  * @skb: buffer containing fragment to be initialised
  * @i: paged fragment index to initialise
- * @page: the page to use for this fragment
+ * @netmem: the netmem to use for this fragment
  * @off: the offset to the data with @page
  * @size: the length of the data
  *
@@ -2474,10 +2494,13 @@  static inline void skb_len_add(struct sk_buff *skb, int delta)
  *
  * Does not take any additional reference on the fragment.
  */
-static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
-					struct page *page, int off, int size)
+static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
+					  struct netmem *netmem, int off,
+					  int size)
 {
-	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
+	struct page *page = netmem_to_page(netmem);
+
+	__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
 
 	/* Propagate page pfmemalloc to the skb if we can. The problem is
 	 * that not all callers have unique ownership of the page but rely
@@ -2485,7 +2508,21 @@  static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 	 */
 	page = compound_head(page);
 	if (page_is_pfmemalloc(page))
-		skb->pfmemalloc	= true;
+		skb->pfmemalloc = true;
+}
+
+static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
+					struct page *page, int off, int size)
+{
+	__skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
+}
+
+static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i,
+					struct netmem *netmem, int off,
+					int size)
+{
+	__skb_fill_netmem_desc(skb, i, netmem, off, size);
+	skb_shinfo(skb)->nr_frags = i + 1;
 }
 
 /**
@@ -2505,8 +2542,7 @@  static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
 				      struct page *page, int off, int size)
 {
-	__skb_fill_page_desc(skb, i, page, off, size);
-	skb_shinfo(skb)->nr_frags = i + 1;
+	skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
 }
 
 /**
@@ -2532,6 +2568,8 @@  static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
 
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		     int size, unsigned int truesize);
+void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, struct netmem *netmem,
+			    int off, int size, unsigned int truesize);
 
 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
 			  unsigned int truesize);
@@ -3422,7 +3460,7 @@  static inline void skb_frag_off_copy(skb_frag_t *fragto,
  */
 static inline struct page *skb_frag_page(const skb_frag_t *frag)
 {
-	return frag->bv_page;
+	return netmem_to_page(frag->bv_page);
 }
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83af8aaeb893..053d220aa2f2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -845,16 +845,24 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 }
 EXPORT_SYMBOL(__napi_alloc_skb);
 
-void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
-		     int size, unsigned int truesize)
+void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, struct netmem *netmem,
+			    int off, int size, unsigned int truesize)
 {
 	DEBUG_NET_WARN_ON_ONCE(size > truesize);
 
-	skb_fill_page_desc(skb, i, page, off, size);
+	skb_fill_netmem_desc(skb, i, netmem, off, size);
 	skb->len += size;
 	skb->data_len += size;
 	skb->truesize += truesize;
 }
+EXPORT_SYMBOL(skb_add_rx_frag_netmem);
+
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+		     int size, unsigned int truesize)
+{
+	skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size,
+			       truesize);
+}
 EXPORT_SYMBOL(skb_add_rx_frag);
 
 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
@@ -1868,10 +1876,11 @@  int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 
 	/* skb frags point to kernel buffers */
 	for (i = 0; i < new_frags - 1; i++) {
-		__skb_fill_page_desc(skb, i, head, 0, psize);
+		__skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
 		head = (struct page *)page_private(head);
 	}
-	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+	__skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
+			       d_off);
 	skb_shinfo(skb)->nr_frags = new_frags;
 
 release:
@@ -3609,7 +3618,8 @@  skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
 		if (plen) {
 			page = virt_to_head_page(from->head);
 			offset = from->data - (unsigned char *)page_address(page);
-			__skb_fill_page_desc(to, 0, page, offset, plen);
+			__skb_fill_netmem_desc(to, 0, page_to_netmem(page),
+					       offset, plen);
 			get_page(page);
 			j = 1;
 			len -= plen;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 65d1f6755f98..5c46db045f4c 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -636,9 +636,15 @@  static int kcm_write_msgs(struct kcm_sock *kcm)
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			msize += skb_shinfo(skb)->frags[i].bv_len;
 
+		/* The cast to struct bio_vec* here assumes the frags are
+		 * struct page based.
+		 */
+		DEBUG_NET_WARN_ON_ONCE(
+			!skb_frag_page(&skb_shinfo(skb)->frags[0]));
+
 		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
-			      skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags,
-			      msize);
+			      (const struct bio_vec *)skb_shinfo(skb)->frags,
+			      skb_shinfo(skb)->nr_frags, msize);
 		iov_iter_advance(&msg.msg_iter, txm->frag_offset);
 
 		do {