diff mbox series

[v7,04/15] net: prepare for non devmem TCP memory providers

Message ID 20241029230521.2385749-5-dw@davidwei.uk (mailing list archive)
State New
Headers show
Series io_uring zero copy rx | expand

Commit Message

David Wei Oct. 29, 2024, 11:05 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

There is a good bunch of places in generic paths assuming that the only
page pool memory provider is devmem TCP. As we want to reuse the net_iov
and provider infrastructure, we need to patch it up and explicitly check
the provider type when we branch into devmem TCP code.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 net/core/devmem.c         | 10 ++++++++--
 net/core/devmem.h         |  8 ++++++++
 net/core/page_pool_user.c | 15 +++++++++------
 net/ipv4/tcp.c            |  6 ++++++
 4 files changed, 31 insertions(+), 8 deletions(-)

Comments

Mina Almasry Nov. 1, 2024, 5:09 p.m. UTC | #1
On Tue, Oct 29, 2024 at 4:06 PM David Wei <dw@davidwei.uk> wrote:
>
> From: Pavel Begunkov <asml.silence@gmail.com>
>
> There is a good bunch of places in generic paths assuming that the only
> page pool memory provider is devmem TCP. As we want to reuse the net_iov
> and provider infrastructure, we need to patch it up and explicitly check
> the provider type when we branch into devmem TCP code.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
>  net/core/devmem.c         | 10 ++++++++--
>  net/core/devmem.h         |  8 ++++++++
>  net/core/page_pool_user.c | 15 +++++++++------
>  net/ipv4/tcp.c            |  6 ++++++
>  4 files changed, 31 insertions(+), 8 deletions(-)
>
> diff --git a/net/core/devmem.c b/net/core/devmem.c
> index 01738029e35c..78983a98e5dc 100644
> --- a/net/core/devmem.c
> +++ b/net/core/devmem.c
> @@ -28,6 +28,12 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
>
>  static const struct memory_provider_ops dmabuf_devmem_ops;
>
> +bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops)
> +{
> +       return ops == &dmabuf_devmem_ops;
> +}
> +EXPORT_SYMBOL_GPL(net_is_devmem_page_pool_ops);
> +
>  static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
>                                                struct gen_pool_chunk *chunk,
>                                                void *not_used)
> @@ -316,10 +322,10 @@ void dev_dmabuf_uninstall(struct net_device *dev)
>         unsigned int i;
>
>         for (i = 0; i < dev->real_num_rx_queues; i++) {
> -               binding = dev->_rx[i].mp_params.mp_priv;
> -               if (!binding)
> +               if (dev->_rx[i].mp_params.mp_ops != &dmabuf_devmem_ops)
>                         continue;
>

Use the net_is_devmem_page_pool_ops helper here?

> +               binding = dev->_rx[i].mp_params.mp_priv;
>                 xa_for_each(&binding->bound_rxqs, xa_idx, rxq)
>                         if (rxq == &dev->_rx[i]) {
>                                 xa_erase(&binding->bound_rxqs, xa_idx);
> diff --git a/net/core/devmem.h b/net/core/devmem.h
> index a2b9913e9a17..a3fdd66bb05b 100644
> --- a/net/core/devmem.h
> +++ b/net/core/devmem.h
> @@ -116,6 +116,8 @@ struct net_iov *
>  net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
>  void net_devmem_free_dmabuf(struct net_iov *ppiov);
>
> +bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops);
> +
>  #else
>  struct net_devmem_dmabuf_binding;
>
> @@ -168,6 +170,12 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
>  {
>         return 0;
>  }
> +
> +static inline bool
> +net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops)
> +{
> +       return false;
> +}
>  #endif
>
>  #endif /* _NET_DEVMEM_H */
> diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
> index 48335766c1bf..604862a73535 100644
> --- a/net/core/page_pool_user.c
> +++ b/net/core/page_pool_user.c
> @@ -214,7 +214,7 @@ static int
>  page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
>                   const struct genl_info *info)
>  {
> -       struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
> +       struct net_devmem_dmabuf_binding *binding;
>         size_t inflight, refsz;
>         void *hdr;
>
> @@ -244,8 +244,11 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
>                          pool->user.detach_time))
>                 goto err_cancel;
>
> -       if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
> -               goto err_cancel;
> +       if (net_is_devmem_page_pool_ops(pool->mp_ops)) {
> +               binding = pool->mp_priv;
> +               if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
> +                       goto err_cancel;
> +       }

Worthy of note is that I think Jakub asked for this introspection, and
likely you should also add similar introspection. I.e. page_pool
dumping should likely be improved to dump that it's bound to io_uring
memory. Not sure what io_uring memory 'id' equivalent would be, if
any.

>
>         genlmsg_end(rsp, hdr);
>
> @@ -353,16 +356,16 @@ void page_pool_unlist(struct page_pool *pool)
>  int page_pool_check_memory_provider(struct net_device *dev,
>                                     struct netdev_rx_queue *rxq)
>  {
> -       struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv;
> +       void *mp_priv = rxq->mp_params.mp_priv;
>         struct page_pool *pool;
>         struct hlist_node *n;
>
> -       if (!binding)
> +       if (!mp_priv)
>                 return 0;
>
>         mutex_lock(&page_pools_lock);
>         hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
> -               if (pool->mp_priv != binding)
> +               if (pool->mp_priv != mp_priv)
>                         continue;
>
>                 if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index e928efc22f80..31e01da61c12 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -277,6 +277,7 @@
>  #include <net/ip.h>
>  #include <net/sock.h>
>  #include <net/rstreason.h>
> +#include <net/page_pool/types.h>
>
>  #include <linux/uaccess.h>
>  #include <asm/ioctls.h>
> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>                         }
>
>                         niov = skb_frag_net_iov(frag);
> +                       if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) {
> +                               err = -ENODEV;
> +                               goto out;
> +                       }
> +

I think this check needs to go in the caller. Currently the caller
assumes that if !skb_frags_readable(), then the frag is dma-buf, and
calls tcp_recvmsg_dmabuf on it. The caller needs to check that the
frag is specifically a dma-buf frag now.

Can io_uring frags somehow end up in tcp_recvmsg_locked? You're still
using the tcp stack with io_uring ZC right? So I suspect they might?

--
Thanks,
Mina
Pavel Begunkov Nov. 1, 2024, 5:41 p.m. UTC | #2
On 11/1/24 17:09, Mina Almasry wrote:
> On Tue, Oct 29, 2024 at 4:06 PM David Wei <dw@davidwei.uk> wrote:
...
>> +
>>   static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
>>                                                 struct gen_pool_chunk *chunk,
>>                                                 void *not_used)
>> @@ -316,10 +322,10 @@ void dev_dmabuf_uninstall(struct net_device *dev)
>>          unsigned int i;
>>
>>          for (i = 0; i < dev->real_num_rx_queues; i++) {
>> -               binding = dev->_rx[i].mp_params.mp_priv;
>> -               if (!binding)
>> +               if (dev->_rx[i].mp_params.mp_ops != &dmabuf_devmem_ops)
>>                          continue;
>>
> 
> Use the net_is_devmem_page_pool_ops helper here?

It could, but the function is there primarily for outside users to
avoid ifdefs and build problems. I don't think it worth reiteration?
I'll change if there is a next version.

...
>> @@ -244,8 +244,11 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
>>                           pool->user.detach_time))
>>                  goto err_cancel;
>>
>> -       if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
>> -               goto err_cancel;
>> +       if (net_is_devmem_page_pool_ops(pool->mp_ops)) {
>> +               binding = pool->mp_priv;
>> +               if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
>> +                       goto err_cancel;
>> +       }
> 
> Worthy of note is that I think Jakub asked for this introspection, and
> likely you should also add similar introspection. I.e. page_pool

I think we can patch it up after merging the series? Depends what Jakub
thinks. In any case, I can't parse io_uring ops here until a later patch
adding those ops, so it'd be a new patch if it's a part of this series.

> dumping should likely be improved to dump that it's bound to io_uring
> memory. Not sure what io_uring memory 'id' equivalent would be, if
> any.

I don't think io_uring have any id to give. What is it for in the
first place? Do you give it to netlink to communicate with devmem
TCP or something similar?

>>          genlmsg_end(rsp, hdr);
>>
>> @@ -353,16 +356,16 @@ void page_pool_unlist(struct page_pool *pool)
>>   int page_pool_check_memory_provider(struct net_device *dev,
>>                                      struct netdev_rx_queue *rxq)
...
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index e928efc22f80..31e01da61c12 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -277,6 +277,7 @@
>>   #include <net/ip.h>
>>   #include <net/sock.h>
>>   #include <net/rstreason.h>
>> +#include <net/page_pool/types.h>
>>
>>   #include <linux/uaccess.h>
>>   #include <asm/ioctls.h>
>> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>>                          }
>>
>>                          niov = skb_frag_net_iov(frag);
>> +                       if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) {
>> +                               err = -ENODEV;
>> +                               goto out;
>> +                       }
>> +
> 
> I think this check needs to go in the caller. Currently the caller
> assumes that if !skb_frags_readable(), then the frag is dma-buf, and

io_uring originated netmem that are marked unreadable as well
and so will end up in tcp_recvmsg_dmabuf(), then we reject and
fail since they should not be fed to devmem TCP. It should be
fine from correctness perspective.

We need to check frags, and that's the place where we iterate
frags. Another option is to add a loop in tcp_recvmsg_locked
walking over all frags of an skb and doing the checks, but
that's an unnecessary performance burden to devmem.

> calls tcp_recvmsg_dmabuf on it. The caller needs to check that the
> frag is specifically a dma-buf frag now.
> 
> Can io_uring frags somehow end up in tcp_recvmsg_locked? You're still
> using the tcp stack with io_uring ZC right? So I suspect they might?

All of them are using the same socket rx queue, so yes, any of them
can see any type of packet non net_iov / pages
Mina Almasry Nov. 4, 2024, 8:20 p.m. UTC | #3
On Fri, Nov 1, 2024 at 10:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> ...
> >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> >> index e928efc22f80..31e01da61c12 100644
> >> --- a/net/ipv4/tcp.c
> >> +++ b/net/ipv4/tcp.c
> >> @@ -277,6 +277,7 @@
> >>   #include <net/ip.h>
> >>   #include <net/sock.h>
> >>   #include <net/rstreason.h>
> >> +#include <net/page_pool/types.h>
> >>
> >>   #include <linux/uaccess.h>
> >>   #include <asm/ioctls.h>
> >> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
> >>                          }
> >>
> >>                          niov = skb_frag_net_iov(frag);
> >> +                       if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) {
> >> +                               err = -ENODEV;
> >> +                               goto out;
> >> +                       }
> >> +
> >
> > I think this check needs to go in the caller. Currently the caller
> > assumes that if !skb_frags_readable(), then the frag is dma-buf, and
>
> io_uring originated netmem that are marked unreadable as well
> and so will end up in tcp_recvmsg_dmabuf(), then we reject and
> fail since they should not be fed to devmem TCP. It should be
> fine from correctness perspective.
>
> We need to check frags, and that's the place where we iterate
> frags. Another option is to add a loop in tcp_recvmsg_locked
> walking over all frags of an skb and doing the checks, but
> that's an unnecessary performance burden to devmem.
>

Checking each frag in tcp_recvmsg_dmabuf (and the equivalent io_uring
function) is not ideal really. Especially when you're dereferencing
nio->pp to do the check which IIUC will pull a cache line not normally
needed in this code path and may have a performance impact.

We currently have a check in __skb_fill_netmem_desc() that makes sure
all frags added to an skb are pages or dmabuf. I think we need to
improve it to make sure all frags added to an skb are of the same type
(pages, dmabuf, iouring). sending it to skb_copy_datagram_msg or
tcp_recvmsg_dmabuf or error.

I also I'm not sure dereferencing ->pp to check the frag type is ever
OK in such a fast path when ->pp is not usually needed until the skb
is freed? You may have to add a flag to the niov to indicate what type
it is, or change the skb->unreadable flag to a u8 that determines if
it's pages/io_uring/dmabuf.
Pavel Begunkov Nov. 4, 2024, 9:24 p.m. UTC | #4
On 11/4/24 20:20, Mina Almasry wrote:
> On Fri, Nov 1, 2024 at 10:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
>> io_uring originated netmem that are marked unreadable as well
>> and so will end up in tcp_recvmsg_dmabuf(), then we reject and
>> fail since they should not be fed to devmem TCP. It should be
>> fine from correctness perspective.
>>
>> We need to check frags, and that's the place where we iterate
>> frags. Another option is to add a loop in tcp_recvmsg_locked
>> walking over all frags of an skb and doing the checks, but
>> that's an unnecessary performance burden to devmem.
>>
> 
> Checking each frag in tcp_recvmsg_dmabuf (and the equivalent io_uring
> function) is not ideal really. Especially when you're dereferencing
> nio->pp to do the check which IIUC will pull a cache line not normally
> needed in this code path and may have a performance impact.

Even if it's cold, all net_iov processed are expected to come
from the same page pool and would be cached. And it should be of
a comparable hotness as dmabuf_genpool_chunk_owner accesses below,
considering that there is enough of processing happening around,
it should be of a worry and can be improve upon later.
David Wei Nov. 11, 2024, 7:01 p.m. UTC | #5
On 2024-11-04 05:20, Mina Almasry wrote:
> On Fri, Nov 1, 2024 at 10:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>> ...
>>>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>>>> index e928efc22f80..31e01da61c12 100644
>>>> --- a/net/ipv4/tcp.c
>>>> +++ b/net/ipv4/tcp.c
>>>> @@ -277,6 +277,7 @@
>>>>   #include <net/ip.h>
>>>>   #include <net/sock.h>
>>>>   #include <net/rstreason.h>
>>>> +#include <net/page_pool/types.h>
>>>>
>>>>   #include <linux/uaccess.h>
>>>>   #include <asm/ioctls.h>
>>>> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>>>>                          }
>>>>
>>>>                          niov = skb_frag_net_iov(frag);
>>>> +                       if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) {
>>>> +                               err = -ENODEV;
>>>> +                               goto out;
>>>> +                       }
>>>> +
>>>
>>> I think this check needs to go in the caller. Currently the caller
>>> assumes that if !skb_frags_readable(), then the frag is dma-buf, and
>>
>> io_uring originated netmem that are marked unreadable as well
>> and so will end up in tcp_recvmsg_dmabuf(), then we reject and
>> fail since they should not be fed to devmem TCP. It should be
>> fine from correctness perspective.
>>
>> We need to check frags, and that's the place where we iterate
>> frags. Another option is to add a loop in tcp_recvmsg_locked
>> walking over all frags of an skb and doing the checks, but
>> that's an unnecessary performance burden to devmem.
>>
> 
> Checking each frag in tcp_recvmsg_dmabuf (and the equivalent io_uring
> function) is not ideal really. Especially when you're dereferencing
> nio->pp to do the check which IIUC will pull a cache line not normally
> needed in this code path and may have a performance impact.

This check is needed currently because the curent assumption in core
netdev code is that !skb_frags_readable() means devmem TCP. Longer term,
we need to figure out how to distinguish skb frag providers in both code
and Netlink introspection.

Since your concerns here are primarily around performance rather than
correctness, I suggest we defer this as a follow up series.

> 
> We currently have a check in __skb_fill_netmem_desc() that makes sure
> all frags added to an skb are pages or dmabuf. I think we need to
> improve it to make sure all frags added to an skb are of the same type
> (pages, dmabuf, iouring). sending it to skb_copy_datagram_msg or
> tcp_recvmsg_dmabuf or error.

It should not be possible for drivers to fill in an skb with frags from
different providers. A provider can only change upon a queue reset.

> 
> I also I'm not sure dereferencing ->pp to check the frag type is ever
> OK in such a fast path when ->pp is not usually needed until the skb
> is freed? You may have to add a flag to the niov to indicate what type
> it is, or change the skb->unreadable flag to a u8 that determines if
> it's pages/io_uring/dmabuf.
> 
>
diff mbox series

Patch

diff --git a/net/core/devmem.c b/net/core/devmem.c
index 01738029e35c..78983a98e5dc 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -28,6 +28,12 @@  static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
 
 static const struct memory_provider_ops dmabuf_devmem_ops;
 
+bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops)
+{
+	return ops == &dmabuf_devmem_ops;
+}
+EXPORT_SYMBOL_GPL(net_is_devmem_page_pool_ops);
+
 static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
 					       struct gen_pool_chunk *chunk,
 					       void *not_used)
@@ -316,10 +322,10 @@  void dev_dmabuf_uninstall(struct net_device *dev)
 	unsigned int i;
 
 	for (i = 0; i < dev->real_num_rx_queues; i++) {
-		binding = dev->_rx[i].mp_params.mp_priv;
-		if (!binding)
+		if (dev->_rx[i].mp_params.mp_ops != &dmabuf_devmem_ops)
 			continue;
 
+		binding = dev->_rx[i].mp_params.mp_priv;
 		xa_for_each(&binding->bound_rxqs, xa_idx, rxq)
 			if (rxq == &dev->_rx[i]) {
 				xa_erase(&binding->bound_rxqs, xa_idx);
diff --git a/net/core/devmem.h b/net/core/devmem.h
index a2b9913e9a17..a3fdd66bb05b 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -116,6 +116,8 @@  struct net_iov *
 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
 void net_devmem_free_dmabuf(struct net_iov *ppiov);
 
+bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops);
+
 #else
 struct net_devmem_dmabuf_binding;
 
@@ -168,6 +170,12 @@  static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
 {
 	return 0;
 }
+
+static inline bool
+net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops)
+{
+	return false;
+}
 #endif
 
 #endif /* _NET_DEVMEM_H */
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index 48335766c1bf..604862a73535 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -214,7 +214,7 @@  static int
 page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
 		  const struct genl_info *info)
 {
-	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+	struct net_devmem_dmabuf_binding *binding;
 	size_t inflight, refsz;
 	void *hdr;
 
@@ -244,8 +244,11 @@  page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
 			 pool->user.detach_time))
 		goto err_cancel;
 
-	if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
-		goto err_cancel;
+	if (net_is_devmem_page_pool_ops(pool->mp_ops)) {
+		binding = pool->mp_priv;
+		if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
+			goto err_cancel;
+	}
 
 	genlmsg_end(rsp, hdr);
 
@@ -353,16 +356,16 @@  void page_pool_unlist(struct page_pool *pool)
 int page_pool_check_memory_provider(struct net_device *dev,
 				    struct netdev_rx_queue *rxq)
 {
-	struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv;
+	void *mp_priv = rxq->mp_params.mp_priv;
 	struct page_pool *pool;
 	struct hlist_node *n;
 
-	if (!binding)
+	if (!mp_priv)
 		return 0;
 
 	mutex_lock(&page_pools_lock);
 	hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
-		if (pool->mp_priv != binding)
+		if (pool->mp_priv != mp_priv)
 			continue;
 
 		if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e928efc22f80..31e01da61c12 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,6 +277,7 @@ 
 #include <net/ip.h>
 #include <net/sock.h>
 #include <net/rstreason.h>
+#include <net/page_pool/types.h>
 
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
@@ -2476,6 +2477,11 @@  static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			}
 
 			niov = skb_frag_net_iov(frag);
+			if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) {
+				err = -ENODEV;
+				goto out;
+			}
+
 			end = start + skb_frag_size(frag);
 			copy = end - offset;