Message ID | 20241029230521.2385749-5-dw@davidwei.uk (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | io_uring zero copy rx | expand |
On Tue, Oct 29, 2024 at 4:06 PM David Wei <dw@davidwei.uk> wrote: > > From: Pavel Begunkov <asml.silence@gmail.com> > > There is a good bunch of places in generic paths assuming that the only > page pool memory provider is devmem TCP. As we want to reuse the net_iov > and provider infrastructure, we need to patch it up and explicitly check > the provider type when we branch into devmem TCP code. > > Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> > Signed-off-by: David Wei <dw@davidwei.uk> > --- > net/core/devmem.c | 10 ++++++++-- > net/core/devmem.h | 8 ++++++++ > net/core/page_pool_user.c | 15 +++++++++------ > net/ipv4/tcp.c | 6 ++++++ > 4 files changed, 31 insertions(+), 8 deletions(-) > > diff --git a/net/core/devmem.c b/net/core/devmem.c > index 01738029e35c..78983a98e5dc 100644 > --- a/net/core/devmem.c > +++ b/net/core/devmem.c > @@ -28,6 +28,12 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); > > static const struct memory_provider_ops dmabuf_devmem_ops; > > +bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops) > +{ > + return ops == &dmabuf_devmem_ops; > +} > +EXPORT_SYMBOL_GPL(net_is_devmem_page_pool_ops); > + > static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, > struct gen_pool_chunk *chunk, > void *not_used) > @@ -316,10 +322,10 @@ void dev_dmabuf_uninstall(struct net_device *dev) > unsigned int i; > > for (i = 0; i < dev->real_num_rx_queues; i++) { > - binding = dev->_rx[i].mp_params.mp_priv; > - if (!binding) > + if (dev->_rx[i].mp_params.mp_ops != &dmabuf_devmem_ops) > continue; > Use the net_is_devmem_page_pool_ops helper here? > + binding = dev->_rx[i].mp_params.mp_priv; > xa_for_each(&binding->bound_rxqs, xa_idx, rxq) > if (rxq == &dev->_rx[i]) { > xa_erase(&binding->bound_rxqs, xa_idx); > diff --git a/net/core/devmem.h b/net/core/devmem.h > index a2b9913e9a17..a3fdd66bb05b 100644 > --- a/net/core/devmem.h > +++ b/net/core/devmem.h > @@ -116,6 +116,8 @@ struct net_iov * > net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); > void net_devmem_free_dmabuf(struct net_iov *ppiov); > > +bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops); > + > #else > struct net_devmem_dmabuf_binding; > > @@ -168,6 +170,12 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) > { > return 0; > } > + > +static inline bool > +net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops) > +{ > + return false; > +} > #endif > > #endif /* _NET_DEVMEM_H */ > diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c > index 48335766c1bf..604862a73535 100644 > --- a/net/core/page_pool_user.c > +++ b/net/core/page_pool_user.c > @@ -214,7 +214,7 @@ static int > page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, > const struct genl_info *info) > { > - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; > + struct net_devmem_dmabuf_binding *binding; > size_t inflight, refsz; > void *hdr; > > @@ -244,8 +244,11 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, > pool->user.detach_time)) > goto err_cancel; > > - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) > - goto err_cancel; > + if (net_is_devmem_page_pool_ops(pool->mp_ops)) { > + binding = pool->mp_priv; > + if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) > + goto err_cancel; > + } Worthy of note is that I think Jakub asked for this introspection, and likely you should also add similar introspection. I.e. page_pool dumping should likely be improved to dump that it's bound to io_uring memory. Not sure what io_uring memory 'id' equivalent would be, if any. > > genlmsg_end(rsp, hdr); > > @@ -353,16 +356,16 @@ void page_pool_unlist(struct page_pool *pool) > int page_pool_check_memory_provider(struct net_device *dev, > struct netdev_rx_queue *rxq) > { > - struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; > + void *mp_priv = rxq->mp_params.mp_priv; > struct page_pool *pool; > struct hlist_node *n; > > - if (!binding) > + if (!mp_priv) > return 0; > > mutex_lock(&page_pools_lock); > hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { > - if (pool->mp_priv != binding) > + if (pool->mp_priv != mp_priv) > continue; > > if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index e928efc22f80..31e01da61c12 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -277,6 +277,7 @@ > #include <net/ip.h> > #include <net/sock.h> > #include <net/rstreason.h> > +#include <net/page_pool/types.h> > > #include <linux/uaccess.h> > #include <asm/ioctls.h> > @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, > } > > niov = skb_frag_net_iov(frag); > + if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) { > + err = -ENODEV; > + goto out; > + } > + I think this check needs to go in the caller. Currently the caller assumes that if !skb_frags_readable(), then the frag is dma-buf, and calls tcp_recvmsg_dmabuf on it. The caller needs to check that the frag is specifically a dma-buf frag now. Can io_uring frags somehow end up in tcp_recvmsg_locked? You're still using the tcp stack with io_uring ZC right? So I suspect they might? -- Thanks, Mina
On 11/1/24 17:09, Mina Almasry wrote: > On Tue, Oct 29, 2024 at 4:06 PM David Wei <dw@davidwei.uk> wrote: ... >> + >> static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, >> struct gen_pool_chunk *chunk, >> void *not_used) >> @@ -316,10 +322,10 @@ void dev_dmabuf_uninstall(struct net_device *dev) >> unsigned int i; >> >> for (i = 0; i < dev->real_num_rx_queues; i++) { >> - binding = dev->_rx[i].mp_params.mp_priv; >> - if (!binding) >> + if (dev->_rx[i].mp_params.mp_ops != &dmabuf_devmem_ops) >> continue; >> > > Use the net_is_devmem_page_pool_ops helper here? It could, but the function is there primarily for outside users to avoid ifdefs and build problems. I don't think it worth reiteration? I'll change if there is a next version. ... >> @@ -244,8 +244,11 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, >> pool->user.detach_time)) >> goto err_cancel; >> >> - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) >> - goto err_cancel; >> + if (net_is_devmem_page_pool_ops(pool->mp_ops)) { >> + binding = pool->mp_priv; >> + if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) >> + goto err_cancel; >> + } > > Worthy of note is that I think Jakub asked for this introspection, and > likely you should also add similar introspection. I.e. page_pool I think we can patch it up after merging the series? Depends what Jakub thinks. In any case, I can't parse io_uring ops here until a later patch adding those ops, so it'd be a new patch if it's a part of this series. > dumping should likely be improved to dump that it's bound to io_uring > memory. Not sure what io_uring memory 'id' equivalent would be, if > any. I don't think io_uring have any id to give. What is it for in the first place? Do you give it to netlink to communicate with devmem TCP or something similar? >> genlmsg_end(rsp, hdr); >> >> @@ -353,16 +356,16 @@ void page_pool_unlist(struct page_pool *pool) >> int page_pool_check_memory_provider(struct net_device *dev, >> struct netdev_rx_queue *rxq) ... >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c >> index e928efc22f80..31e01da61c12 100644 >> --- a/net/ipv4/tcp.c >> +++ b/net/ipv4/tcp.c >> @@ -277,6 +277,7 @@ >> #include <net/ip.h> >> #include <net/sock.h> >> #include <net/rstreason.h> >> +#include <net/page_pool/types.h> >> >> #include <linux/uaccess.h> >> #include <asm/ioctls.h> >> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, >> } >> >> niov = skb_frag_net_iov(frag); >> + if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) { >> + err = -ENODEV; >> + goto out; >> + } >> + > > I think this check needs to go in the caller. Currently the caller > assumes that if !skb_frags_readable(), then the frag is dma-buf, and io_uring originated netmem that are marked unreadable as well and so will end up in tcp_recvmsg_dmabuf(), then we reject and fail since they should not be fed to devmem TCP. It should be fine from correctness perspective. We need to check frags, and that's the place where we iterate frags. Another option is to add a loop in tcp_recvmsg_locked walking over all frags of an skb and doing the checks, but that's an unnecessary performance burden to devmem. > calls tcp_recvmsg_dmabuf on it. The caller needs to check that the > frag is specifically a dma-buf frag now. > > Can io_uring frags somehow end up in tcp_recvmsg_locked? You're still > using the tcp stack with io_uring ZC right? So I suspect they might? All of them are using the same socket rx queue, so yes, any of them can see any type of packet non net_iov / pages
On Fri, Nov 1, 2024 at 10:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote: > ... > >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > >> index e928efc22f80..31e01da61c12 100644 > >> --- a/net/ipv4/tcp.c > >> +++ b/net/ipv4/tcp.c > >> @@ -277,6 +277,7 @@ > >> #include <net/ip.h> > >> #include <net/sock.h> > >> #include <net/rstreason.h> > >> +#include <net/page_pool/types.h> > >> > >> #include <linux/uaccess.h> > >> #include <asm/ioctls.h> > >> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, > >> } > >> > >> niov = skb_frag_net_iov(frag); > >> + if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) { > >> + err = -ENODEV; > >> + goto out; > >> + } > >> + > > > > I think this check needs to go in the caller. Currently the caller > > assumes that if !skb_frags_readable(), then the frag is dma-buf, and > > io_uring originated netmem that are marked unreadable as well > and so will end up in tcp_recvmsg_dmabuf(), then we reject and > fail since they should not be fed to devmem TCP. It should be > fine from correctness perspective. > > We need to check frags, and that's the place where we iterate > frags. Another option is to add a loop in tcp_recvmsg_locked > walking over all frags of an skb and doing the checks, but > that's an unnecessary performance burden to devmem. > Checking each frag in tcp_recvmsg_dmabuf (and the equivalent io_uring function) is not ideal really. Especially when you're dereferencing nio->pp to do the check which IIUC will pull a cache line not normally needed in this code path and may have a performance impact. We currently have a check in __skb_fill_netmem_desc() that makes sure all frags added to an skb are pages or dmabuf. I think we need to improve it to make sure all frags added to an skb are of the same type (pages, dmabuf, iouring). sending it to skb_copy_datagram_msg or tcp_recvmsg_dmabuf or error. I also I'm not sure dereferencing ->pp to check the frag type is ever OK in such a fast path when ->pp is not usually needed until the skb is freed? You may have to add a flag to the niov to indicate what type it is, or change the skb->unreadable flag to a u8 that determines if it's pages/io_uring/dmabuf.
On 11/4/24 20:20, Mina Almasry wrote: > On Fri, Nov 1, 2024 at 10:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote: ... >> io_uring originated netmem that are marked unreadable as well >> and so will end up in tcp_recvmsg_dmabuf(), then we reject and >> fail since they should not be fed to devmem TCP. It should be >> fine from correctness perspective. >> >> We need to check frags, and that's the place where we iterate >> frags. Another option is to add a loop in tcp_recvmsg_locked >> walking over all frags of an skb and doing the checks, but >> that's an unnecessary performance burden to devmem. >> > > Checking each frag in tcp_recvmsg_dmabuf (and the equivalent io_uring > function) is not ideal really. Especially when you're dereferencing > nio->pp to do the check which IIUC will pull a cache line not normally > needed in this code path and may have a performance impact. Even if it's cold, all net_iov processed are expected to come from the same page pool and would be cached. And it should be of a comparable hotness as dmabuf_genpool_chunk_owner accesses below, considering that there is enough of processing happening around, it should be of a worry and can be improve upon later.
On 2024-11-04 05:20, Mina Almasry wrote: > On Fri, Nov 1, 2024 at 10:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote: >> ... >>>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c >>>> index e928efc22f80..31e01da61c12 100644 >>>> --- a/net/ipv4/tcp.c >>>> +++ b/net/ipv4/tcp.c >>>> @@ -277,6 +277,7 @@ >>>> #include <net/ip.h> >>>> #include <net/sock.h> >>>> #include <net/rstreason.h> >>>> +#include <net/page_pool/types.h> >>>> >>>> #include <linux/uaccess.h> >>>> #include <asm/ioctls.h> >>>> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, >>>> } >>>> >>>> niov = skb_frag_net_iov(frag); >>>> + if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) { >>>> + err = -ENODEV; >>>> + goto out; >>>> + } >>>> + >>> >>> I think this check needs to go in the caller. Currently the caller >>> assumes that if !skb_frags_readable(), then the frag is dma-buf, and >> >> io_uring originated netmem that are marked unreadable as well >> and so will end up in tcp_recvmsg_dmabuf(), then we reject and >> fail since they should not be fed to devmem TCP. It should be >> fine from correctness perspective. >> >> We need to check frags, and that's the place where we iterate >> frags. Another option is to add a loop in tcp_recvmsg_locked >> walking over all frags of an skb and doing the checks, but >> that's an unnecessary performance burden to devmem. >> > > Checking each frag in tcp_recvmsg_dmabuf (and the equivalent io_uring > function) is not ideal really. Especially when you're dereferencing > nio->pp to do the check which IIUC will pull a cache line not normally > needed in this code path and may have a performance impact. This check is needed currently because the curent assumption in core netdev code is that !skb_frags_readable() means devmem TCP. Longer term, we need to figure out how to distinguish skb frag providers in both code and Netlink introspection. Since your concerns here are primarily around performance rather than correctness, I suggest we defer this as a follow up series. > > We currently have a check in __skb_fill_netmem_desc() that makes sure > all frags added to an skb are pages or dmabuf. I think we need to > improve it to make sure all frags added to an skb are of the same type > (pages, dmabuf, iouring). sending it to skb_copy_datagram_msg or > tcp_recvmsg_dmabuf or error. It should not be possible for drivers to fill in an skb with frags from different providers. A provider can only change upon a queue reset. > > I also I'm not sure dereferencing ->pp to check the frag type is ever > OK in such a fast path when ->pp is not usually needed until the skb > is freed? You may have to add a flag to the niov to indicate what type > it is, or change the skb->unreadable flag to a u8 that determines if > it's pages/io_uring/dmabuf. > >
diff --git a/net/core/devmem.c b/net/core/devmem.c index 01738029e35c..78983a98e5dc 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -28,6 +28,12 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); static const struct memory_provider_ops dmabuf_devmem_ops; +bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops) +{ + return ops == &dmabuf_devmem_ops; +} +EXPORT_SYMBOL_GPL(net_is_devmem_page_pool_ops); + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) @@ -316,10 +322,10 @@ void dev_dmabuf_uninstall(struct net_device *dev) unsigned int i; for (i = 0; i < dev->real_num_rx_queues; i++) { - binding = dev->_rx[i].mp_params.mp_priv; - if (!binding) + if (dev->_rx[i].mp_params.mp_ops != &dmabuf_devmem_ops) continue; + binding = dev->_rx[i].mp_params.mp_priv; xa_for_each(&binding->bound_rxqs, xa_idx, rxq) if (rxq == &dev->_rx[i]) { xa_erase(&binding->bound_rxqs, xa_idx); diff --git a/net/core/devmem.h b/net/core/devmem.h index a2b9913e9a17..a3fdd66bb05b 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -116,6 +116,8 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); +bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops); + #else struct net_devmem_dmabuf_binding; @@ -168,6 +170,12 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline bool +net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops) +{ + return false; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 48335766c1bf..604862a73535 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -214,7 +214,7 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + struct net_devmem_dmabuf_binding *binding; size_t inflight, refsz; void *hdr; @@ -244,8 +244,11 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) - goto err_cancel; + if (net_is_devmem_page_pool_ops(pool->mp_ops)) { + binding = pool->mp_priv; + if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + goto err_cancel; + } genlmsg_end(rsp, hdr); @@ -353,16 +356,16 @@ void page_pool_unlist(struct page_pool *pool) int page_pool_check_memory_provider(struct net_device *dev, struct netdev_rx_queue *rxq) { - struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; + void *mp_priv = rxq->mp_params.mp_priv; struct page_pool *pool; struct hlist_node *n; - if (!binding) + if (!mp_priv) return 0; mutex_lock(&page_pools_lock); hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { - if (pool->mp_priv != binding) + if (pool->mp_priv != mp_priv) continue; if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e928efc22f80..31e01da61c12 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -277,6 +277,7 @@ #include <net/ip.h> #include <net/sock.h> #include <net/rstreason.h> +#include <net/page_pool/types.h> #include <linux/uaccess.h> #include <asm/ioctls.h> @@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, } niov = skb_frag_net_iov(frag); + if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) { + err = -ENODEV; + goto out; + } + end = start + skb_frag_size(frag); copy = end - offset;