diff mbox series

[v6,10/15] io_uring/zcrx: add io_zcrx_area

Message ID 20241016185252.3746190-11-dw@davidwei.uk (mailing list archive)
State New
Headers show
Series io_uring zero copy rx | expand

Commit Message

David Wei Oct. 16, 2024, 6:52 p.m. UTC
From: David Wei <davidhwei@meta.com>

Add io_zcrx_area that represents a region of userspace memory that is
used for zero copy. During ifq registration, userspace passes in the
uaddr and len of userspace memory, which is then pinned by the kernel.
Each net_iov is mapped to one of these pages.

The freelist is a spinlock protected list that keeps track of all the
net_iovs/pages that aren't used.

For now, there is only one area per ifq and area registration happens
implicitly as part of ifq registration. There is no API for
adding/removing areas yet. The struct for area registration is there for
future extensibility once we support multiple areas and TCP devmem.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/uapi/linux/io_uring.h |  9 ++++
 io_uring/rsrc.c               |  2 +-
 io_uring/rsrc.h               |  1 +
 io_uring/zcrx.c               | 93 ++++++++++++++++++++++++++++++++++-
 io_uring/zcrx.h               | 16 ++++++
 5 files changed, 118 insertions(+), 3 deletions(-)

Comments

Jens Axboe Oct. 21, 2024, 3:35 p.m. UTC | #1
On 10/16/24 12:52 PM, David Wei wrote:
> +static int io_zcrx_create_area(struct io_ring_ctx *ctx,
> +			       struct io_zcrx_ifq *ifq,
> +			       struct io_zcrx_area **res,
> +			       struct io_uring_zcrx_area_reg *area_reg)
> +{
> +	struct io_zcrx_area *area;
> +	int i, ret, nr_pages;
> +	struct iovec iov;
> +
> +	if (area_reg->flags || area_reg->rq_area_token)
> +		return -EINVAL;
> +	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
> +		return -EINVAL;
> +	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
> +		return -EINVAL;
> +
> +	iov.iov_base = u64_to_user_ptr(area_reg->addr);
> +	iov.iov_len = area_reg->len;
> +	ret = io_buffer_validate(&iov);
> +	if (ret)
> +		return ret;
> +
> +	ret = -ENOMEM;
> +	area = kzalloc(sizeof(*area), GFP_KERNEL);
> +	if (!area)
> +		goto err;
> +
> +	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
> +				   &nr_pages);
> +	if (IS_ERR(area->pages)) {
> +		ret = PTR_ERR(area->pages);
> +		area->pages = NULL;
> +		goto err;
> +	}
> +	area->nia.num_niovs = nr_pages;
> +
> +	area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
> +					 GFP_KERNEL | __GFP_ZERO);
> +	if (!area->nia.niovs)
> +		goto err;
> +
> +	area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
> +					GFP_KERNEL | __GFP_ZERO);
> +	if (!area->freelist)
> +		goto err;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		area->freelist[i] = i;
> +	}
> +
> +	area->free_count = nr_pages;
> +	area->ifq = ifq;
> +	/* we're only supporting one area per ifq for now */
> +	area->area_id = 0;
> +	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
> +	spin_lock_init(&area->freelist_lock);
> +	*res = area;
> +	return 0;
> +err:
> +	if (area)
> +		io_zcrx_free_area(area);
> +	return ret;
> +}

Minor nit, but I think this would be nicer returning area and just using
ERR_PTR() for the errors.

Outside of that:

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Pavel Begunkov Oct. 21, 2024, 4:28 p.m. UTC | #2
On 10/21/24 16:35, Jens Axboe wrote:
> On 10/16/24 12:52 PM, David Wei wrote:
>> +static int io_zcrx_create_area(struct io_ring_ctx *ctx,
>> +			       struct io_zcrx_ifq *ifq,
>> +			       struct io_zcrx_area **res,
>> +			       struct io_uring_zcrx_area_reg *area_reg)
>> +{
>> +	struct io_zcrx_area *area;
>> +	int i, ret, nr_pages;
>> +	struct iovec iov;
>> +
>> +	if (area_reg->flags || area_reg->rq_area_token)
>> +		return -EINVAL;
>> +	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
>> +		return -EINVAL;
>> +	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
>> +		return -EINVAL;
>> +
>> +	iov.iov_base = u64_to_user_ptr(area_reg->addr);
>> +	iov.iov_len = area_reg->len;
>> +	ret = io_buffer_validate(&iov);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = -ENOMEM;
>> +	area = kzalloc(sizeof(*area), GFP_KERNEL);
>> +	if (!area)
>> +		goto err;
>> +
>> +	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
>> +				   &nr_pages);
>> +	if (IS_ERR(area->pages)) {
>> +		ret = PTR_ERR(area->pages);
>> +		area->pages = NULL;
>> +		goto err;
>> +	}
>> +	area->nia.num_niovs = nr_pages;
>> +
>> +	area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
>> +					 GFP_KERNEL | __GFP_ZERO);
>> +	if (!area->nia.niovs)
>> +		goto err;
>> +
>> +	area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
>> +					GFP_KERNEL | __GFP_ZERO);
>> +	if (!area->freelist)
>> +		goto err;
>> +
>> +	for (i = 0; i < nr_pages; i++) {
>> +		area->freelist[i] = i;
>> +	}
>> +
>> +	area->free_count = nr_pages;
>> +	area->ifq = ifq;
>> +	/* we're only supporting one area per ifq for now */
>> +	area->area_id = 0;
>> +	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
>> +	spin_lock_init(&area->freelist_lock);
>> +	*res = area;
>> +	return 0;
>> +err:
>> +	if (area)
>> +		io_zcrx_free_area(area);
>> +	return ret;
>> +}
> 
> Minor nit, but I think this would be nicer returning area and just using
> ERR_PTR() for the errors.

I'd rather avoid it. Too often null vs IS_ERR checking gets
messed up down the road and the compiler doesn't help with it
at all.

Not related to the patch, but would be nice to have a type safer
way for that, e.g. returning some new type not directly
cast'able to the pointer.

> 
> Outside of that:
> 
> Reviewed-by: Jens Axboe <axboe@kernel.dk>
>
Jens Axboe Oct. 21, 2024, 4:29 p.m. UTC | #3
On 10/21/24 10:28 AM, Pavel Begunkov wrote:
> On 10/21/24 16:35, Jens Axboe wrote:
>> On 10/16/24 12:52 PM, David Wei wrote:
>>> +static int io_zcrx_create_area(struct io_ring_ctx *ctx,
>>> +                   struct io_zcrx_ifq *ifq,
>>> +                   struct io_zcrx_area **res,
>>> +                   struct io_uring_zcrx_area_reg *area_reg)
>>> +{
>>> +    struct io_zcrx_area *area;
>>> +    int i, ret, nr_pages;
>>> +    struct iovec iov;
>>> +
>>> +    if (area_reg->flags || area_reg->rq_area_token)
>>> +        return -EINVAL;
>>> +    if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
>>> +        return -EINVAL;
>>> +    if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
>>> +        return -EINVAL;
>>> +
>>> +    iov.iov_base = u64_to_user_ptr(area_reg->addr);
>>> +    iov.iov_len = area_reg->len;
>>> +    ret = io_buffer_validate(&iov);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    ret = -ENOMEM;
>>> +    area = kzalloc(sizeof(*area), GFP_KERNEL);
>>> +    if (!area)
>>> +        goto err;
>>> +
>>> +    area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
>>> +                   &nr_pages);
>>> +    if (IS_ERR(area->pages)) {
>>> +        ret = PTR_ERR(area->pages);
>>> +        area->pages = NULL;
>>> +        goto err;
>>> +    }
>>> +    area->nia.num_niovs = nr_pages;
>>> +
>>> +    area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +    if (!area->nia.niovs)
>>> +        goto err;
>>> +
>>> +    area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
>>> +                    GFP_KERNEL | __GFP_ZERO);
>>> +    if (!area->freelist)
>>> +        goto err;
>>> +
>>> +    for (i = 0; i < nr_pages; i++) {
>>> +        area->freelist[i] = i;
>>> +    }
>>> +
>>> +    area->free_count = nr_pages;
>>> +    area->ifq = ifq;
>>> +    /* we're only supporting one area per ifq for now */
>>> +    area->area_id = 0;
>>> +    area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
>>> +    spin_lock_init(&area->freelist_lock);
>>> +    *res = area;
>>> +    return 0;
>>> +err:
>>> +    if (area)
>>> +        io_zcrx_free_area(area);
>>> +    return ret;
>>> +}
>>
>> Minor nit, but I think this would be nicer returning area and just using
>> ERR_PTR() for the errors.
> 
> I'd rather avoid it. Too often null vs IS_ERR checking gets
> messed up down the road and the compiler doesn't help with it
> at all.

The main issue imho is when people mix NULL and ERR_PTR, the pure "valid
pointer or non-null error pointer" seem to be OK in terms of
maintainability. But like I said, not a huge deal, and it's not hot path
material so doesn't matter in terms of that.

> Not related to the patch, but would be nice to have a type safer
> way for that, e.g. returning some new type not directly
> cast'able to the pointer.

Definitely, room for improvement in the infrastructure for this.
Pavel Begunkov Oct. 21, 2024, 4:39 p.m. UTC | #4
On 10/21/24 17:29, Jens Axboe wrote:
> On 10/21/24 10:28 AM, Pavel Begunkov wrote:
>> On 10/21/24 16:35, Jens Axboe wrote:
>>> On 10/16/24 12:52 PM, David Wei wrote:
>>>> +static int io_zcrx_create_area(struct io_ring_ctx *ctx,
>>>> +                   struct io_zcrx_ifq *ifq,
>>>> +                   struct io_zcrx_area **res,
>>>> +                   struct io_uring_zcrx_area_reg *area_reg)
>>>> +{
>>>> +    struct io_zcrx_area *area;
>>>> +    int i, ret, nr_pages;
>>>> +    struct iovec iov;
>>>> +
>>>> +    if (area_reg->flags || area_reg->rq_area_token)
>>>> +        return -EINVAL;
>>>> +    if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
>>>> +        return -EINVAL;
>>>> +    if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
>>>> +        return -EINVAL;
>>>> +
>>>> +    iov.iov_base = u64_to_user_ptr(area_reg->addr);
>>>> +    iov.iov_len = area_reg->len;
>>>> +    ret = io_buffer_validate(&iov);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    ret = -ENOMEM;
>>>> +    area = kzalloc(sizeof(*area), GFP_KERNEL);
>>>> +    if (!area)
>>>> +        goto err;
>>>> +
>>>> +    area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
>>>> +                   &nr_pages);
>>>> +    if (IS_ERR(area->pages)) {
>>>> +        ret = PTR_ERR(area->pages);
>>>> +        area->pages = NULL;
>>>> +        goto err;
>>>> +    }
>>>> +    area->nia.num_niovs = nr_pages;
>>>> +
>>>> +    area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
>>>> +                     GFP_KERNEL | __GFP_ZERO);
>>>> +    if (!area->nia.niovs)
>>>> +        goto err;
>>>> +
>>>> +    area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
>>>> +                    GFP_KERNEL | __GFP_ZERO);
>>>> +    if (!area->freelist)
>>>> +        goto err;
>>>> +
>>>> +    for (i = 0; i < nr_pages; i++) {
>>>> +        area->freelist[i] = i;
>>>> +    }
>>>> +
>>>> +    area->free_count = nr_pages;
>>>> +    area->ifq = ifq;
>>>> +    /* we're only supporting one area per ifq for now */
>>>> +    area->area_id = 0;
>>>> +    area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
>>>> +    spin_lock_init(&area->freelist_lock);
>>>> +    *res = area;
>>>> +    return 0;
>>>> +err:
>>>> +    if (area)
>>>> +        io_zcrx_free_area(area);
>>>> +    return ret;
>>>> +}
>>>
>>> Minor nit, but I think this would be nicer returning area and just using
>>> ERR_PTR() for the errors.
>>
>> I'd rather avoid it. Too often null vs IS_ERR checking gets
>> messed up down the road and the compiler doesn't help with it
>> at all.
> 
> The main issue imho is when people mix NULL and ERR_PTR, the pure "valid
> pointer or non-null error pointer" seem to be OK in terms of

Right, I meant it in general, mixing normal pointer types with
the implicit type that can have an error.

> maintainability. But like I said, not a huge deal, and it's not hot path
> material so doesn't matter in terms of that.

I agree it's maintainable, but this way I don't even need to think
about it.

>> Not related to the patch, but would be nice to have a type safer
>> way for that, e.g. returning some new type not directly
>> cast'able to the pointer.
> 
> Definitely, room for improvement in the infrastructure for this.
>
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d398e19f8eea..d43183264dcf 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -874,6 +874,15 @@  struct io_uring_zcrx_offsets {
 	__u64	__resv[2];
 };
 
+struct io_uring_zcrx_area_reg {
+	__u64	addr;
+	__u64	len;
+	__u64	rq_area_token;
+	__u32	flags;
+	__u32	__resv1;
+	__u64	__resv2[2];
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 33a3d156a85b..4da644de8843 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -86,7 +86,7 @@  static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 	return 0;
 }
 
-static int io_buffer_validate(struct iovec *iov)
+int io_buffer_validate(struct iovec *iov)
 {
 	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8ed588036210..0933dc99f41d 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -83,6 +83,7 @@  int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned size, unsigned type);
 int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 			unsigned int size, unsigned int type);
+int io_buffer_validate(struct iovec *iov);
 
 static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 4c53fd4f7bb3..a276572fe953 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -10,6 +10,7 @@ 
 #include "kbuf.h"
 #include "memmap.h"
 #include "zcrx.h"
+#include "rsrc.h"
 
 #define IO_RQ_MAX_ENTRIES		32768
 
@@ -38,6 +39,83 @@  static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 	ifq->rqes = NULL;
 }
 
+static void io_zcrx_free_area(struct io_zcrx_area *area)
+{
+	if (area->freelist)
+		kvfree(area->freelist);
+	if (area->nia.niovs)
+		kvfree(area->nia.niovs);
+	if (area->pages) {
+		unpin_user_pages(area->pages, area->nia.num_niovs);
+		kvfree(area->pages);
+	}
+	kfree(area);
+}
+
+static int io_zcrx_create_area(struct io_ring_ctx *ctx,
+			       struct io_zcrx_ifq *ifq,
+			       struct io_zcrx_area **res,
+			       struct io_uring_zcrx_area_reg *area_reg)
+{
+	struct io_zcrx_area *area;
+	int i, ret, nr_pages;
+	struct iovec iov;
+
+	if (area_reg->flags || area_reg->rq_area_token)
+		return -EINVAL;
+	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
+		return -EINVAL;
+	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+		return -EINVAL;
+
+	iov.iov_base = u64_to_user_ptr(area_reg->addr);
+	iov.iov_len = area_reg->len;
+	ret = io_buffer_validate(&iov);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
+	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	if (!area)
+		goto err;
+
+	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
+				   &nr_pages);
+	if (IS_ERR(area->pages)) {
+		ret = PTR_ERR(area->pages);
+		area->pages = NULL;
+		goto err;
+	}
+	area->nia.num_niovs = nr_pages;
+
+	area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
+					 GFP_KERNEL | __GFP_ZERO);
+	if (!area->nia.niovs)
+		goto err;
+
+	area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
+					GFP_KERNEL | __GFP_ZERO);
+	if (!area->freelist)
+		goto err;
+
+	for (i = 0; i < nr_pages; i++) {
+		area->freelist[i] = i;
+	}
+
+	area->free_count = nr_pages;
+	area->ifq = ifq;
+	/* we're only supporting one area per ifq for now */
+	area->area_id = 0;
+	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
+	spin_lock_init(&area->freelist_lock);
+	*res = area;
+	return 0;
+err:
+	if (area)
+		io_zcrx_free_area(area);
+	return ret;
+}
+
 static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_zcrx_ifq *ifq;
@@ -53,6 +131,9 @@  static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 
 static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 {
+	if (ifq->area)
+		io_zcrx_free_area(ifq->area);
+
 	io_free_rbuf_ring(ifq);
 	kfree(ifq);
 }
@@ -60,6 +141,7 @@  static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zcrx_ifq_reg __user *arg)
 {
+	struct io_uring_zcrx_area_reg area;
 	struct io_uring_zcrx_ifq_reg reg;
 	struct io_zcrx_ifq *ifq;
 	size_t ring_sz, rqes_sz;
@@ -91,7 +173,7 @@  int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	}
 	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
 
-	if (!reg.area_ptr)
+	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
 		return -EFAULT;
 
 	ifq = io_zcrx_ifq_alloc(ctx);
@@ -102,6 +184,10 @@  int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	if (ret)
 		goto err;
 
+	ret = io_zcrx_create_area(ctx, ifq, &ifq->area, &area);
+	if (ret)
+		goto err;
+
 	ifq->rq_entries = reg.rq_entries;
 	ifq->if_rxq = reg.if_rxq;
 
@@ -116,7 +202,10 @@  int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		ret = -EFAULT;
 		goto err;
 	}
-
+	if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
+		ret = -EFAULT;
+		goto err;
+	}
 	ctx->ifq = ifq;
 	return 0;
 err:
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 1f76eecac5fd..a8db61498c67 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -3,10 +3,26 @@ 
 #define IOU_ZC_RX_H
 
 #include <linux/io_uring_types.h>
+#include <net/page_pool/types.h>
+
+struct io_zcrx_area {
+	struct net_iov_area	nia;
+	struct io_zcrx_ifq	*ifq;
+
+	u16			area_id;
+	struct page		**pages;
+
+	/* freelist */
+	spinlock_t		freelist_lock ____cacheline_aligned_in_smp;
+	u32			free_count;
+	u32			*freelist;
+};
 
 struct io_zcrx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
+	struct io_zcrx_area		*area;
+
 	struct io_uring			*rq_ring;
 	struct io_uring_zcrx_rqe 	*rqes;
 	u32				rq_entries;