
[RFC,3/3] ublk_drv: add ebpf support

Message ID 20230215004122.28917-4-xiaoguang.wang@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: BPF
Series Add io_uring & ebpf based methods to implement zero-copy for ublk

Checks

Context Check Description
netdev/tree_selection success Not a local patch, async

Commit Message

Xiaoguang Wang Feb. 15, 2023, 12:41 a.m. UTC
Currently only one ebpf helper, bpf_ublk_queue_sqe(), is added. A ublksrv
target can use it to write an ebpf prog that supports ublk kernel &
userspace zero copy; please see the ublksrv test code for more info.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 drivers/block/ublk_drv.c       | 207 ++++++++++++++++++++++++++++++++-
 include/uapi/linux/bpf.h       |   1 +
 include/uapi/linux/ublk_cmd.h  |  11 ++
 scripts/bpf_doc.py             |   4 +
 tools/include/uapi/linux/bpf.h |   8 ++
 5 files changed, 229 insertions(+), 2 deletions(-)

Comments

Ming Lei Feb. 16, 2023, 8:11 a.m. UTC | #1
On Wed, Feb 15, 2023 at 08:41:22AM +0800, Xiaoguang Wang wrote:
> Currently only one ebpf helper, bpf_ublk_queue_sqe(), is added. A ublksrv
> target can use it to write an ebpf prog that supports ublk kernel &
> userspace zero copy; please see the ublksrv test code for more info.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
>  drivers/block/ublk_drv.c       | 207 ++++++++++++++++++++++++++++++++-
>  include/uapi/linux/bpf.h       |   1 +
>  include/uapi/linux/ublk_cmd.h  |  11 ++
>  scripts/bpf_doc.py             |   4 +
>  tools/include/uapi/linux/bpf.h |   8 ++
>  5 files changed, 229 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index b628e9eaefa6..44c289b72864 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -61,6 +61,7 @@
>  struct ublk_rq_data {
>  	struct llist_node node;
>  	struct callback_head work;
> +	struct io_mapped_kbuf *kbuf;
>  };
>  
>  struct ublk_uring_cmd_pdu {
> @@ -163,6 +164,9 @@ struct ublk_device {
>  	unsigned int		nr_queues_ready;
>  	atomic_t		nr_aborted_queues;
>  
> +	struct bpf_prog		*io_prep_prog;
> +	struct bpf_prog		*io_submit_prog;
> +
>  	/*
>  	 * Our ubq->daemon may be killed without any notification, so
>  	 * monitor each queue's daemon periodically
> @@ -189,10 +193,46 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>  
>  static struct miscdevice ublk_misc;
>  
> +struct ublk_io_bpf_ctx {
> +	struct ublk_bpf_ctx ctx;
> +	struct ublk_device *ub;
> +	struct callback_head work;
> +};
> +
> +BPF_CALL_4(bpf_ublk_queue_sqe, struct ublk_io_bpf_ctx *, bpf_ctx,
> +	   struct io_uring_sqe *, sqe, u32, sqe_len, u32, fd)
> +{
> +	struct request *rq;
> +	struct ublk_rq_data *data;
> +	struct io_mapped_kbuf *kbuf;
> +	u16 q_id = bpf_ctx->ctx.q_id;
> +	u16 tag = bpf_ctx->ctx.tag;
> +
> +	rq = blk_mq_tag_to_rq(bpf_ctx->ub->tag_set.tags[q_id], tag);
> +	data = blk_mq_rq_to_pdu(rq);
> +	kbuf = data->kbuf;
> +	io_uring_submit_sqe(fd, sqe, sqe_len, kbuf);
> +	return 0;
> +}
> +
> +const struct bpf_func_proto ublk_bpf_queue_sqe_proto = {
> +	.func = bpf_ublk_queue_sqe,
> +	.gpl_only = false,
> +	.ret_type = RET_INTEGER,
> +	.arg1_type = ARG_ANYTHING,
> +	.arg2_type = ARG_ANYTHING,
> +	.arg3_type = ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  ublk_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  {
> -	return bpf_base_func_proto(func_id);
> +	switch (func_id) {
> +	case BPF_FUNC_ublk_queue_sqe:
> +		return &ublk_bpf_queue_sqe_proto;
> +	default:
> +		return bpf_base_func_proto(func_id);
> +	}
>  }
>  
>  static bool ublk_bpf_is_valid_access(int off, int size,
> @@ -200,6 +240,23 @@ static bool ublk_bpf_is_valid_access(int off, int size,
>  			const struct bpf_prog *prog,
>  			struct bpf_insn_access_aux *info)
>  {
> +	if (off < 0 || off >= sizeof(struct ublk_bpf_ctx))
> +		return false;
> +	if (off % size != 0)
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct ublk_bpf_ctx, q_id):
> +		return size == sizeof_field(struct ublk_bpf_ctx, q_id);
> +	case offsetof(struct ublk_bpf_ctx, tag):
> +		return size == sizeof_field(struct ublk_bpf_ctx, tag);
> +	case offsetof(struct ublk_bpf_ctx, op):
> +		return size == sizeof_field(struct ublk_bpf_ctx, op);
> +	case offsetof(struct ublk_bpf_ctx, nr_sectors):
> +		return size == sizeof_field(struct ublk_bpf_ctx, nr_sectors);
> +	case offsetof(struct ublk_bpf_ctx, start_sector):
> +		return size == sizeof_field(struct ublk_bpf_ctx, start_sector);
> +	}
>  	return false;
>  }
>  
> @@ -324,7 +381,7 @@ static void ublk_put_device(struct ublk_device *ub)
>  static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
>  		int qid)
>  {
> -       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
> +	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
>  }
>  
>  static inline bool ublk_rq_has_data(const struct request *rq)
> @@ -492,12 +549,16 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data,
>  static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
>  		struct ublk_io *io)
>  {
> +	struct ublk_device *ub = ubq->dev;
>  	const unsigned int rq_bytes = blk_rq_bytes(req);
>  	/*
>  	 * no zero copy, we delay copy WRITE request data into ublksrv
>  	 * context and the big benefit is that pinning pages in current
>  	 * context is pretty fast, see ublk_pin_user_pages
>  	 */
> +	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
> +		return rq_bytes;

Can you explain a bit why READ isn't supported? WRITE zero copy is
supposed to be easy to support with a splice based approach, and I am
actually more interested in READ zc.

> +
>  	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
>  		return rq_bytes;
>  
> @@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
>  	}
>  }
>  
> +static void ublk_bpf_io_submit_fn(struct callback_head *work)
> +{
> +	struct ublk_io_bpf_ctx *bpf_ctx = container_of(work,
> +			struct ublk_io_bpf_ctx, work);
> +
> +	if (bpf_ctx->ub->io_submit_prog)
> +		bpf_prog_run_pin_on_cpu(bpf_ctx->ub->io_submit_prog, bpf_ctx);
> +	kfree(bpf_ctx);
> +}
> +
> +static int ublk_init_uring_kbuf(struct request *rq)
> +{
> +	struct bio_vec *bvec;
> +	struct req_iterator rq_iter;
> +	struct bio_vec tmp;
> +	int nr_bvec = 0;
> +	struct io_mapped_kbuf *kbuf;
> +	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
> +
> +	/* Drop previous allocation */
> +	if (data->kbuf) {
> +		kfree(data->kbuf->bvec);
> +		kfree(data->kbuf);
> +		data->kbuf = NULL;
> +	}
> +
> +	kbuf = kmalloc(sizeof(struct io_mapped_kbuf), GFP_NOIO);
> +	if (!kbuf)
> +		return -EIO;
> +
> +	rq_for_each_bvec(tmp, rq, rq_iter)
> +		nr_bvec++;
> +
> +	bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO);
> +	if (!bvec) {
> +		kfree(kbuf);
> +		return -EIO;
> +	}
> +	kbuf->bvec = bvec;
> +	rq_for_each_bvec(tmp, rq, rq_iter) {
> +		*bvec = tmp;
> +		bvec++;
> +	}
> +
> +	kbuf->count = blk_rq_bytes(rq);
> +	kbuf->nr_bvecs = nr_bvec;
> +	data->kbuf = kbuf;
> +	return 0;

bio/req bvec table is immutable, so here you can pass its reference
to kbuf directly.

> +}
> +
> +static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
> +{
> +	int err;
> +	struct ublk_device *ub = ubq->dev;
> +	struct bpf_prog *prog = ub->io_prep_prog;
> +	struct ublk_io_bpf_ctx *bpf_ctx;
> +
> +	if (!prog)
> +		return 0;
> +
> +	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
> +	if (!bpf_ctx)
> +		return -EIO;
> +
> +	err = ublk_init_uring_kbuf(rq);
> +	if (err < 0) {
> +		kfree(bpf_ctx);
> +		return -EIO;
> +	}
> +	bpf_ctx->ub = ub;
> +	bpf_ctx->ctx.q_id = ubq->q_id;
> +	bpf_ctx->ctx.tag = rq->tag;
> +	bpf_ctx->ctx.op = req_op(rq);
> +	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
> +	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);

The above sets up the target io parameters, which are supposed to come
from userspace, because they are the result of user space logic. If
these parameters come from the kernel, the whole logic has to be done
in io_prep_prog.

> +	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
> +
> +	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
> +	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
> +		kfree(bpf_ctx);

task_work_add() is only available in case of ublk builtin.

> +	return 0;
> +}
> +
>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		const struct blk_mq_queue_data *bd)
>  {
> @@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	if (unlikely(res != BLK_STS_OK))
>  		return BLK_STS_IOERR;
>  
> +	/* Currently just for test. */
> +	ublk_run_bpf_prog(ubq, rq);

Can you explain the above comment a bit? When is io_prep_prog called
in the non-test version? Or can you post the non-test version to the
list for review?

Here is the key to understanding the whole idea: when is io_prep_prog
finally called, and how are parameters passed to it?

Given that it is an ebpf prog, I don't think any userspace parameter
can be passed to io_prep_prog when submitting IO, which means all user
logic has to be done inside io_prep_prog? If so, I am not sure it is a
good way, because an ebpf prog is a very limited programming
environment, but the user logic could be as complicated as using a
btree to map io, or communicating with a remote machine to figure out
the mapping. Loop is just the simplest direct mapping.


Thanks, 
Ming
Xiaoguang Wang Feb. 16, 2023, 12:12 p.m. UTC | #2
hello,

> On Wed, Feb 15, 2023 at 08:41:22AM +0800, Xiaoguang Wang wrote:
>> Currently only one ebpf helper, bpf_ublk_queue_sqe(), is added. A ublksrv
>> target can use it to write an ebpf prog that supports ublk kernel &
>> userspace zero copy; please see the ublksrv test code for more info.
>>
>>  	 */
>> +	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
>> +		return rq_bytes;
> Can you explain a bit why READ isn't supported? Because WRITE zero
> copy is supposed to be supported easily with splice based approach,
> and I am more interested in READ zc actually.
No special reason, the READ op can also be supported. I'll
add that support in patch set v2.
For this RFC patch set, I just tried to show the idea, so
I must admit that the current code is not mature enough :)

>
>> +
>>  	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
>>  		return rq_bytes;
>>  
>> @@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
>>  	}
>>  }
>>  
>>
>> +	kbuf->bvec = bvec;
>> +	rq_for_each_bvec(tmp, rq, rq_iter) {
>> +		*bvec = tmp;
>> +		bvec++;
>> +	}
>> +
>> +	kbuf->count = blk_rq_bytes(rq);
>> +	kbuf->nr_bvecs = nr_bvec;
>> +	data->kbuf = kbuf;
>> +	return 0;
> bio/req bvec table is immutable, so here you can pass its reference
> to kbuf directly.
Yeah, thanks.

>
>> +}
>> +
>> +static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
>> +{
>> +	int err;
>> +	struct ublk_device *ub = ubq->dev;
>> +	struct bpf_prog *prog = ub->io_prep_prog;
>> +	struct ublk_io_bpf_ctx *bpf_ctx;
>> +
>> +	if (!prog)
>> +		return 0;
>> +
>> +	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
>> +	if (!bpf_ctx)
>> +		return -EIO;
>> +
>> +	err = ublk_init_uring_kbuf(rq);
>> +	if (err < 0) {
>> +		kfree(bpf_ctx);
>> +		return -EIO;
>> +	}
>> +	bpf_ctx->ub = ub;
>> +	bpf_ctx->ctx.q_id = ubq->q_id;
>> +	bpf_ctx->ctx.tag = rq->tag;
>> +	bpf_ctx->ctx.op = req_op(rq);
>> +	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
>> +	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);
> The above is for setting up target io parameter, which is supposed
> to be from userspace, cause it is result of user space logic. If
> these parameters are from kernel, the whole logic has to be done
> in io_prep_prog.
Yeah, it's designed that io_prep_prog implements user space
io logic.

>
>> +	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
>> +
>> +	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
>> +	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
>> +		kfree(bpf_ctx);
> task_work_add() is only available in case of ublk builtin.
Yeah, I'm thinking how to work around it.

>
>> +	return 0;
>> +}
>> +
>>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  		const struct blk_mq_queue_data *bd)
>>  {
>> @@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  	if (unlikely(res != BLK_STS_OK))
>>  		return BLK_STS_IOERR;
>>  
>> +	/* Currently just for test. */
>> +	ublk_run_bpf_prog(ubq, rq);
> Can you explain the above comment a bit? When is the io_prep_prog called
> in the non-test version? Or can you post the non-test version in list
> for review.
Forgot to delete the stale comment, sorry. I'm writing the v2 patch set now.

> Here it is the key for understanding the whole idea, especially when
> is io_prep_prog called finally? How to pass parameters to io_prep_prog?
Let me explain more about the design:
io_prep_prog has two types of parameters:
1) its call argument: struct ublk_bpf_ctx, see ublk.bpf.c.
ublk_bpf_ctx describes one kernel io request: its op, qid and
sector info. io_prep_prog uses this info to map the target io.
2) ebpf map structures: the user space daemon can use them to
pass extra information from user space to io_prep_prog, which
helps it initialize the target io if necessary.

io_prep_prog is called when ublk_queue_rq() is called. This bpf
prog initializes one or more sqes according to the user logic,
puts these sqes in an ebpf map structure, and then executes
task_work_add() to notify the ubq_daemon to run io_submit_prog.
Note that we can not call io_uring_submit_sqe() in the task
context that calls ublk_queue_rq(), since that context does not
have the io_uring instance owned by the ubq_daemon.
Later the ubq_daemon calls io_submit_prog to submit the sqes.
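
To make the flow concrete, here is a rough sketch of what such a pair
of progs could look like. It is only an illustration: the section
names, the sqe map layout and the backing_fd/ring_fd constants are
assumptions made up for this example (they are not the actual ublksrv
test code); only bpf_ublk_queue_sqe() and struct ublk_bpf_ctx come
from this patch set, and the helper's last argument is assumed to
identify the io_uring instance to submit into.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* helper id 212 as added by this patch; normally this declaration is
 * generated into bpf_helper_defs.h by scripts/bpf_doc.py
 */
static long (*bpf_ublk_queue_sqe)(void *ctx, struct io_uring_sqe *sqe,
				  __u32 sqe_len, __u32 fd) = (void *)212;

/* sqes built by io_prep_prog, consumed later by io_submit_prog */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 4096);
	__type(key, __u32);			/* q_id << 16 | tag */
	__type(value, struct io_uring_sqe);
} sqe_map SEC(".maps");

/* set by the daemon before loading the progs (libbpf .rodata) */
const volatile int backing_fd = -1;	/* fd of the backing file */
const volatile __u32 ring_fd = 0;	/* ubq_daemon's io_uring fd */

SEC("ublk/io_prep")		/* section name is a placeholder */
int ublk_io_prep_prog(struct ublk_bpf_ctx *ctx)
{
	struct io_uring_sqe sqe = {};
	__u32 key = ((__u32)ctx->q_id << 16) | ctx->tag;

	/* loop-style direct mapping: same offset on the backing file */
	sqe.opcode = ctx->op == 0 /* REQ_OP_READ */ ?
			IORING_OP_READ : IORING_OP_WRITE;
	sqe.fd     = backing_fd;
	sqe.off    = (__u64)ctx->start_sector << 9;
	sqe.len    = ctx->nr_sectors << 9;
	/* the data buffer is attached in the kernel from the request's kbuf */

	bpf_map_update_elem(&sqe_map, &key, &sqe, BPF_ANY);
	return 0;
}

SEC("ublk/io_submit")		/* runs in ubq_daemon context via task_work */
int ublk_io_submit_prog(struct ublk_bpf_ctx *ctx)
{
	__u32 key = ((__u32)ctx->q_id << 16) | ctx->tag;
	struct io_uring_sqe *sqe = bpf_map_lookup_elem(&sqe_map, &key);

	if (!sqe)
		return 0;
	bpf_ublk_queue_sqe(ctx, sqe, sizeof(*sqe), ring_fd);
	return 0;
}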

>
> Given it is ebpf prog, I don't think any userspace parameter can be
> passed to io_prep_prog when submitting IO, that means all user logic has
> to be done inside io_prep_prog? If yes, not sure if it is one good way,
> cause ebpf prog is very limited programming environment, but the user
> logic could be as complicated as using btree to map io, or communicating
> with remote machine for figuring out the mapping. Loop is just the
> simplest direct mapping.
Currently, we can use ebpf map structures to pass user space
parameters to io_prep_prog. I also agree with you that complicated
logic may be hard to implement in an ebpf prog; hopefully the ebpf
community will improve this situation gradually.

The userspace target implementations I have met so far just use a
userspace block device solution to access a distributed filesystem;
they involve socket programming and have simple io mapping logic. We
can prepare the socket fd in an ebpf map structure, and such mapping
logic should be easy to implement in an ebpf prog, though I haven't
applied this ebpf method to our internal business yet.
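
As a rough illustration of that (all of the names below are made up
for the example, this is not ublksrv code, and a libbpf skeleton is
assumed on the daemon side), the daemon could publish the socket fd
through a one-element array map and io_prep_prog could read it back
when it builds the sqe:

/* in the bpf prog: map shared with the daemon */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u32);	/* fd of the target socket / backing file */
} target_fd_map SEC(".maps");

static __always_inline int target_fd(void)
{
	__u32 key = 0;
	__u32 *fdp = bpf_map_lookup_elem(&target_fd_map, &key);

	return fdp ? (int)*fdp : -1;	/* used as sqe.fd in io_prep_prog */
}

/* in the daemon, once the target connection is ready */
static void publish_target_fd(struct ublk_bpf *skel, int sock_fd)
{
	__u32 key = 0, fd = sock_fd;

	bpf_map_update_elem(bpf_map__fd(skel->maps.target_fd_map),
			    &key, &fd, BPF_ANY);
}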

Thanks for review.

Regards,
Xiaoguang Wang

>
>
> Thanks, 
> Ming
Ming Lei Feb. 17, 2023, 3:02 a.m. UTC | #3
On Thu, Feb 16, 2023 at 08:12:18PM +0800, Xiaoguang Wang wrote:
> hello,
> 
> > On Wed, Feb 15, 2023 at 08:41:22AM +0800, Xiaoguang Wang wrote:
> >> Currently only one ebpf helper, bpf_ublk_queue_sqe(), is added. A ublksrv
> >> target can use it to write an ebpf prog that supports ublk kernel &
> >> userspace zero copy; please see the ublksrv test code for more info.
> >>
> >>  	 */
> >> +	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
> >> +		return rq_bytes;
> > Can you explain a bit why READ isn't supported? Because WRITE zero
> > copy is supposed to be supported easily with splice based approach,
> > and I am more interested in READ zc actually.
> No special reason, READ op can also be supported. I'll
> add this support in patch set v2.
> For this RFC patch set, I just tried to show the idea, so
> I must admit that current codes are not mature enough :)

OK.

> 
> >
> >> +
> >>  	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
> >>  		return rq_bytes;
> >>  
> >> @@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
> >>  	}
> >>  }
> >>  
> >>
> >> +	kbuf->bvec = bvec;
> >> +	rq_for_each_bvec(tmp, rq, rq_iter) {
> >> +		*bvec = tmp;
> >> +		bvec++;
> >> +	}
> >> +
> >> +	kbuf->count = blk_rq_bytes(rq);
> >> +	kbuf->nr_bvecs = nr_bvec;
> >> +	data->kbuf = kbuf;
> >> +	return 0;
> > bio/req bvec table is immutable, so here you can pass its reference
> > to kbuf directly.
> Yeah, thanks.

Also, if the request has multiple bios, you either need to submit
multiple sqes or copy all the bvecs into a single table. In the case
of a single bio, the table reference can be used directly.
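
Something along these lines is what I mean (totally untested sketch;
the io_mapped_kbuf fields are taken from this series, and
ublk_copy_bvecs() is just a placeholder for the multi-bio copy
fallback):

static int ublk_init_uring_kbuf(struct request *rq)
{
	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
	struct io_mapped_kbuf *kbuf;

	kbuf = kmalloc(sizeof(*kbuf), GFP_NOIO);
	if (!kbuf)
		return -ENOMEM;

	if (!rq->bio->bi_next) {
		/* single bio: reference the immutable bvec table directly */
		kbuf->bvec = rq->bio->bi_io_vec;
		kbuf->nr_bvecs = rq->bio->bi_vcnt;
	} else {
		/* multiple bios: copy the bvecs into one table, or submit
		 * one sqe per bio
		 */
		if (ublk_copy_bvecs(kbuf, rq)) {
			kfree(kbuf);
			return -ENOMEM;
		}
	}

	kbuf->count = blk_rq_bytes(rq);
	data->kbuf = kbuf;
	return 0;
}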

> 
> >
> >> +}
> >> +
> >> +static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
> >> +{
> >> +	int err;
> >> +	struct ublk_device *ub = ubq->dev;
> >> +	struct bpf_prog *prog = ub->io_prep_prog;
> >> +	struct ublk_io_bpf_ctx *bpf_ctx;
> >> +
> >> +	if (!prog)
> >> +		return 0;
> >> +
> >> +	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
> >> +	if (!bpf_ctx)
> >> +		return -EIO;
> >> +
> >> +	err = ublk_init_uring_kbuf(rq);
> >> +	if (err < 0) {
> >> +		kfree(bpf_ctx);
> >> +		return -EIO;
> >> +	}
> >> +	bpf_ctx->ub = ub;
> >> +	bpf_ctx->ctx.q_id = ubq->q_id;
> >> +	bpf_ctx->ctx.tag = rq->tag;
> >> +	bpf_ctx->ctx.op = req_op(rq);
> >> +	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
> >> +	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);
> > The above is for setting up target io parameter, which is supposed
> > to be from userspace, cause it is result of user space logic. If
> > these parameters are from kernel, the whole logic has to be done
> > in io_prep_prog.
> Yeah, it's designed that io_prep_prog implements user space
> io logic.

That could be the biggest weakness of this approach: people really
want to implement complicated logic in userspace, which should be the
biggest value of ublk, but now it seems you move kernel C programming
into ebpf userspace programming, and I don't think ebpf is good at
handling complicated userspace logic.

> 
> >
> >> +	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
> >> +
> >> +	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
> >> +	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
> >> +		kfree(bpf_ctx);
> > task_work_add() is only available in case of ublk builtin.
> Yeah, I'm thinking how to work around it.
> 
> >
> >> +	return 0;
> >> +}
> >> +
> >>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>  		const struct blk_mq_queue_data *bd)
> >>  {
> >> @@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>  	if (unlikely(res != BLK_STS_OK))
> >>  		return BLK_STS_IOERR;
> >>  
> >> +	/* Currently just for test. */
> >> +	ublk_run_bpf_prog(ubq, rq);
> > Can you explain the above comment a bit? When is the io_prep_prog called
> > in the non-test version? Or can you post the non-test version in list
> > for review.
> Forgot to delete stale comments, sorry. I'm writing v2 patch set,

OK, got it, so it looks like ublk_run_bpf_prog is designed to run two
progs loaded from two control commands.

> 
> > Here it is the key for understanding the whole idea, especially when
> > is io_prep_prog called finally? How to pass parameters to io_prep_prog?
> Let me explain more about the design:
> io_prep_prog has two types of parameters:
> 1) its call argument: struct ublk_bpf_ctx, see ublk.bpf.c.
> ublk_bpf_ctx will describe one kernel io requests about
> its op, qid, sectors info. io_prep_prog uses these info to
> map target io.
> 2) ebpf map structure, user space daemon can use map
> structure to pass much information from user space to
> io_prep_prog, which will help it to initialize target io if necessary.
> 
> io_prep_prog is called when ublk_queue_rq() is called, this bpf
> prog will initialize one or more sqes according to user logic, and
> io_prep_prog will put these sqes in an ebpf map structure, then
> execute a task_work_add() to notify ubq_daemon to execute
> io_submit_prog. Note, we can not call io_uring_submit_sqe()
> in task context that calls ublk_queue_rq(), that context does not
> have io_uring instance owned by ubq_daemon.
> Later ubq_daemon will call io_submit_prog to submit sqes.

Submitting sqes from the kernel looks interesting, but I guess
performance may be hurt, given that plugging (batching) can't be
applied any more, which is supposed to affect io perf a lot.



Thanks,
Ming
Ming Lei Feb. 17, 2023, 10:46 a.m. UTC | #4
On Fri, Feb 17, 2023 at 11:02:14AM +0800, Ming Lei wrote:
> On Thu, Feb 16, 2023 at 08:12:18PM +0800, Xiaoguang Wang wrote:
> > hello,

...

> > io_prep_prog is called when ublk_queue_rq() is called, this bpf
> > prog will initialize one or more sqes according to user logic, and
> > io_prep_prog will put these sqes in an ebpf map structure, then
> > execute a task_work_add() to notify ubq_daemon to execute
> > io_submit_prog. Note, we can not call io_uring_submit_sqe()
> > in task context that calls ublk_queue_rq(), that context does not
> > have io_uring instance owned by ubq_daemon.
> > Later ubq_daemon will call io_submit_prog to submit sqes.
> 
> Submitting sqe from kernel looks interesting, but I guess
> performance may be hurt, given plugging(batching) can't be applied
> any more, which is supposed to affect io perf a lot.

If submitting SQEs in the kernel is really doable, maybe we can add
another command, such as UBLK_IO_SUBMIT_SQE (just like
UBLK_IO_NEED_GET_DATA), pass the SQE built by the server (which
represents part of the user logic result) as the io_uring command
payload, and ask the ublk driver to build the buffer for this SQE and
then submit it in the kernel.
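
Roughly something like this (every name and value below is invented,
just to show the shape of the idea):

/* new io command next to UBLK_IO_NEED_GET_DATA; value is illustrative */
#define UBLK_IO_SUBMIT_SQE	0x23

/* uring_cmd payload built by the ublk server */
struct ublksrv_io_submit_sqe {
	__u16	q_id;
	__u16	tag;		/* ublk request whose pages back the SQE buffer */
	__u32	nr_sqes;
	struct io_uring_sqe sqes[];	/* driver fills the buffer, then submits */
};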

But there is an SQE ordering problem: net usually requires SQEs to be
linked and submitted in order, and with this approach it becomes hard
to maintain the SQE order (some linked in userspace, and some in the
kernel).

Thanks,
Ming
Xiaoguang Wang Feb. 22, 2023, 2:13 p.m. UTC | #5
hi,

I spent some time writing v2, especially thinking about how to work
around task_work_add() not being exported, so sorry for the late
response.
>
>>> The above is for setting up target io parameter, which is supposed
>>> to be from userspace, cause it is result of user space logic. If
>>> these parameters are from kernel, the whole logic has to be done
>>> in io_prep_prog.
>> Yeah, it's designed that io_prep_prog implements user space
>> io logic.
> That could be the biggest weakness of this approach, because people
> really want to implement complicated logic in userspace, which should
> be the biggest value of ublk, but now seems you move kernel C
> programming into ebpf userspace programming, I don't think ebpf
> is good at handling complicated userspace logic.
Absolutely agree with you, ebpf has strict programming rules; I spent
more time than I initially expected to support the loop target ebpf
prog (ublk.bpf.c). Later I'll try to collaborate with my colleagues to
see whether we can move their userspace logic, at least partially,
into ebpf progs.
 
>> io_prep_prog is called when ublk_queue_rq() is called, this bpf
>> prog will initialize one or more sqes according to user logic, and
>> io_prep_prog will put these sqes in an ebpf map structure, then
>> execute a task_work_add() to notify ubq_daemon to execute
>> io_submit_prog. Note, we can not call io_uring_submit_sqe()
>> in task context that calls ublk_queue_rq(), that context does not
>> have io_uring instance owned by ubq_daemon.
>> Later ubq_daemon will call io_submit_prog to submit sqes.
> Submitting sqe from kernel looks interesting, but I guess
> performance may be hurt, given plugging(batching) can't be applied
> any more, which is supposed to affect io perf a lot.
Yes, agreed, but I haven't had much time to improve this yet.
Currently, I mainly try to use this feature for large ios, to reduce
the memory copy overhead, which consumes a lot of cpu; our clients
really hope we can reduce it.

Regards,
Xiaoguang Wang

>
>
>
> Thanks,
> Ming

Patch

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b628e9eaefa6..44c289b72864 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -61,6 +61,7 @@ 
 struct ublk_rq_data {
 	struct llist_node node;
 	struct callback_head work;
+	struct io_mapped_kbuf *kbuf;
 };
 
 struct ublk_uring_cmd_pdu {
@@ -163,6 +164,9 @@  struct ublk_device {
 	unsigned int		nr_queues_ready;
 	atomic_t		nr_aborted_queues;
 
+	struct bpf_prog		*io_prep_prog;
+	struct bpf_prog		*io_submit_prog;
+
 	/*
 	 * Our ubq->daemon may be killed without any notification, so
 	 * monitor each queue's daemon periodically
@@ -189,10 +193,46 @@  static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+struct ublk_io_bpf_ctx {
+	struct ublk_bpf_ctx ctx;
+	struct ublk_device *ub;
+	struct callback_head work;
+};
+
+BPF_CALL_4(bpf_ublk_queue_sqe, struct ublk_io_bpf_ctx *, bpf_ctx,
+	   struct io_uring_sqe *, sqe, u32, sqe_len, u32, fd)
+{
+	struct request *rq;
+	struct ublk_rq_data *data;
+	struct io_mapped_kbuf *kbuf;
+	u16 q_id = bpf_ctx->ctx.q_id;
+	u16 tag = bpf_ctx->ctx.tag;
+
+	rq = blk_mq_tag_to_rq(bpf_ctx->ub->tag_set.tags[q_id], tag);
+	data = blk_mq_rq_to_pdu(rq);
+	kbuf = data->kbuf;
+	io_uring_submit_sqe(fd, sqe, sqe_len, kbuf);
+	return 0;
+}
+
+const struct bpf_func_proto ublk_bpf_queue_sqe_proto = {
+	.func = bpf_ublk_queue_sqe,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_ANYTHING,
+	.arg2_type = ARG_ANYTHING,
+	.arg3_type = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 ublk_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	switch (func_id) {
+	case BPF_FUNC_ublk_queue_sqe:
+		return &ublk_bpf_queue_sqe_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
 }
 
 static bool ublk_bpf_is_valid_access(int off, int size,
@@ -200,6 +240,23 @@  static bool ublk_bpf_is_valid_access(int off, int size,
 			const struct bpf_prog *prog,
 			struct bpf_insn_access_aux *info)
 {
+	if (off < 0 || off >= sizeof(struct ublk_bpf_ctx))
+		return false;
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case offsetof(struct ublk_bpf_ctx, q_id):
+		return size == sizeof_field(struct ublk_bpf_ctx, q_id);
+	case offsetof(struct ublk_bpf_ctx, tag):
+		return size == sizeof_field(struct ublk_bpf_ctx, tag);
+	case offsetof(struct ublk_bpf_ctx, op):
+		return size == sizeof_field(struct ublk_bpf_ctx, op);
+	case offsetof(struct ublk_bpf_ctx, nr_sectors):
+		return size == sizeof_field(struct ublk_bpf_ctx, nr_sectors);
+	case offsetof(struct ublk_bpf_ctx, start_sector):
+		return size == sizeof_field(struct ublk_bpf_ctx, start_sector);
+	}
 	return false;
 }
 
@@ -324,7 +381,7 @@  static void ublk_put_device(struct ublk_device *ub)
 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
 		int qid)
 {
-       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
 }
 
 static inline bool ublk_rq_has_data(const struct request *rq)
@@ -492,12 +549,16 @@  static inline int ublk_copy_user_pages(struct ublk_map_data *data,
 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 		struct ublk_io *io)
 {
+	struct ublk_device *ub = ubq->dev;
 	const unsigned int rq_bytes = blk_rq_bytes(req);
 	/*
 	 * no zero copy, we delay copy WRITE request data into ublksrv
 	 * context and the big benefit is that pinning pages in current
 	 * context is pretty fast, see ublk_pin_user_pages
 	 */
+	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
+		return rq_bytes;
+
 	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
 		return rq_bytes;
 
@@ -860,6 +921,89 @@  static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	}
 }
 
+static void ublk_bpf_io_submit_fn(struct callback_head *work)
+{
+	struct ublk_io_bpf_ctx *bpf_ctx = container_of(work,
+			struct ublk_io_bpf_ctx, work);
+
+	if (bpf_ctx->ub->io_submit_prog)
+		bpf_prog_run_pin_on_cpu(bpf_ctx->ub->io_submit_prog, bpf_ctx);
+	kfree(bpf_ctx);
+}
+
+static int ublk_init_uring_kbuf(struct request *rq)
+{
+	struct bio_vec *bvec;
+	struct req_iterator rq_iter;
+	struct bio_vec tmp;
+	int nr_bvec = 0;
+	struct io_mapped_kbuf *kbuf;
+	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+
+	/* Drop previous allocation */
+	if (data->kbuf) {
+		kfree(data->kbuf->bvec);
+		kfree(data->kbuf);
+		data->kbuf = NULL;
+	}
+
+	kbuf = kmalloc(sizeof(struct io_mapped_kbuf), GFP_NOIO);
+	if (!kbuf)
+		return -EIO;
+
+	rq_for_each_bvec(tmp, rq, rq_iter)
+		nr_bvec++;
+
+	bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO);
+	if (!bvec) {
+		kfree(kbuf);
+		return -EIO;
+	}
+	kbuf->bvec = bvec;
+	rq_for_each_bvec(tmp, rq, rq_iter) {
+		*bvec = tmp;
+		bvec++;
+	}
+
+	kbuf->count = blk_rq_bytes(rq);
+	kbuf->nr_bvecs = nr_bvec;
+	data->kbuf = kbuf;
+	return 0;
+}
+
+static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
+{
+	int err;
+	struct ublk_device *ub = ubq->dev;
+	struct bpf_prog *prog = ub->io_prep_prog;
+	struct ublk_io_bpf_ctx *bpf_ctx;
+
+	if (!prog)
+		return 0;
+
+	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
+	if (!bpf_ctx)
+		return -EIO;
+
+	err = ublk_init_uring_kbuf(rq);
+	if (err < 0) {
+		kfree(bpf_ctx);
+		return -EIO;
+	}
+	bpf_ctx->ub = ub;
+	bpf_ctx->ctx.q_id = ubq->q_id;
+	bpf_ctx->ctx.tag = rq->tag;
+	bpf_ctx->ctx.op = req_op(rq);
+	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
+	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);
+	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
+
+	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
+	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
+		kfree(bpf_ctx);
+	return 0;
+}
+
 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -872,6 +1016,9 @@  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (unlikely(res != BLK_STS_OK))
 		return BLK_STS_IOERR;
 
+	/* Currently just for test. */
+	ublk_run_bpf_prog(ubq, rq);
+
 	/* With recovery feature enabled, force_abort is set in
 	 * ublk_stop_dev() before calling del_gendisk(). We have to
 	 * abort all requeued and new rqs here to let del_gendisk()
@@ -2009,6 +2156,56 @@  static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd)
 	return ret;
 }
 
+static int ublk_ctrl_reg_bpf_prog(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublk_device *ub;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	mutex_lock(&ub->mutex);
+	prog = bpf_prog_get_type(header->data[0], BPF_PROG_TYPE_UBLK);
+	if (IS_ERR(prog)) {
+		ret = PTR_ERR(prog);
+		goto out_unlock;
+	}
+	ub->io_prep_prog = prog;
+
+	prog = bpf_prog_get_type(header->data[1], BPF_PROG_TYPE_UBLK);
+	if (IS_ERR(prog)) {
+		ret = PTR_ERR(prog);
+		goto out_unlock;
+	}
+	ub->io_submit_prog = prog;
+
+out_unlock:
+	mutex_unlock(&ub->mutex);
+	ublk_put_device(ub);
+	return ret;
+}
+
+static int ublk_ctrl_unreg_bpf_prog(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublk_device *ub;
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	mutex_lock(&ub->mutex);
+	bpf_prog_put(ub->io_prep_prog);
+	bpf_prog_put(ub->io_submit_prog);
+	ub->io_prep_prog = NULL;
+	ub->io_submit_prog = NULL;
+	mutex_unlock(&ub->mutex);
+	ublk_put_device(ub);
+	return 0;
+}
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
@@ -2059,6 +2256,12 @@  static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	case UBLK_CMD_END_USER_RECOVERY:
 		ret = ublk_ctrl_end_recovery(cmd);
 		break;
+	case UBLK_CMD_REG_BPF_PROG:
+		ret = ublk_ctrl_reg_bpf_prog(cmd);
+		break;
+	case UBLK_CMD_UNREG_BPF_PROG:
+		ret = ublk_ctrl_unreg_bpf_prog(cmd);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 515b7b995b3a..578d65e9f30e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5699,6 +5699,7 @@  union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)		\
 	FN(cgrp_storage_get, 210, ##ctx)		\
 	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(ublk_queue_sqe, 212, ##ctx)			\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 8f88e3a29998..a43b1864de51 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -17,6 +17,8 @@ 
 #define	UBLK_CMD_STOP_DEV	0x07
 #define	UBLK_CMD_SET_PARAMS	0x08
 #define	UBLK_CMD_GET_PARAMS	0x09
+#define UBLK_CMD_REG_BPF_PROG		0x0a
+#define UBLK_CMD_UNREG_BPF_PROG		0x0b
 #define	UBLK_CMD_START_USER_RECOVERY	0x10
 #define	UBLK_CMD_END_USER_RECOVERY	0x11
 /*
@@ -230,4 +232,13 @@  struct ublk_params {
 	struct ublk_param_discard	discard;
 };
 
+struct ublk_bpf_ctx {
+	__u32	t_val;
+	__u16	q_id;
+	__u16	tag;
+	__u8	op;
+	__u32	nr_sectors;
+	__u64	start_sector;
+};
+
 #endif
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index e8d90829f23e..f8672294e145 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -700,6 +700,8 @@  class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct ublk_io_bpf_ctx',
+            'struct io_uring_sqe',
     ]
     known_types = {
             '...',
@@ -755,6 +757,8 @@  class PrinterHelpers(Printer):
             'const struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct ublk_io_bpf_ctx',
+            'struct io_uring_sqe',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 515b7b995b3a..530094246e2a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5485,6 +5485,13 @@  union bpf_attr {
  *		0 on success.
  *
  *		**-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * u64 bpf_ublk_queue_sqe(struct ublk_io_bpf_ctx *ctx, struct io_uring_sqe *sqe, u32 offset, u32 len)
+ *	Description
+ *		Submit ublk io requests.
+ *	Return
+ *		0 on success.
+ *
  */
 #define ___BPF_FUNC_MAPPER(FN, ctx...)			\
 	FN(unspec, 0, ##ctx)				\
@@ -5699,6 +5706,7 @@  union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)		\
 	FN(cgrp_storage_get, 210, ##ctx)		\
 	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(ublk_queue_sqe, 212, ##ctx)			\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't