@@ -34,3 +34,19 @@ config BLKDEV_UBLK_LEGACY_OPCODES
Say N if you don't want to support legacy command opcode. It is
suggested to enable N if your application(ublk server) switches to
ioctl command encoding.
+
+config UBLK_BPF
+ bool "UBLK-BPF support"
+ depends on BPF
+ depends on BLK_DEV_UBLK
+ help
+ This option enables eBPF program support in the ublk subsystem.
+ eBPF programs can handle the fast IO code path directly in kernel
+ space and avoid switching to the ublk daemon's userspace context;
+ meanwhile zero copy can be supported directly.
+
+ Usually the target code needs to be partitioned into two parts: the
+ fast io code path, which runs as an eBPF prog in kernel context, and
+ the slow & complicated meta/admin code path, which runs in the ublk
+ daemon's userspace context. Efficient bpf maps can be used for
+ communication between the userspace side and the kernel bpf prog.
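As a hedged illustration of the split described in the help text above (a sketch only: the map layout, helper names and chunking are assumptions made for this example, not part of the patch), the slow userspace path of a target could populate a BPF hash map that the kernel-side prog consults on the fast io path:

/* Sketch: lookup table shared between the ublk daemon (userspace) and the
 * kernel-side bpf prog.  Assumes a libbpf-style BPF C object built against
 * vmlinux.h; all names are illustrative only.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1 << 16);
	__type(key, __u64);		/* virtual LBA */
	__type(value, __u64);		/* host LBA */
} lba_map SEC(".maps");

/* fast path helper: resolve the mapping in kernel context */
static __always_inline bool lba_mapped(__u64 vlba, __u64 *hlba)
{
	__u64 *val = bpf_map_lookup_elem(&lba_map, &vlba);

	if (!val)
		return false;	/* miss: let the daemon handle the meta IO */
	*hlba = *val;
	return true;
}

char LICENSE[] SEC("license") = "GPL";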
@@ -4,4 +4,7 @@
ccflags-y += -I$(src)
ublk_drv-$(CONFIG_BLK_DEV_UBLK) := main.o
+ifeq ($(CONFIG_UBLK_BPF), y)
+ublk_drv-$(CONFIG_BLK_DEV_UBLK) += bpf_ops.o
+endif
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
new file mode 100644
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef UBLK_INT_BPF_HEADER
+#define UBLK_INT_BPF_HEADER
+
+#include "bpf_reg.h"
+
+typedef unsigned long ublk_bpf_return_t;
+typedef ublk_bpf_return_t (*queue_io_cmd_t)(struct ublk_bpf_io *io, unsigned int);
+typedef void (*release_io_cmd_t)(struct ublk_bpf_io *io);
+
+#ifdef CONFIG_UBLK_BPF
+#include <linux/filter.h>
+
+/**
+ * enum ublk_bpf_disposition - how the bpf prog disposes of the io command
+ *
+ * @UBLK_BPF_IO_QUEUED: io command has been queued completely by the bpf prog,
+ *	so this cmd doesn't need to be forwarded to the ublk daemon any more
+ * @UBLK_BPF_IO_REDIRECT: io command can't be queued by the bpf prog, so this
+ *	cmd will be forwarded to the ublk daemon
+ * @UBLK_BPF_IO_CONTINUE: io command is being queued and can be disposed of
+ *	further by the bpf prog, so the bpf callback will be called again
+ */
+enum ublk_bpf_disposition {
+ UBLK_BPF_IO_QUEUED = 0,
+ UBLK_BPF_IO_REDIRECT,
+ UBLK_BPF_IO_CONTINUE,
+};
+
+/**
+ * struct ublk_bpf_ops - A BPF struct_ops of callbacks allowing a ublk
+ * target to be implemented from a bpf program
+ * @id: ops id
+ * @queue_io_cmd: callback for queuing an io command in ublk io context
+ * @queue_io_cmd_daemon: callback for queuing an io command in ublk daemon context
+ * @release_io_cmd: callback invoked when the io command reference drops to zero
+ */
+struct ublk_bpf_ops {
+ /* struct_ops id, used for ublk device to attach prog */
+ unsigned int id;
+
+ /* queue io command from ublk io context, can't be sleepable */
+ queue_io_cmd_t queue_io_cmd;
+
+ /* queue io command from target io daemon context, can be sleepable */
+ queue_io_cmd_t queue_io_cmd_daemon;
+
+ /* called when the io command reference drops to zero, can't be sleepable */
+ release_io_cmd_t release_io_cmd;
+
+ /* private: don't show in doc, must be the last field */
+ struct bpf_prog_provider provider;
+};
+
+#define UBLK_BPF_DISPOSITION_BITS (4)
+#define UBLK_BPF_DISPOSITION_SHIFT (BITS_PER_LONG - UBLK_BPF_DISPOSITION_BITS)
+
+static inline enum ublk_bpf_disposition ublk_bpf_get_disposition(ublk_bpf_return_t ret)
+{
+ return ret >> UBLK_BPF_DISPOSITION_SHIFT;
+}
+
+static inline unsigned int ublk_bpf_get_return_bytes(ublk_bpf_return_t ret)
+{
+ return ret & ((1UL << UBLK_BPF_DISPOSITION_SHIFT) - 1);
+}
+
+static inline ublk_bpf_return_t ublk_bpf_return_val(enum ublk_bpf_disposition rc,
+ unsigned int bytes)
+{
+ return (ublk_bpf_return_t) ((unsigned long)rc << UBLK_BPF_DISPOSITION_SHIFT) | bytes;
+}
+
+static inline struct request *ublk_bpf_get_req(const struct ublk_bpf_io *io)
+{
+ struct ublk_rq_data *data = container_of(io, struct ublk_rq_data, bpf_data);
+ struct request *req = blk_mq_rq_from_pdu(data);
+
+ return req;
+}
+
+static inline void ublk_bpf_io_dec_ref(struct ublk_bpf_io *io)
+{
+ if (refcount_dec_and_test(&io->ref)) {
+ struct request *req = ublk_bpf_get_req(io);
+
+ if (req->mq_hctx) {
+ const struct ublk_queue *ubq = req->mq_hctx->driver_data;
+
+ if (ubq->bpf_ops && ubq->bpf_ops->release_io_cmd)
+ ubq->bpf_ops->release_io_cmd(io);
+ }
+
+ if (test_bit(UBLK_BPF_IO_COMPLETED, &io->flags)) {
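+ /* paired with smp_wmb() in ublk_bpf_complete_io_cmd(): read io->res only after seeing the completed flag */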
+ smp_rmb();
+ __clear_bit(UBLK_BPF_IO_PREP, &io->flags);
+ __ublk_complete_rq_with_res(req, io->res);
+ }
+ }
+}
+
+static inline void ublk_bpf_complete_io_cmd(struct ublk_bpf_io *io, int res)
+{
+ io->res = res;
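+ /* paired with smp_rmb() in ublk_bpf_io_dec_ref(): publish io->res before setting the completed flag */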
+ smp_wmb();
+ set_bit(UBLK_BPF_IO_COMPLETED, &io->flags);
+ ublk_bpf_io_dec_ref(io);
+}
+
+
+bool ublk_run_bpf_handler(struct ublk_queue *ubq, struct request *req,
+ queue_io_cmd_t cb);
+
+/*
+ * Return true if bpf prog handled this io command, otherwise return false
+ * so that this io command will be forwarded to userspace
+ */
+static inline bool ublk_run_bpf_prog(struct ublk_queue *ubq,
+ struct request *req,
+ queue_io_cmd_t cb,
+ bool fail_on_null)
+{
+ if (likely(cb))
+ return ublk_run_bpf_handler(ubq, req, cb);
+
+ /* bpf prog is unregistered */
+ if (fail_on_null && !ubq->bpf_ops) {
+ __ublk_complete_rq_with_res(req, -EOPNOTSUPP);
+ return true;
+ }
+
+ return false;
+}
+
+static inline queue_io_cmd_t ublk_get_bpf_io_cb(struct ublk_queue *ubq)
+{
+ return ubq->bpf_ops ? ubq->bpf_ops->queue_io_cmd : NULL;
+}
+
+static inline queue_io_cmd_t ublk_get_bpf_io_cb_daemon(struct ublk_queue *ubq)
+{
+ return ubq->bpf_ops ? ubq->bpf_ops->queue_io_cmd_daemon : NULL;
+}
+
+static inline queue_io_cmd_t ublk_get_bpf_any_io_cb(struct ublk_queue *ubq)
+{
+ if (ublk_get_bpf_io_cb(ubq))
+ return ublk_get_bpf_io_cb(ubq);
+
+ return ublk_get_bpf_io_cb_daemon(ubq);
+}
+
+int ublk_bpf_struct_ops_init(void);
+
+#else
+
+static inline bool ublk_run_bpf_prog(struct ublk_queue *ubq,
+ struct request *req,
+ queue_io_cmd_t cb,
+ bool fail_on_null)
+{
+ return false;
+}
+
+static inline queue_io_cmd_t ublk_get_bpf_io_cb(struct ublk_queue *ubq)
+{
+ return NULL;
+}
+
+static inline queue_io_cmd_t ublk_get_bpf_io_cb_daemon(struct ublk_queue *ubq)
+{
+ return NULL;
+}
+
+static inline queue_io_cmd_t ublk_get_bpf_any_io_cb(struct ublk_queue *ubq)
+{
+ return NULL;
+}
+
+static inline int ublk_bpf_struct_ops_init(void)
+{
+ return 0;
+}
+#endif
+#endif
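For illustration only (not part of the patch, and written against the kernel-internal helpers above rather than as a loadable bpf prog), a callback matching queue_io_cmd_t might compose its return values as below; the fixed 4KB chunking and the "complete with total bytes" behaviour are assumptions for this sketch of a null-like target:

/* Hypothetical queue_io_cmd_t callback: illustrates the disposition/bytes
 * encoding provided by ublk_bpf_return_val() and friends.  Assumes the
 * usual kernel headers (min() etc.) are available.
 */
static ublk_bpf_return_t example_queue_io_cmd(struct ublk_bpf_io *io,
					      unsigned int offset)
{
	const unsigned int total = io->iod->nr_sectors << 9;
	unsigned int chunk = min(total - offset, 4096U);

	if (chunk == total - offset) {
		/* last chunk: a real target would have transferred data by
		 * now; mark the whole command as handled in kernel
		 */
		ublk_bpf_complete_io_cmd(io, total);
		return ublk_bpf_return_val(UBLK_BPF_IO_QUEUED, 0);
	}

	/* more to do: the driver calls back with offset advanced by "chunk" */
	return ublk_bpf_return_val(UBLK_BPF_IO_CONTINUE, chunk);
}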
new file mode 100644
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Red Hat */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/xarray.h>
+
+#include "ublk.h"
+#include "bpf.h"
+
+static DEFINE_XARRAY(ublk_ops);
+static DEFINE_MUTEX(ublk_bpf_ops_lock);
+
+static bool ublk_bpf_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int ublk_bpf_ops_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ /* ublk prog can change nothing */
+ if (size > 0)
+ return -EACCES;
+
+ return NOT_INIT;
+}
+
+static const struct bpf_verifier_ops ublk_bpf_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = ublk_bpf_ops_is_valid_access,
+ .btf_struct_access = ublk_bpf_ops_btf_struct_access,
+};
+
+static int ublk_bpf_ops_init(struct btf *btf)
+{
+ return 0;
+}
+
+static int ublk_bpf_ops_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct ublk_bpf_ops, queue_io_cmd):
+ case offsetof(struct ublk_bpf_ops, release_io_cmd):
+ if (prog->sleepable)
+ return -EINVAL;
+ fallthrough;
+ case offsetof(struct ublk_bpf_ops, queue_io_cmd_daemon):
+ break;
+ default:
+ if (prog->sleepable)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ublk_bpf_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct ublk_bpf_ops *uops;
+ struct ublk_bpf_ops *kops;
+ u32 moff;
+
+ uops = (const struct ublk_bpf_ops *)udata;
+ kops = (struct ublk_bpf_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct ublk_bpf_ops, id):
+ /* For the "id" field, this function has to copy it and return 1
+ * to indicate that the data has been handled by the struct_ops
+ * type, or the verifier will reject the map if the value of this
+ * field is not zero.
+ */
+ kops->id = uops->id;
+ return 1;
+ }
+ return 0;
+}
+
+static int ublk_bpf_reg(void *kdata, struct bpf_link *link)
+{
+ struct ublk_bpf_ops *ops = kdata;
+ struct ublk_bpf_ops *curr;
+ int ret = -EBUSY;
+
+ mutex_lock(&ublk_bpf_ops_lock);
+ if (!xa_load(&ublk_ops, ops->id)) {
+ curr = kmalloc(sizeof(*curr), GFP_KERNEL);
+ if (curr) {
+ *curr = *ops;
+ bpf_prog_provider_init(&curr->provider);
+ ret = xa_err(xa_store(&ublk_ops, ops->id, curr, GFP_KERNEL));
+ } else {
+ ret = -ENOMEM;
+ }
+ }
+ mutex_unlock(&ublk_bpf_ops_lock);
+
+ return ret;
+}
+
+static void ublk_bpf_unreg(void *kdata, struct bpf_link *link)
+{
+ struct ublk_bpf_ops *ops = kdata;
+ struct ublk_bpf_ops *curr;
+ LIST_HEAD(consumer_list);
+ struct bpf_prog_consumer *consumer, *tmp;
+
+ mutex_lock(&ublk_bpf_ops_lock);
+ curr = xa_erase(&ublk_ops, ops->id);
+ if (curr)
+ list_splice_init(&curr->provider.list, &consumer_list);
+ mutex_unlock(&ublk_bpf_ops_lock);
+
+ list_for_each_entry_safe(consumer, tmp, &consumer_list, node)
+ bpf_prog_consumer_detach(consumer, true);
+ kfree(curr);
+}
+
+static void ublk_bpf_prep_io(struct ublk_bpf_io *io,
+ const struct ublksrv_io_desc *iod)
+{
+ io->flags = 0;
+ io->res = 0;
+ io->iod = iod;
+ __set_bit(UBLK_BPF_IO_PREP, &io->flags);
+ /* one is for submission, another is for completion */
+ refcount_set(&io->ref, 2);
+}
+
+/* Return true if io cmd is queued, otherwise forward it to userspace */
+bool ublk_run_bpf_handler(struct ublk_queue *ubq, struct request *req,
+ queue_io_cmd_t cb)
+{
+ ublk_bpf_return_t ret;
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+ struct ublk_bpf_io *bpf_io = &data->bpf_data;
+ const unsigned long total = iod->nr_sectors << 9;
+ unsigned int done = 0;
+ bool res = true;
+ int err;
+
+ if (!test_bit(UBLK_BPF_IO_PREP, &bpf_io->flags))
+ ublk_bpf_prep_io(bpf_io, iod);
+
+ do {
+ enum ublk_bpf_disposition rc;
+ unsigned int bytes;
+
+ ret = cb(bpf_io, done);
+ rc = ublk_bpf_get_disposition(ret);
+
+ if (rc == UBLK_BPF_IO_QUEUED)
+ goto exit;
+
+ if (rc == UBLK_BPF_IO_REDIRECT)
+ break;
+
+ if (unlikely(rc != UBLK_BPF_IO_CONTINUE)) {
+ printk_ratelimited(KERN_ERR "%s: unknown rc code %d\n",
+ __func__, rc);
+ err = -EINVAL;
+ goto fail;
+ }
+
+ bytes = ublk_bpf_get_return_bytes(ret);
+ if (unlikely((bytes & 511) || !bytes)) {
+ err = -EREMOTEIO;
+ goto fail;
+ } else if (unlikely(bytes > total - done)) {
+ err = -ENOSPC;
+ goto fail;
+ } else {
+ done += bytes;
+ }
+ } while (done < total);
+
+ /*
+ * If any bytes have been queued, we can't forward the command to
+ * userspace immediately because it is too complicated to support
+ * completion from both sides.
+ *
+ * But the request will be updated and retried after the queued
+ * part is completed, and then it can be forwarded to userspace too.
+ */
+ res = done > 0;
+ if (!res) {
+ /* will redirect to userspace, so forget bpf handling */
+ __clear_bit(UBLK_BPF_IO_PREP, &bpf_io->flags);
+ refcount_dec(&bpf_io->ref);
+ }
+ goto exit;
+fail:
+ res = true;
+ ublk_bpf_complete_io_cmd(bpf_io, err);
+exit:
+ ublk_bpf_io_dec_ref(bpf_io);
+ return res;
+}
+
+static ublk_bpf_return_t ublk_bpf_run_io_task(struct ublk_bpf_io *io,
+ unsigned int offset)
+{
+ return ublk_bpf_return_val(UBLK_BPF_IO_REDIRECT, 0);
+}
+
+static ublk_bpf_return_t ublk_bpf_queue_io_cmd(struct ublk_bpf_io *io,
+ unsigned int offset)
+{
+ return ublk_bpf_return_val(UBLK_BPF_IO_REDIRECT, 0);
+}
+
+static void ublk_bpf_release_io_cmd(struct ublk_bpf_io *io)
+{
+}
+
+static struct ublk_bpf_ops __bpf_ublk_bpf_ops = {
+ .queue_io_cmd = ublk_bpf_queue_io_cmd,
+ .queue_io_cmd_daemon = ublk_bpf_run_io_task,
+ .release_io_cmd = ublk_bpf_release_io_cmd,
+};
+
+static struct bpf_struct_ops bpf_ublk_bpf_ops = {
+ .verifier_ops = &ublk_bpf_verifier_ops,
+ .init = ublk_bpf_ops_init,
+ .check_member = ublk_bpf_ops_check_member,
+ .init_member = ublk_bpf_ops_init_member,
+ .reg = ublk_bpf_reg,
+ .unreg = ublk_bpf_unreg,
+ .name = "ublk_bpf_ops",
+ .cfi_stubs = &__bpf_ublk_bpf_ops,
+ .owner = THIS_MODULE,
+};
+
+int __init ublk_bpf_struct_ops_init(void)
+{
+ int err;
+
+ err = register_bpf_struct_ops(&bpf_ublk_bpf_ops, ublk_bpf_ops);
+ if (err)
+ pr_warn("error while registering ublk bpf struct ops: %d", err);
+
+ return 0;
+}
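The cfi_stubs above mirror what a loadable bpf object would provide. As a hedged sketch (not part of this patch), a minimal BPF C object could declare a ublk_bpf_ops struct_ops map using the generic libbpf struct_ops conventions; the section names, the local return-value macro, the "null" naming and the id value are assumptions, and the real loader/attach flow is defined elsewhere in this series:

/* Hedged sketch of a BPF-side ublk_bpf_ops implementation (illustrative). */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* local copy of the kernel's encoding (see drivers/block/ublk/bpf.h) */
#define UBLK_BPF_DISP_SHIFT	(64 - 4)
#define UBLK_BPF_RET(disp, bytes) \
	((((__u64)(disp)) << UBLK_BPF_DISP_SHIFT) | (bytes))

SEC("struct_ops/ublk_null_queue_io_cmd")
__u64 BPF_PROG(ublk_null_queue_io_cmd, struct ublk_bpf_io *io,
	       unsigned int offset)
{
	/* nothing handled in kernel yet: forward the io cmd to the daemon */
	return UBLK_BPF_RET(UBLK_BPF_IO_REDIRECT, 0);
}

SEC("struct_ops.s/ublk_null_queue_io_cmd_daemon")
__u64 BPF_PROG(ublk_null_queue_io_cmd_daemon, struct ublk_bpf_io *io,
	       unsigned int offset)
{
	return UBLK_BPF_RET(UBLK_BPF_IO_REDIRECT, 0);
}

SEC(".struct_ops.link")
struct ublk_bpf_ops null_ublk_bpf_ops = {
	.id			= 0,	/* placeholder ops id */
	.queue_io_cmd		= (void *)ublk_null_queue_io_cmd,
	.queue_io_cmd_daemon	= (void *)ublk_null_queue_io_cmd_daemon,
};

char LICENSE[] SEC("license") = "GPL";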
@@ -43,6 +43,7 @@
#include <linux/kref.h>
#include "ublk.h"
+#include "bpf.h"
static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
@@ -1061,6 +1062,10 @@ static inline void __ublk_rq_task_work(struct request *req,
mapped_bytes >> 9;
}
+ if (ublk_support_bpf(ubq) && ublk_run_bpf_prog(ubq, req,
+ ublk_get_bpf_io_cb_daemon(ubq), true))
+ return;
+
ublk_init_req_ref(ubq, req);
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
}
@@ -1088,6 +1093,10 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+ if (ublk_support_bpf(ubq) && ublk_run_bpf_prog(ubq, rq,
+ ublk_get_bpf_io_cb(ubq), false))
+ return;
+
if (llist_add(&data->node, &ubq->io_cmds)) {
struct ublk_io *io = &ubq->ios[rq->tag];
@@ -1265,8 +1274,24 @@ static void ublk_commit_completion(struct ublk_device *ub,
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ub_cmd->zone_append_lba;
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
+ if (likely(!blk_should_fake_timeout(req->q))) {
+ /*
+ * Userspace may have set up everything, but it can still hand the
+ * io back to the bpf prog by returning -EAGAIN; this provides a
+ * single bpf io handling fast path and should simplify things
+ * a lot.
+ */
+ if (ublk_support_bpf(ubq) && io->res == -EAGAIN) {
+ if (!ublk_run_bpf_prog(ubq, req,
+ ublk_get_bpf_any_io_cb(ubq), true)) {
+ /* give up now */
+ io->res = -EIO;
+ ublk_put_req_ref(ubq, req);
+ }
+ } else {
+ ublk_put_req_ref(ubq, req);
+ }
+ }
}
/*
@@ -33,10 +33,26 @@
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
+enum {
+ UBLK_BPF_IO_PREP = 0,
+ UBLK_BPF_IO_COMPLETED = 1,
+};
+
+struct ublk_bpf_io {
+ const struct ublksrv_io_desc *iod;
+ unsigned long flags;
+ refcount_t ref;
+ int res;
+};
+
struct ublk_rq_data {
struct llist_node node;
struct kref ref;
+
+#ifdef CONFIG_UBLK_BPF
+ struct ublk_bpf_io bpf_data;
+#endif
};
struct ublk_uring_cmd_pdu {
@@ -104,6 +120,10 @@ struct ublk_queue {
struct llist_head io_cmds;
+#ifdef CONFIG_UBLK_BPF
+ struct ublk_bpf_ops *bpf_ops;
+#endif
+
unsigned short force_abort:1;
unsigned short timeout:1;
unsigned short canceling:1;
@@ -161,8 +181,21 @@ static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
}
+static inline bool ublk_support_bpf(const struct ublk_queue *ubq)
+{
+ return false;
+}
+
struct ublk_device *ublk_get_device_from_id(int idx);
void ublk_put_device(struct ublk_device *ub);
void __ublk_complete_rq(struct request *req);
+static inline void __ublk_complete_rq_with_res(struct request *req, int res)
+{
+ struct ublk_queue *ubq = req->mq_hctx->driver_data;
+ struct ublk_io *io = &ubq->ios[req->tag];
+
+ io->res = res;
+ __ublk_complete_rq(req);
+}
#endif
Add struct_ops support for ublk, so that a struct_ops bpf prog can handle
ublk IO commands with an application-defined struct_ops.

The motivation for ublk-bpf follows:

1) support stacking ublk
- there are many 3rd party volume managers; ublk may be built over a ublk
  device to simplify the implementation, however multiple userspace-kernel
  context switches for handling one single IO can't be accepted from a
  performance point of view
- ublk-bpf can avoid the user-kernel context switch in most of the fast io
  path, so it makes ublk over ublk possible

2) complicated virtual block devices
- many complicated virtual block devices have an admin & meta code path and
  a normal io fast path; meta & admin IO handling is usually complicated,
  so it can be moved to userspace to relieve the development burden;
  meanwhile the IO fast path can be kept in kernel space for the sake of
  high performance
- bpf provides rich maps, which help a lot for communication between
  userspace and the prog, or between progs
- one typical example is qcow2, whose meta io handling can be moved to
  userspace while the fast io path is implemented with ublk-bpf: an
  efficient bpf map can be looked up first to see whether the virtual LBA
  -> host LBA mapping is present; the IO is handled with ublk-bpf if the
  mapping is hit, otherwise it is forwarded to userspace to deal with the
  meta IO

3) some simple high performance virtual devices
- such as null & loop, where the whole implementation can be done in a bpf
  prog

Export `struct ublk_bpf_ops` as a bpf struct_ops, so that a bpf prog can
implement callbacks for handling ublk io commands:

- if `UBLK_BPF_IO_QUEUED` is returned from ->queue_io_cmd() or
  ->queue_io_cmd_daemon(), this io command has been queued by the bpf prog,
  so it won't be forwarded to userspace

- if `UBLK_BPF_IO_REDIRECT` is returned from ->queue_io_cmd() or
  ->queue_io_cmd_daemon(), this io command will be forwarded to userspace

- if `UBLK_BPF_IO_CONTINUE` is returned from ->queue_io_cmd() or
  ->queue_io_cmd_daemon(), part of this io command has been queued, and
  `ublk_bpf_return_t` carries how many bytes were queued, so the ublk
  driver keeps calling the callback to queue the remaining bytes of this
  io command; this is helpful for implementing stacking devices by
  splitting the io command

Also ->release_io_cmd() is added to give the bpf prog a chance to be
notified that this io command is going to be released.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/block/ublk/Kconfig   |  16 +++
 drivers/block/ublk/Makefile  |   3 +
 drivers/block/ublk/bpf.h     | 184 ++++++++++++++++++++++++
 drivers/block/ublk/bpf_ops.c | 261 +++++++++++++++++++++++++++++++++++
 drivers/block/ublk/main.c    |  29 +++-
 drivers/block/ublk/ublk.h    |  33 +++++
 6 files changed, 524 insertions(+), 2 deletions(-)
 create mode 100644 drivers/block/ublk/bpf.h
 create mode 100644 drivers/block/ublk/bpf_ops.c
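As a usage sketch (not part of the patch), the ublk server side could register such a struct_ops map through libbpf before creating the ublk device; the skeleton header, the map name and the error handling below are hypothetical, and how the ublk device then refers to the ops "id" is defined elsewhere in this series:

/* Hypothetical userspace registration of the ublk_bpf_ops struct_ops map. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <bpf/libbpf.h>
#include "ublk_null.skel.h"	/* hypothetical skeleton for the bpf object */

int main(void)
{
	struct ublk_null *skel;
	struct bpf_link *link;

	skel = ublk_null__open_and_load();
	if (!skel)
		return 1;

	/* registration ends up in ublk_bpf_reg() on the kernel side */
	link = bpf_map__attach_struct_ops(skel->maps.null_ublk_bpf_ops);
	if (!link) {
		fprintf(stderr, "attach failed: %s\n", strerror(errno));
		ublk_null__destroy(skel);
		return 1;
	}

	/* ... create the ublk device referencing the ops id, run the daemon ... */

	bpf_link__destroy(link);
	ublk_null__destroy(skel);
	return 0;
}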