diff mbox series

[RFC,1/2] block: add support for redirecting IO completion through eBPF

Message ID 20191014122833.64908-2-houtao1@huawei.com (mailing list archive)
State New, archived
Headers show
Series block: use eBPF to redirect IO completion | expand

Commit Message

Hou Tao Oct. 14, 2019, 12:28 p.m. UTC
For the network stack, RPS (Receive Packet Steering) is used to
distribute network protocol processing from the hardware-interrupted CPU
to specific CPUs, alleviating the soft-irq load of the interrupted CPU.

For block layer, soft-irq (for single queue device) or hard-irq
(for multiple queue device) is used to handle IO completion, so
RPS will be useful when the soft-irq load or the hard-irq load
of a specific CPU is too high, or a specific CPU set is required
to handle IO completion.

Instead of setting the CPU set used for handling IO completion
through sysfs or procfs, we can attach an eBPF program to the
request-queue, provide some useful info (e.g., the CPU
which submits the request) to the program, and let the program
decide the proper CPU for IO completion handling.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 block/Makefile             |   2 +-
 block/blk-bpf.c            | 127 +++++++++++++++++++++++++++++++++++++
 block/blk-mq.c             |  22 +++++--
 block/blk-softirq.c        |  27 ++++++--
 include/linux/blkdev.h     |   3 +
 include/linux/bpf_blkdev.h |   9 +++
 include/linux/bpf_types.h  |   1 +
 include/uapi/linux/bpf.h   |   2 +
 kernel/bpf/syscall.c       |   9 +++
 9 files changed, 190 insertions(+), 12 deletions(-)
 create mode 100644 block/blk-bpf.c
 create mode 100644 include/linux/bpf_blkdev.h

Comments

Alexei Starovoitov Oct. 15, 2019, 9:04 p.m. UTC | #1
On Mon, Oct 14, 2019 at 5:21 AM Hou Tao <houtao1@huawei.com> wrote:
>
> For network stack, RPS, namely Receive Packet Steering, is used to
> distribute network protocol processing from hardware-interrupted CPU
> to specific CPUs and alleviating soft-irq load of the interrupted CPU.
>
> For block layer, soft-irq (for single queue device) or hard-irq
> (for multiple queue device) is used to handle IO completion, so
> RPS will be useful when the soft-irq load or the hard-irq load
> of a specific CPU is too high, or a specific CPU set is required
> to handle IO completion.
>
> Instead of setting the CPU set used for handling IO completion
> through sysfs or procfs, we can attach an eBPF program to the
> request-queue, provide some useful info (e.g., the CPU
> which submits the request) to the program, and let the program
> decides the proper CPU for IO completion handling.
>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
...
>
> +       rcu_read_lock();
> +       prog = rcu_dereference_protected(q->prog, 1);
> +       if (prog)
> +               bpf_ccpu = BPF_PROG_RUN(q->prog, NULL);
> +       rcu_read_unlock();
> +
>         cpu = get_cpu();
> -       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
> -               shared = cpus_share_cache(cpu, ctx->cpu);
> +       if (bpf_ccpu < 0 || !cpu_online(bpf_ccpu)) {
> +               ccpu = ctx->cpu;
> +               if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
> +                       shared = cpus_share_cache(cpu, ctx->cpu);
> +       } else
> +               ccpu = bpf_ccpu;
>
> -       if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
> +       if (cpu != ccpu && !shared && cpu_online(ccpu)) {
>                 rq->csd.func = __blk_mq_complete_request_remote;
>                 rq->csd.info = rq;
>                 rq->csd.flags = 0;
> -               smp_call_function_single_async(ctx->cpu, &rq->csd);
> +               smp_call_function_single_async(ccpu, &rq->csd);

Interesting idea.
Not sure whether such programability makes sense from
block layer point of view.

From bpf side having a program with NULL input context is
a bit odd. We never had such things in the past, so this patchset
won't work as-is.
Also no-input means that the program choices are quite limited.
Other than round robin and random I cannot come up with other
cpu selection ideas.
I suggest to do writable tracepoint here instead.
Take a look at trace_nbd_send_request.
BPF prog can write into 'request'.
For your use case it will be able to write into 'bpf_ccpu' local variable.
If you keep it as raw tracepoint and don't add the actual tracepoint
with TP_STRUCT__entry and TP_fast_assign then it won't be abi
and you can change it later or remove it altogether.
Hannes Reinecke Oct. 16, 2019, 7:05 a.m. UTC | #2
On 10/15/19 11:04 PM, Alexei Starovoitov wrote:
> On Mon, Oct 14, 2019 at 5:21 AM Hou Tao <houtao1@huawei.com> wrote:
>>
>> For network stack, RPS, namely Receive Packet Steering, is used to
>> distribute network protocol processing from hardware-interrupted CPU
>> to specific CPUs and alleviating soft-irq load of the interrupted CPU.
>>
>> For block layer, soft-irq (for single queue device) or hard-irq
>> (for multiple queue device) is used to handle IO completion, so
>> RPS will be useful when the soft-irq load or the hard-irq load
>> of a specific CPU is too high, or a specific CPU set is required
>> to handle IO completion.
>>
>> Instead of setting the CPU set used for handling IO completion
>> through sysfs or procfs, we can attach an eBPF program to the
>> request-queue, provide some useful info (e.g., the CPU
>> which submits the request) to the program, and let the program
>> decides the proper CPU for IO completion handling.
>>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ...
>>
>> +       rcu_read_lock();
>> +       prog = rcu_dereference_protected(q->prog, 1);
>> +       if (prog)
>> +               bpf_ccpu = BPF_PROG_RUN(q->prog, NULL);
>> +       rcu_read_unlock();
>> +
>>         cpu = get_cpu();
>> -       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
>> -               shared = cpus_share_cache(cpu, ctx->cpu);
>> +       if (bpf_ccpu < 0 || !cpu_online(bpf_ccpu)) {
>> +               ccpu = ctx->cpu;
>> +               if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
>> +                       shared = cpus_share_cache(cpu, ctx->cpu);
>> +       } else
>> +               ccpu = bpf_ccpu;
>>
>> -       if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
>> +       if (cpu != ccpu && !shared && cpu_online(ccpu)) {
>>                 rq->csd.func = __blk_mq_complete_request_remote;
>>                 rq->csd.info = rq;
>>                 rq->csd.flags = 0;
>> -               smp_call_function_single_async(ctx->cpu, &rq->csd);
>> +               smp_call_function_single_async(ccpu, &rq->csd);
> 
> Interesting idea.
> Not sure whether such programability makes sense from
> block layer point of view.
> 
> From bpf side having a program with NULL input context is
> a bit odd. We never had such things in the past, so this patchset
> won't work as-is.
> Also no-input means that the program choices are quite limited.
> Other than round robin and random I cannot come up with other
> cpu selection ideas.
> I suggest to do writable tracepoint here instead.
> Take a look at trace_nbd_send_request.
> BPF prog can write into 'request'.
> For your use case it will be able to write into 'bpf_ccpu' local variable.
> If you keep it as raw tracepoint and don't add the actual tracepoint
> with TP_STRUCT__entry and TP_fast_assign then it won't be abi
> and you can change it later or remove it altogether.
> 
That basically was my idea, too.

Actually I was coming from a different angle, namely trying to figure
out how we could do generic error injection in the block layer.
eBPF would be one way of doing it, kprobes another.

But writable trace events ... I'll have to check if we can leverage that
here, too.

Cheers,

Hannes
Hou Tao Oct. 21, 2019, 1:42 p.m. UTC | #3
Hi,

On 2019/10/16 5:04, Alexei Starovoitov wrote:
> On Mon, Oct 14, 2019 at 5:21 AM Hou Tao <houtao1@huawei.com> wrote:
>>
>> For network stack, RPS, namely Receive Packet Steering, is used to
>> distribute network protocol processing from hardware-interrupted CPU
>> to specific CPUs and alleviating soft-irq load of the interrupted CPU.
>>
>> For block layer, soft-irq (for single queue device) or hard-irq
>> (for multiple queue device) is used to handle IO completion, so
>> RPS will be useful when the soft-irq load or the hard-irq load
>> of a specific CPU is too high, or a specific CPU set is required
>> to handle IO completion.
>>
>> Instead of setting the CPU set used for handling IO completion
>> through sysfs or procfs, we can attach an eBPF program to the
>> request-queue, provide some useful info (e.g., the CPU
>> which submits the request) to the program, and let the program
>> decides the proper CPU for IO completion handling.
>>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ...
>>
>> +       rcu_read_lock();
>> +       prog = rcu_dereference_protected(q->prog, 1);
>> +       if (prog)
>> +               bpf_ccpu = BPF_PROG_RUN(q->prog, NULL);
>> +       rcu_read_unlock();
>> +
>>         cpu = get_cpu();
>> -       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
>> -               shared = cpus_share_cache(cpu, ctx->cpu);
>> +       if (bpf_ccpu < 0 || !cpu_online(bpf_ccpu)) {
>> +               ccpu = ctx->cpu;
>> +               if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
>> +                       shared = cpus_share_cache(cpu, ctx->cpu);
>> +       } else
>> +               ccpu = bpf_ccpu;
>>
>> -       if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
>> +       if (cpu != ccpu && !shared && cpu_online(ccpu)) {
>>                 rq->csd.func = __blk_mq_complete_request_remote;
>>                 rq->csd.info = rq;
>>                 rq->csd.flags = 0;
>> -               smp_call_function_single_async(ctx->cpu, &rq->csd);
>> +               smp_call_function_single_async(ccpu, &rq->csd);
> 
> Interesting idea.
> Not sure whether such programability makes sense from
> block layer point of view.
> 
> From bpf side having a program with NULL input context is
> a bit odd. We never had such things in the past, so this patchset
> won't work as-is.
No, it just works.

> Also no-input means that the program choices are quite limited.
> Other than round robin and random I cannot come up with other
> cpu selection ideas.
> I suggest to do writable tracepoint here instead.
> Take a look at trace_nbd_send_request.
> BPF prog can write into 'request'.
> For your use case it will be able to write into 'bpf_ccpu' local variable.
> If you keep it as raw tracepoint and don't add the actual tracepoint
> with TP_STRUCT__entry and TP_fast_assign then it won't be abi
> and you can change it later or remove it altogether.
> 
Your suggestion is much simpler, so there will be no need to add a new
program type; all that needs to be done is adding a raw tracepoint,
moving bpf_ccpu into struct request, and letting a BPF program modify it.

I will try and thanks for your suggestions.

Regards,
Tao

> .
>
Bart Van Assche Oct. 21, 2019, 1:48 p.m. UTC | #4
On 10/21/19 6:42 AM, Hou Tao wrote:
> Your suggestion is much simpler, so there will be no need for adding a new
> program type, and all things need to be done are adding a raw tracepoint,
> moving bpf_ccpu into struct request, and letting a BPF program to modify it.

blk-mq already supports processing completions on the CPU that submitted
a request so it's not clear to me why any changes in the block layer are
being proposed for redirecting I/O completions?

Thanks,

Bart.
Jens Axboe Oct. 21, 2019, 2:45 p.m. UTC | #5
On 10/21/19 7:48 AM, Bart Van Assche wrote:
> On 10/21/19 6:42 AM, Hou Tao wrote:
>> Your suggestion is much simpler, so there will be no need for adding a new
>> program type, and all things need to be done are adding a raw tracepoint,
>> moving bpf_ccpu into struct request, and letting a BPF program to modify it.
> 
> blk-mq already supports processing completions on the CPU that submitted
> a request so it's not clear to me why any changes in the block layer are
> being proposed for redirecting I/O completions?

That's where I'm getting confused as well. I'm not against adding BPF
functionality to the block layer, but this one seems a bit contrived.
diff mbox series

Patch

diff --git a/block/Makefile b/block/Makefile
index 9ef57ace90d4..0adb0f655e8c 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,7 @@  obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o partition-generic.o ioprio.o \
-			badblocks.o partitions/ blk-rq-qos.o
+			badblocks.o partitions/ blk-rq-qos.o blk-bpf.o
 
 obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_SCSI_REQUEST)	+= scsi_ioctl.o
diff --git a/block/blk-bpf.c b/block/blk-bpf.c
new file mode 100644
index 000000000000..d9e3b1caead4
--- /dev/null
+++ b/block/blk-bpf.c
@@ -0,0 +1,127 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Hou Tao <houtao1@huawei.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/fs.h>
+#include <linux/bpf_blkdev.h>
+#include <linux/blkdev.h>
+
+extern const struct file_operations def_blk_fops;
+
+static DEFINE_SPINLOCK(blkdev_bpf_lock);
+
+const struct bpf_prog_ops blkdev_prog_ops = {
+};
+
+static const struct bpf_func_proto *
+blkdev_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_get_numa_node_id:
+		return &bpf_get_numa_node_id_proto;
+	default:
+		return NULL;
+	}
+}
+
+const struct bpf_verifier_ops blkdev_verifier_ops = {
+	.get_func_proto = blkdev_prog_func_proto,
+};
+
+static struct request_queue *blkdev_rq_by_file(struct file *filp)
+{
+	struct block_device *bdev;
+
+	if (filp->f_op != &def_blk_fops)
+		return ERR_PTR(-EINVAL);
+
+	bdev = I_BDEV(filp->f_mapping->host);
+
+	return bdev->bd_queue;
+}
+
+int blkdev_bpf_prog_attach(const union bpf_attr *attr,
+		enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+	int ret = 0;
+	struct file *filp;
+	struct request_queue *rq;
+
+	filp = fget(attr->target_fd);
+	if (!filp) {
+		ret = -EINVAL;
+		goto fget_err;
+	}
+
+	rq = blkdev_rq_by_file(filp);
+	if (IS_ERR(rq)) {
+		ret = PTR_ERR(rq);
+		goto to_rq_err;
+	}
+
+	spin_lock(&blkdev_bpf_lock);
+	if (rq->prog) {
+		ret = -EBUSY;
+		goto set_prog_err;
+	}
+
+	rcu_assign_pointer(rq->prog, prog);
+
+set_prog_err:
+	spin_unlock(&blkdev_bpf_lock);
+to_rq_err:
+	fput(filp);
+fget_err:
+	return ret;
+}
+
+int blkdev_bpf_prog_detach(const union bpf_attr *attr)
+{
+	int ret = 0;
+	struct file *filp;
+	struct request_queue *rq;
+	struct bpf_prog *old_prog;
+
+	filp = fget(attr->target_fd);
+	if (!filp) {
+		ret = -EINVAL;
+		goto fget_err;
+	}
+
+	rq = blkdev_rq_by_file(filp);
+	if (IS_ERR(rq)) {
+		ret = PTR_ERR(rq);
+		goto to_rq_err;
+	}
+
+	old_prog = NULL;
+	spin_lock(&blkdev_bpf_lock);
+	if (!rq->prog) {
+		ret = -ENODATA;
+		goto clr_prog_err;
+	}
+	rcu_swap_protected(rq->prog, old_prog, 1);
+
+clr_prog_err:
+	spin_unlock(&blkdev_bpf_lock);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+to_rq_err:
+	fput(filp);
+fget_err:
+	return ret;
+}
+
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 20a49be536b5..5ac6fe6dbcd0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -26,6 +26,7 @@ 
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/filter.h>
 
 #include <trace/events/block.h>
 
@@ -584,6 +585,9 @@  static void __blk_mq_complete_request(struct request *rq)
 	struct request_queue *q = rq->q;
 	bool shared = false;
 	int cpu;
+	int ccpu;
+	int bpf_ccpu = -1;
+	struct bpf_prog *prog;
 
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 	/*
@@ -610,15 +614,25 @@  static void __blk_mq_complete_request(struct request *rq)
 		return;
 	}
 
+	rcu_read_lock();
+	prog = rcu_dereference_protected(q->prog, 1);
+	if (prog)
+		bpf_ccpu = BPF_PROG_RUN(q->prog, NULL);
+	rcu_read_unlock();
+
 	cpu = get_cpu();
-	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-		shared = cpus_share_cache(cpu, ctx->cpu);
+	if (bpf_ccpu < 0 || !cpu_online(bpf_ccpu)) {
+		ccpu = ctx->cpu;
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
+			shared = cpus_share_cache(cpu, ctx->cpu);
+	} else
+		ccpu = bpf_ccpu;
 
-	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
+	if (cpu != ccpu && !shared && cpu_online(ccpu)) {
 		rq->csd.func = __blk_mq_complete_request_remote;
 		rq->csd.info = rq;
 		rq->csd.flags = 0;
-		smp_call_function_single_async(ctx->cpu, &rq->csd);
+		smp_call_function_single_async(ccpu, &rq->csd);
 	} else {
 		q->mq_ops->complete(rq);
 	}
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 457d9ba3eb20..1139a5352a59 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -11,6 +11,7 @@ 
 #include <linux/cpu.h>
 #include <linux/sched.h>
 #include <linux/sched/topology.h>
+#include <linux/filter.h>
 
 #include "blk.h"
 
@@ -101,20 +102,32 @@  void __blk_complete_request(struct request *req)
 	int cpu, ccpu = req->mq_ctx->cpu;
 	unsigned long flags;
 	bool shared = false;
+	int bpf_ccpu = -1;
+	struct bpf_prog *prog;
 
 	BUG_ON(!q->mq_ops->complete);
 
-	local_irq_save(flags);
-	cpu = smp_processor_id();
+	rcu_read_lock();
+	prog = rcu_dereference_protected(q->prog, 1);
+	if (prog)
+		bpf_ccpu = BPF_PROG_RUN(q->prog, NULL);
+	rcu_read_unlock();
 
 	/*
-	 * Select completion CPU
+	 * Select completion CPU.
+	 * If a valid CPU number is returned by eBPF program, use it directly.
 	 */
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
-		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-			shared = cpus_share_cache(cpu, ccpu);
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	if (bpf_ccpu < 0 || !cpu_online(bpf_ccpu)) {
+		if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) &&
+			ccpu != -1) {
+			if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
+				shared = cpus_share_cache(cpu, ccpu);
+		} else
+			ccpu = cpu;
 	} else
-		ccpu = cpu;
+		ccpu = bpf_ccpu;
 
 	/*
 	 * If current CPU and requested CPU share a cache, run the softirq on
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d9db32fb75ee..849589c3c51c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -397,6 +397,8 @@  static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+struct bpf_prog;
+
 struct request_queue {
 	struct request		*last_merge;
 	struct elevator_queue	*elevator;
@@ -590,6 +592,7 @@  struct request_queue {
 
 #define BLK_MAX_WRITE_HINTS	5
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
+	struct bpf_prog __rcu *prog;
 };
 
 #define QUEUE_FLAG_STOPPED	0	/* queue is stopped */
diff --git a/include/linux/bpf_blkdev.h b/include/linux/bpf_blkdev.h
new file mode 100644
index 000000000000..0777428bc6e2
--- /dev/null
+++ b/include/linux/bpf_blkdev.h
@@ -0,0 +1,9 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_BLKDEV_H__
+#define __BPF_BLKDEV_H__
+
+extern int blkdev_bpf_prog_attach(const union bpf_attr *attr,
+		enum bpf_prog_type ptype, struct bpf_prog *prog);
+extern int blkdev_bpf_prog_detach(const union bpf_attr *attr);
+
+#endif /* !__BPF_BLKDEV_H__ */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 36a9c2325176..008facd336e5 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -38,6 +38,7 @@  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_BLKDEV, blkdev)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77c6be96d676..36aa35e29be2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -173,6 +173,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 	BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	BPF_PROG_TYPE_BLKDEV,
 };
 
 enum bpf_attach_type {
@@ -199,6 +200,7 @@  enum bpf_attach_type {
 	BPF_CGROUP_UDP6_RECVMSG,
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
+	BPF_BLKDEV_IOC_CPU,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 82eabd4e38ad..9724c0809f21 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4,6 +4,7 @@ 
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
 #include <linux/bpf_lirc.h>
+#include <linux/bpf_blkdev.h>
 #include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
@@ -1942,6 +1943,9 @@  static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SETSOCKOPT:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
 		break;
+	case BPF_BLKDEV_IOC_CPU:
+		ptype = BPF_PROG_TYPE_BLKDEV;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1966,6 +1970,9 @@  static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_BLKDEV:
+		ret = blkdev_bpf_prog_attach(attr, ptype, prog);
+		break;
 	default:
 		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 	}
@@ -2029,6 +2036,8 @@  static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SETSOCKOPT:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
 		break;
+	case BPF_BLKDEV_IOC_CPU:
+		return blkdev_bpf_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}