diff mbox series

[v5,02/11] block: Block Device Filtering Mechanism

Message ID 20230612135228.10702-3-sergei.shtepa@veeam.com (mailing list archive)
State New, archived
Headers show
Series blksnap - block devices snapshots module | expand

Commit Message

Sergei Shtepa June 12, 2023, 1:52 p.m. UTC
The block device filtering mechanism is an API that allows to attach
block device filters. Block device filters allow perform additional
processing for I/O units.

The idea of handling I/O units on block devices is not new. Back in the
2.6 kernel, there was an undocumented possibility of handling I/O units
by substituting the make_request_fn() function, which belonged to the
request_queue structure. But none of the in-tree kernel modules used
this feature, and it was eliminated in the 5.10 kernel.

The block device filtering mechanism returns the ability to handle I/O
units. It is possible to safely attach filter to a block device "on the
fly" without changing the structure of block devices stack.

Co-developed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Tested-by: Donald Buczek <buczek@molgen.mpg.de>
Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
---
 MAINTAINERS                     |   3 +
 block/Makefile                  |   3 +-
 block/bdev.c                    |   1 +
 block/blk-core.c                |  27 ++++
 block/blk-filter.c              | 213 ++++++++++++++++++++++++++++++++
 block/blk.h                     |  11 ++
 block/genhd.c                   |  10 ++
 block/ioctl.c                   |   7 ++
 block/partitions/core.c         |  10 ++
 include/linux/blk-filter.h      |  51 ++++++++
 include/linux/blk_types.h       |   2 +
 include/linux/blkdev.h          |   1 +
 include/uapi/linux/blk-filter.h |  35 ++++++
 include/uapi/linux/fs.h         |   3 +
 14 files changed, 376 insertions(+), 1 deletion(-)
 create mode 100644 block/blk-filter.c
 create mode 100644 include/linux/blk-filter.h
 create mode 100644 include/uapi/linux/blk-filter.h

Comments

Yu Kuai July 11, 2023, 2:02 a.m. UTC | #1
Hi,

在 2023/06/12 21:52, Sergei Shtepa 写道:
> The block device filtering mechanism is an API that allows to attach
> block device filters. Block device filters allow perform additional
> processing for I/O units.
> 
> The idea of handling I/O units on block devices is not new. Back in the
> 2.6 kernel, there was an undocumented possibility of handling I/O units
> by substituting the make_request_fn() function, which belonged to the
> request_queue structure. But none of the in-tree kernel modules used
> this feature, and it was eliminated in the 5.10 kernel.
> 
> The block device filtering mechanism returns the ability to handle I/O
> units. It is possible to safely attach filter to a block device "on the
> fly" without changing the structure of block devices stack.
> 
> Co-developed-by: Christoph Hellwig <hch@infradead.org>
> Signed-off-by: Christoph Hellwig <hch@infradead.org>
> Tested-by: Donald Buczek <buczek@molgen.mpg.de>
> Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
> ---
>   MAINTAINERS                     |   3 +
>   block/Makefile                  |   3 +-
>   block/bdev.c                    |   1 +
>   block/blk-core.c                |  27 ++++
>   block/blk-filter.c              | 213 ++++++++++++++++++++++++++++++++
>   block/blk.h                     |  11 ++
>   block/genhd.c                   |  10 ++
>   block/ioctl.c                   |   7 ++
>   block/partitions/core.c         |  10 ++
>   include/linux/blk-filter.h      |  51 ++++++++
>   include/linux/blk_types.h       |   2 +
>   include/linux/blkdev.h          |   1 +
>   include/uapi/linux/blk-filter.h |  35 ++++++
>   include/uapi/linux/fs.h         |   3 +
>   14 files changed, 376 insertions(+), 1 deletion(-)
>   create mode 100644 block/blk-filter.c
>   create mode 100644 include/linux/blk-filter.h
>   create mode 100644 include/uapi/linux/blk-filter.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index d801b8985b43..8336b6143a71 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3585,6 +3585,9 @@ M:	Sergei Shtepa <sergei.shtepa@veeam.com>
>   L:	linux-block@vger.kernel.org
>   S:	Supported
>   F:	Documentation/block/blkfilter.rst
> +F:	block/blk-filter.c
> +F:	include/linux/blk-filter.h
> +F:	include/uapi/linux/blk-filter.h
>   
>   BLOCK LAYER
>   M:	Jens Axboe <axboe@kernel.dk>
> diff --git a/block/Makefile b/block/Makefile
> index 46ada9dc8bbf..041c54eb0240 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -9,7 +9,8 @@ obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
>   			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
>   			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
>   			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
> -			disk-events.o blk-ia-ranges.o early-lookup.o
> +			disk-events.o blk-ia-ranges.o early-lookup.o \
> +			blk-filter.o
>   
>   obj-$(CONFIG_BOUNCE)		+= bounce.o
>   obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
> diff --git a/block/bdev.c b/block/bdev.c
> index 5c46ff107706..369f73b6097a 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -429,6 +429,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
>   		return NULL;
>   	}
>   	bdev->bd_disk = disk;
> +	bdev->bd_filter = NULL;
>   	return bdev;
>   }
>   
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 2ae22bebeb3e..ede04c6ad021 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -18,6 +18,7 @@
>   #include <linux/blkdev.h>
>   #include <linux/blk-pm.h>
>   #include <linux/blk-integrity.h>
> +#include <linux/blk-filter.h>
>   #include <linux/highmem.h>
>   #include <linux/mm.h>
>   #include <linux/pagemap.h>
> @@ -586,8 +587,24 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
>   	return BLK_STS_OK;
>   }
>   
> +static bool submit_bio_filter(struct bio *bio)
> +{
> +	if (bio_flagged(bio, BIO_FILTERED))
> +		return false;
> +
> +	bio_set_flag(bio, BIO_FILTERED);
> +	return bio->bi_bdev->bd_filter->ops->submit_bio(bio);
> +}
> +
>   static void __submit_bio(struct bio *bio)
>   {
> +	/*
> +	 * If there is a filter driver attached, check if the BIO needs to go to
> +	 * the filter driver first, which can then pass on the bio or consume it.
> +	 */
> +	if (bio->bi_bdev->bd_filter && submit_bio_filter(bio))
> +		return;
> +
>   	if (unlikely(!blk_crypto_bio_prep(&bio)))
>   		return;
>   
> @@ -677,6 +694,15 @@ static void __submit_bio_noacct_mq(struct bio *bio)
>   	current->bio_list = NULL;
>   }
>   
> +/**
> + * submit_bio_noacct_nocheck - re-submit a bio to the block device layer for I/O
> + *	from block device filter.
> + * @bio:  The bio describing the location in memory and on the device.
> + *
> + * This is a version of submit_bio() that shall only be used for I/O that is
> + * resubmitted to lower level by block device filters.  All file  systems and
> + * other upper level users of the block layer should use submit_bio() instead.
> + */
>   void submit_bio_noacct_nocheck(struct bio *bio)
>   {
>   	blk_cgroup_bio_start(bio);
> @@ -704,6 +730,7 @@ void submit_bio_noacct_nocheck(struct bio *bio)
>   	else
>   		__submit_bio_noacct(bio);
>   }
> +EXPORT_SYMBOL_GPL(submit_bio_noacct_nocheck);
>   
>   /**
>    * submit_bio_noacct - re-submit a bio to the block device layer for I/O
> diff --git a/block/blk-filter.c b/block/blk-filter.c
> new file mode 100644
> index 000000000000..bf31da6acf67
> --- /dev/null
> +++ b/block/blk-filter.c
> @@ -0,0 +1,213 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2023 Veeam Software Group GmbH */
> +#include <linux/blk-filter.h>
> +#include <linux/blk-mq.h>
> +#include <linux/module.h>
> +
> +#include "blk.h"
> +
> +static LIST_HEAD(blkfilters);
> +static DEFINE_SPINLOCK(blkfilters_lock);
> +
> +static inline struct blkfilter_operations *__blkfilter_find(const char *name)
> +{
> +	struct blkfilter_operations *ops;
> +
> +	list_for_each_entry(ops, &blkfilters, link)
> +		if (strncmp(ops->name, name, BLKFILTER_NAME_LENGTH) == 0)
> +			return ops;
> +
> +	return NULL;
> +}
> +
> +static inline struct blkfilter_operations *blkfilter_find_get(const char *name)
> +{
> +	struct blkfilter_operations *ops;
> +
> +	spin_lock(&blkfilters_lock);
> +	ops = __blkfilter_find(name);
> +	if (ops && !try_module_get(ops->owner))
> +		ops = NULL;
> +	spin_unlock(&blkfilters_lock);
> +
> +	return ops;
> +}
> +
> +int blkfilter_ioctl_attach(struct block_device *bdev,
> +		    struct blkfilter_name __user *argp)
> +{
> +	struct blkfilter_name name;
> +	struct blkfilter_operations *ops;
> +	struct blkfilter *flt;
> +	int ret;
> +
> +	if (copy_from_user(&name, argp, sizeof(name)))
> +		return -EFAULT;
> +
> +	ops = blkfilter_find_get(name.name);
> +	if (!ops)
> +		return -ENOENT;
> +
> +	ret = freeze_bdev(bdev);
> +	if (ret)
> +		goto out_put_module;
> +	blk_mq_freeze_queue(bdev->bd_queue);
> +
> +	if (bdev->bd_filter) {
> +		if (bdev->bd_filter->ops == ops)
> +			ret = -EALREADY;
> +		else
> +			ret = -EBUSY;
> +		goto out_unfreeze;
> +	}
> +
> +	flt = ops->attach(bdev);
> +	if (IS_ERR(flt)) {
> +		ret = PTR_ERR(flt);
> +		goto out_unfreeze;
> +	}
> +
> +	flt->ops = ops;
> +	bdev->bd_filter = flt;
> +
> +out_unfreeze:
> +	blk_mq_unfreeze_queue(bdev->bd_queue);
> +	thaw_bdev(bdev);
> +out_put_module:
> +	if (ret)
> +		module_put(ops->owner);
> +	return ret;
> +}
> +
> +static void __blkfilter_detach(struct block_device *bdev)
> +{
> +	struct blkfilter *flt = bdev->bd_filter;
> +	const struct blkfilter_operations *ops = flt->ops;
> +
> +	bdev->bd_filter = NULL;
> +	ops->detach(flt);
> +	module_put(ops->owner);
> +}
> +
> +void blkfilter_detach(struct block_device *bdev)
> +{
> +	if (bdev->bd_filter) {
> +		blk_mq_freeze_queue(bdev->bd_queue);
> +		__blkfilter_detach(bdev);
> +		blk_mq_unfreeze_queue(bdev->bd_queue);
> +	}
> +}
> +
> +int blkfilter_ioctl_detach(struct block_device *bdev,
> +		    struct blkfilter_name __user *argp)
> +{
> +	struct blkfilter_name name;
> +	int error = 0;
> +
> +	if (copy_from_user(&name, argp, sizeof(name)))
> +		return -EFAULT;
> +
> +	blk_mq_freeze_queue(bdev->bd_queue);
> +	if (!bdev->bd_filter) {
> +		error = -ENOENT;
> +		goto out_unfreeze;
> +	}
> +	if (strncmp(bdev->bd_filter->ops->name, name.name,
> +			BLKFILTER_NAME_LENGTH)) {
> +		error = -EINVAL;
> +		goto out_unfreeze;
> +	}
> +
> +	__blkfilter_detach(bdev);
> +out_unfreeze:
> +	blk_mq_unfreeze_queue(bdev->bd_queue);
> +	return error;
> +}
> +
> +int blkfilter_ioctl_ctl(struct block_device *bdev,
> +		    struct blkfilter_ctl __user *argp)
> +{
> +	struct blkfilter_ctl ctl;
> +	struct blkfilter *flt;
> +	int ret;
> +
> +	if (copy_from_user(&ctl, argp, sizeof(ctl)))
> +		return -EFAULT;
> +
> +	ret = blk_queue_enter(bdev_get_queue(bdev), 0);
> +	if (ret)
> +		return ret;
> +
> +	flt = bdev->bd_filter;
> +	if (!flt || strncmp(flt->ops->name, ctl.name, BLKFILTER_NAME_LENGTH)) {
> +		ret = -ENOENT;
> +		goto out_queue_exit;
> +	}
> +
> +	if (!flt->ops->ctl) {
> +		ret = -ENOTTY;
> +		goto out_queue_exit;
> +	}
> +
> +	ret = flt->ops->ctl(flt, ctl.cmd, u64_to_user_ptr(ctl.opt),
> +			    &ctl.optlen);
> +out_queue_exit:
> +	blk_queue_exit(bdev_get_queue(bdev));
> +	return ret;
> +}
> +
> +ssize_t blkfilter_show(struct block_device *bdev, char *buf)
> +{
> +	ssize_t ret = 0;
> +
> +	blk_mq_freeze_queue(bdev->bd_queue);
> +	if (bdev->bd_filter)
> +		ret = sprintf(buf, "%s\n", bdev->bd_filter->ops->name);
> +	else
> +		ret = sprintf(buf, "\n");
> +	blk_mq_unfreeze_queue(bdev->bd_queue);
> +
> +	return ret;
> +}
> +
> +/**
> + * blkfilter_register() - Register block device filter operations
> + * @ops:	The operations to register.
> + *
> + * Return:
> + *	0 if succeeded,
> + *	-EBUSY if a block device filter with the same name is already
> + *	registered.
> + */
> +int blkfilter_register(struct blkfilter_operations *ops)
> +{
> +	struct blkfilter_operations *found;
> +	int ret = 0;
> +
> +	spin_lock(&blkfilters_lock);
> +	found = __blkfilter_find(ops->name);
> +	if (found)
> +		ret = -EBUSY;
> +	else
> +		list_add_tail(&ops->link, &blkfilters);
> +	spin_unlock(&blkfilters_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(blkfilter_register);
> +
> +/**
> + * blkfilter_unregister() - Unregister block device filter operations
> + * @ops:	The operations to unregister.
> + *
> + * Important: before unloading, it is necessary to detach the filter from all
> + * block devices.
> + *
> + */
> +void blkfilter_unregister(struct blkfilter_operations *ops)
> +{
> +	spin_lock(&blkfilters_lock);
> +	list_del(&ops->link);
> +	spin_unlock(&blkfilters_lock);
> +}
> +EXPORT_SYMBOL_GPL(blkfilter_unregister);
> diff --git a/block/blk.h b/block/blk.h
> index 9582fcd0df41..2c14fa938d8c 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -7,6 +7,8 @@
>   #include <xen/xen.h>
>   #include "blk-crypto-internal.h"
>   
> +struct blkfilter_ctl;
> +struct blkfilter_name;
>   struct elevator_type;
>   
>   /* Max future timer expiry for timeouts */
> @@ -454,6 +456,15 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
>   
>   extern const struct address_space_operations def_blk_aops;
>   
> +int blkfilter_ioctl_attach(struct block_device *bdev,
> +		    struct blkfilter_name __user *argp);
> +int blkfilter_ioctl_detach(struct block_device *bdev,
> +		    struct blkfilter_name __user *argp);
> +int blkfilter_ioctl_ctl(struct block_device *bdev,
> +		    struct blkfilter_ctl __user *argp);
> +void blkfilter_detach(struct block_device *bdev);
> +ssize_t blkfilter_show(struct block_device *bdev, char *buf);
> +
>   int disk_register_independent_access_ranges(struct gendisk *disk);
>   void disk_unregister_independent_access_ranges(struct gendisk *disk);
>   
> diff --git a/block/genhd.c b/block/genhd.c
> index 4e5fd6aaa883..d9aca0797886 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -25,6 +25,7 @@
>   #include <linux/pm_runtime.h>
>   #include <linux/badblocks.h>
>   #include <linux/part_stat.h>
> +#include <linux/blk-filter.h>
>   #include "blk-throttle.h"
>   
>   #include "blk.h"
> @@ -648,6 +649,7 @@ void del_gendisk(struct gendisk *disk)
>   		remove_inode_hash(part->bd_inode);
>   		fsync_bdev(part);
>   		__invalidate_device(part, true);
> +		blkfilter_detach(part);
>   	}
>   	mutex_unlock(&disk->open_mutex);
>   
> @@ -1033,6 +1035,12 @@ static ssize_t diskseq_show(struct device *dev,
>   	return sprintf(buf, "%llu\n", disk->diskseq);
>   }
>   
> +static ssize_t disk_filter_show(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	return blkfilter_show(dev_to_bdev(dev), buf);
> +}
> +
>   static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
>   static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
>   static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
> @@ -1046,6 +1054,7 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
>   static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
>   static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
>   static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
> +static DEVICE_ATTR(filter, 0444, disk_filter_show, NULL);
>   
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   ssize_t part_fail_show(struct device *dev,
> @@ -1092,6 +1101,7 @@ static struct attribute *disk_attrs[] = {
>   	&dev_attr_events_async.attr,
>   	&dev_attr_events_poll_msecs.attr,
>   	&dev_attr_diskseq.attr,
> +	&dev_attr_filter.attr,
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   	&dev_attr_fail.attr,
>   #endif
> diff --git a/block/ioctl.c b/block/ioctl.c
> index c7d7d4345edb..170020d1ce0e 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -2,6 +2,7 @@
>   #include <linux/capability.h>
>   #include <linux/compat.h>
>   #include <linux/blkdev.h>
> +#include <linux/blk-filter.h>
>   #include <linux/export.h>
>   #include <linux/gfp.h>
>   #include <linux/blkpg.h>
> @@ -546,6 +547,12 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
>   		return blkdev_pr_preempt(bdev, argp, true);
>   	case IOC_PR_CLEAR:
>   		return blkdev_pr_clear(bdev, argp);
> +	case BLKFILTER_ATTACH:
> +		return blkfilter_ioctl_attach(bdev, argp);
> +	case BLKFILTER_DETACH:
> +		return blkfilter_ioctl_detach(bdev, argp);
> +	case BLKFILTER_CTL:
> +		return blkfilter_ioctl_ctl(bdev, argp);
>   	default:
>   		return -ENOIOCTLCMD;
>   	}
> diff --git a/block/partitions/core.c b/block/partitions/core.c
> index 87a21942d606..8e2834566c38 100644
> --- a/block/partitions/core.c
> +++ b/block/partitions/core.c
> @@ -10,6 +10,7 @@
>   #include <linux/ctype.h>
>   #include <linux/vmalloc.h>
>   #include <linux/raid/detect.h>
> +#include <linux/blk-filter.h>
>   #include "check.h"
>   
>   static int (*const check_part[])(struct parsed_partitions *) = {
> @@ -200,6 +201,12 @@ static ssize_t part_discard_alignment_show(struct device *dev,
>   	return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
>   }
>   
> +static ssize_t part_filter_show(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	return blkfilter_show(dev_to_bdev(dev), buf);
> +}
> +
>   static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
>   static DEVICE_ATTR(start, 0444, part_start_show, NULL);
>   static DEVICE_ATTR(size, 0444, part_size_show, NULL);
> @@ -208,6 +215,7 @@ static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
>   static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
>   static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
>   static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
> +static DEVICE_ATTR(filter, 0444, part_filter_show, NULL);
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   static struct device_attribute dev_attr_fail =
>   	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
> @@ -222,6 +230,7 @@ static struct attribute *part_attrs[] = {
>   	&dev_attr_discard_alignment.attr,
>   	&dev_attr_stat.attr,
>   	&dev_attr_inflight.attr,
> +	&dev_attr_filter.attr,
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   	&dev_attr_fail.attr,
>   #endif
> @@ -284,6 +293,7 @@ static void delete_partition(struct block_device *part)
>   
>   	fsync_bdev(part);
>   	__invalidate_device(part, true);
> +	blkfilter_detach(part);

bdev_disk_changed() is not handled, where delete_partition() and
add_partition() will be called, this means blkfilter for partiton will
be removed after partition rescan. Am I missing something?

Thanks,
Kuai
>   
>   	drop_partition(part);
>   }
> diff --git a/include/linux/blk-filter.h b/include/linux/blk-filter.h
> new file mode 100644
> index 000000000000..0afdb40f3bab
> --- /dev/null
> +++ b/include/linux/blk-filter.h
> @@ -0,0 +1,51 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Copyright (C) 2023 Veeam Software Group GmbH */
> +#ifndef _LINUX_BLK_FILTER_H
> +#define _LINUX_BLK_FILTER_H
> +
> +#include <uapi/linux/blk-filter.h>
> +
> +struct bio;
> +struct block_device;
> +struct blkfilter_operations;
> +
> +/**
> + * struct blkfilter - Block device filter.
> + *
> + * @ops:	Block device filter operations.
> + *
> + * For each filtered block device, the filter creates a data structure
> + * associated with this device. The data in this structure is specific to the
> + * filter, but it must contain a pointer to the block device filter account.
> + */
> +struct blkfilter {
> +	const struct blkfilter_operations *ops;
> +};
> +
> +/**
> + * struct blkfilter_operations - Block device filter operations.
> + *
> + * @link:	Entry in the global list of filter drivers
> + *		(must not be accessed by the driver).
> + * @owner:	Module implementing the filter driver.
> + * @name:	Name of the filter driver.
> + * @attach:	Attach the filter driver to the block device.
> + * @detach:	Detach the filter driver from the block device.
> + * @ctl:	Send a control command to the filter driver.
> + * @submit_bio:	Handle bio submissions to the filter driver.
> + */
> +struct blkfilter_operations {
> +	struct list_head link;
> +	struct module *owner;
> +	const char *name;
> +	struct blkfilter *(*attach)(struct block_device *bdev);
> +	void (*detach)(struct blkfilter *flt);
> +	int (*ctl)(struct blkfilter *flt, const unsigned int cmd,
> +		   __u8 __user *buf, __u32 *plen);
> +	bool (*submit_bio)(struct bio *bio);
> +};
> +
> +int blkfilter_register(struct blkfilter_operations *ops);
> +void blkfilter_unregister(struct blkfilter_operations *ops);
> +
> +#endif /* _UAPI_LINUX_BLK_FILTER_H */
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index deb69eeab6bd..5ba313b3b11d 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -75,6 +75,7 @@ struct block_device {
>   	 * path
>   	 */
>   	struct device		bd_device;
> +	struct blkfilter	*bd_filter;
>   } __randomize_layout;
>   
>   #define bdev_whole(_bdev) \
> @@ -341,6 +342,7 @@ enum {
>   	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
>   	BIO_REMAPPED,
>   	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
> +	BIO_FILTERED,		/* bio has already been filtered */
>   	BIO_FLAG_LAST
>   };
>   
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index f4c339d9dd03..e7a4a4866792 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -842,6 +842,7 @@ void blk_request_module(dev_t devt);
>   
>   extern int blk_register_queue(struct gendisk *disk);
>   extern void blk_unregister_queue(struct gendisk *disk);
> +void submit_bio_noacct_nocheck(struct bio *bio);
>   void submit_bio_noacct(struct bio *bio);
>   struct bio *bio_split_to_limits(struct bio *bio);
>   
> diff --git a/include/uapi/linux/blk-filter.h b/include/uapi/linux/blk-filter.h
> new file mode 100644
> index 000000000000..18885dc1b717
> --- /dev/null
> +++ b/include/uapi/linux/blk-filter.h
> @@ -0,0 +1,35 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/* Copyright (C) 2023 Veeam Software Group GmbH */
> +#ifndef _UAPI_LINUX_BLK_FILTER_H
> +#define _UAPI_LINUX_BLK_FILTER_H
> +
> +#include <linux/types.h>
> +
> +#define BLKFILTER_NAME_LENGTH	32
> +
> +/**
> + * struct blkfilter_name - parameter for BLKFILTER_ATTACH and BLKFILTER_DETACH
> + *      ioctl.
> + *
> + * @name:       Name of block device filter.
> + */
> +struct blkfilter_name {
> +	__u8 name[BLKFILTER_NAME_LENGTH];
> +};
> +
> +/**
> + * struct blkfilter_ctl - parameter for BLKFILTER_CTL ioctl
> + *
> + * @name:	Name of block device filter.
> + * @cmd:	The filter-specific operation code of the command.
> + * @optlen:	Size of data at @opt.
> + * @opt:	Userspace buffer with options.
> + */
> +struct blkfilter_ctl {
> +	__u8 name[BLKFILTER_NAME_LENGTH];
> +	__u32 cmd;
> +	__u32 optlen;
> +	__u64 opt;
> +};
> +
> +#endif /* _UAPI_LINUX_BLK_FILTER_H */
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index b7b56871029c..7904f157b245 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -189,6 +189,9 @@ struct fsxattr {
>    * A jump here: 130-136 are reserved for zoned block devices
>    * (see uapi/linux/blkzoned.h)
>    */
> +#define BLKFILTER_ATTACH	_IOWR(0x12, 140, struct blkfilter_name)
> +#define BLKFILTER_DETACH	_IOWR(0x12, 141, struct blkfilter_name)
> +#define BLKFILTER_CTL		_IOWR(0x12, 142, struct blkfilter_ctl)
>   
>   #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
>   #define FIBMAP	   _IO(0x00,1)	/* bmap access */
>
Yu Kuai July 12, 2023, 10:04 a.m. UTC | #2
Hi,

在 2023/07/11 10:02, Yu Kuai 写道:

>> +static bool submit_bio_filter(struct bio *bio)
>> +{
>> +    if (bio_flagged(bio, BIO_FILTERED))
>> +        return false;
>> +
>> +    bio_set_flag(bio, BIO_FILTERED);
>> +    return bio->bi_bdev->bd_filter->ops->submit_bio(bio);
>> +}
>> +
>>   static void __submit_bio(struct bio *bio)
>>   {
>> +    /*
>> +     * If there is a filter driver attached, check if the BIO needs 
>> to go to
>> +     * the filter driver first, which can then pass on the bio or 
>> consume it.
>> +     */
>> +    if (bio->bi_bdev->bd_filter && submit_bio_filter(bio))
>> +        return;
>> +
>>       if (unlikely(!blk_crypto_bio_prep(&bio)))
>>           return;

...

>> +static void __blkfilter_detach(struct block_device *bdev)
>> +{
>> +    struct blkfilter *flt = bdev->bd_filter;
>> +    const struct blkfilter_operations *ops = flt->ops;
>> +
>> +    bdev->bd_filter = NULL;
>> +    ops->detach(flt);
>> +    module_put(ops->owner);
>> +}
>> +
>> +void blkfilter_detach(struct block_device *bdev)
>> +{
>> +    if (bdev->bd_filter) {
>> +        blk_mq_freeze_queue(bdev->bd_queue);

And this is not sate as well, for bio-based device, q_usage_counter is
not grabbed while submit_bio_filter() is called, hence there is a risk
of uaf from submit_bio_filter().

Thanks,
Kuai
Yu Kuai July 12, 2023, 12:34 p.m. UTC | #3
Hi,

在 2023/07/12 18:04, Yu Kuai 写道:
> Hi,
> 
> 在 2023/07/11 10:02, Yu Kuai 写道:
> 
>>> +static bool submit_bio_filter(struct bio *bio)
>>> +{
>>> +    if (bio_flagged(bio, BIO_FILTERED))
>>> +        return false;
>>> +
>>> +    bio_set_flag(bio, BIO_FILTERED);
>>> +    return bio->bi_bdev->bd_filter->ops->submit_bio(bio);
>>> +}
>>> +
>>>   static void __submit_bio(struct bio *bio)
>>>   {
>>> +    /*
>>> +     * If there is a filter driver attached, check if the BIO needs 
>>> to go to
>>> +     * the filter driver first, which can then pass on the bio or 
>>> consume it.
>>> +     */
>>> +    if (bio->bi_bdev->bd_filter && submit_bio_filter(bio))
>>> +        return;
>>> +
>>>       if (unlikely(!blk_crypto_bio_prep(&bio)))
>>>           return;
> 
> ...
> 
>>> +static void __blkfilter_detach(struct block_device *bdev)
>>> +{
>>> +    struct blkfilter *flt = bdev->bd_filter;
>>> +    const struct blkfilter_operations *ops = flt->ops;
>>> +
>>> +    bdev->bd_filter = NULL;
>>> +    ops->detach(flt);
>>> +    module_put(ops->owner);
>>> +}
>>> +
>>> +void blkfilter_detach(struct block_device *bdev)
>>> +{
>>> +    if (bdev->bd_filter) {
>>> +        blk_mq_freeze_queue(bdev->bd_queue);
> 
> And this is not sate as well, for bio-based device, q_usage_counter is
> not grabbed while submit_bio_filter() is called, hence there is a risk
> of uaf from submit_bio_filter().

And there is another question, can blkfilter_detach() from
del_gendisk/delete_partiton and ioctl concurrent? I think it's a
problem.

Thanks,
Kuai
> 
> Thanks,
> Kuai
> 
> .
>
Sergei Shtepa July 17, 2023, 2:39 p.m. UTC | #4
On 7/11/23 04:02, Yu Kuai wrote:
> bdev_disk_changed() is not handled, where delete_partition() and
> add_partition() will be called, this means blkfilter for partiton will
> be removed after partition rescan. Am I missing something?

Yes, when the bdev_disk_changed() is called, all disk block devices
are deleted and new ones are re-created. Therefore, the information
about the attached filters will be lost. This is equivalent to
removing the disk and adding it back.

For the blksnap module, partition rescan will mean the loss of the
change trackers data. If a snapshot was created, then such
a partition rescan will cause the snapshot to be corrupted.

There was an idea to do filtering at the disk level,
but I abandoned it.
Sergei Shtepa July 17, 2023, 4:22 p.m. UTC | #5
On 7/12/23 12:04, Yu Kuai wrote:
> Subject:
> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
> From:
> Yu Kuai <yukuai1@huaweicloud.com>
> Date:
> 7/12/23, 12:04
> 
> To:
> Yu Kuai <yukuai1@huaweicloud.com>, Sergei Shtepa <sergei.shtepa@veeam.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
> CC:
> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
> 
> 
> Hi,
> 
> 在 2023/07/11 10:02, Yu Kuai 写道:
> 
>>> +static bool submit_bio_filter(struct bio *bio)
>>> +{
>>> +    if (bio_flagged(bio, BIO_FILTERED))
>>> +        return false;
>>> +
>>> +    bio_set_flag(bio, BIO_FILTERED);
>>> +    return bio->bi_bdev->bd_filter->ops->submit_bio(bio);
>>> +}
>>> +
>>>   static void __submit_bio(struct bio *bio)
>>>   {
>>> +    /*
>>> +     * If there is a filter driver attached, check if the BIO needs to go to
>>> +     * the filter driver first, which can then pass on the bio or consume it.
>>> +     */
>>> +    if (bio->bi_bdev->bd_filter && submit_bio_filter(bio))
>>> +        return;
>>> +
>>>       if (unlikely(!blk_crypto_bio_prep(&bio)))
>>>           return;
> 
> ...
> 
>>> +static void __blkfilter_detach(struct block_device *bdev)
>>> +{
>>> +    struct blkfilter *flt = bdev->bd_filter;
>>> +    const struct blkfilter_operations *ops = flt->ops;
>>> +
>>> +    bdev->bd_filter = NULL;
>>> +    ops->detach(flt);
>>> +    module_put(ops->owner);
>>> +}
>>> +
>>> +void blkfilter_detach(struct block_device *bdev)
>>> +{
>>> +    if (bdev->bd_filter) {
>>> +        blk_mq_freeze_queue(bdev->bd_queue);
> 
> And this is not sate as well, for bio-based device, q_usage_counter is
> not grabbed while submit_bio_filter() is called, hence there is a risk
> of uaf from submit_bio_filter().
> 
> Thanks,
> Kuai
> 

Hi Kuai.

Indeed, the filter call is performed before calling bio_queue_enter().
I must admit, you are very attentive. 

I didn't keep track of the change in the position of the
bio_queue_enter() function deeper on the stack.

I think I need to add a check for q_usage_counter, for the debug build.
So that I don't miss it in the future.
Sergei Shtepa July 17, 2023, 5:39 p.m. UTC | #6
Hi.

On 7/12/23 14:34, Yu Kuai wrote:
> Subject:
> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
> From:
> Yu Kuai <yukuai1@huaweicloud.com>
> Date:
> 7/12/23, 14:34
> 
> To:
> Yu Kuai <yukuai1@huaweicloud.com>, Sergei Shtepa <sergei.shtepa@veeam.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
> CC:
> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
> 
> 
> Hi,
> 
> 在 2023/07/12 18:04, Yu Kuai 写道:
>> Hi,
>>
>> 在 2023/07/11 10:02, Yu Kuai 写道:
>>
>>>> +static bool submit_bio_filter(struct bio *bio)
>>>> +{
>>>> +    if (bio_flagged(bio, BIO_FILTERED))
>>>> +        return false;
>>>> +
>>>> +    bio_set_flag(bio, BIO_FILTERED);
>>>> +    return bio->bi_bdev->bd_filter->ops->submit_bio(bio);
>>>> +}
>>>> +
>>>>   static void __submit_bio(struct bio *bio)
>>>>   {
>>>> +    /*
>>>> +     * If there is a filter driver attached, check if the BIO needs to go to
>>>> +     * the filter driver first, which can then pass on the bio or consume it.
>>>> +     */
>>>> +    if (bio->bi_bdev->bd_filter && submit_bio_filter(bio))
>>>> +        return;
>>>> +
>>>>       if (unlikely(!blk_crypto_bio_prep(&bio)))
>>>>           return;
>>
>> ...
>>
>>>> +static void __blkfilter_detach(struct block_device *bdev)
>>>> +{
>>>> +    struct blkfilter *flt = bdev->bd_filter;
>>>> +    const struct blkfilter_operations *ops = flt->ops;
>>>> +
>>>> +    bdev->bd_filter = NULL;
>>>> +    ops->detach(flt);
>>>> +    module_put(ops->owner);
>>>> +}
>>>> +
>>>> +void blkfilter_detach(struct block_device *bdev)
>>>> +{
>>>> +    if (bdev->bd_filter) {
>>>> +        blk_mq_freeze_queue(bdev->bd_queue);
>>
>> And this is not sate as well, for bio-based device, q_usage_counter is
>> not grabbed while submit_bio_filter() is called, hence there is a risk
>> of uaf from submit_bio_filter().
> 
> And there is another question, can blkfilter_detach() from
> del_gendisk/delete_partiton and ioctl concurrent? I think it's a
> problem.
> 

Yes, it looks like if two threads execute the blkfilter_detach() function,
then a problem is possible. The blk_mq_freeze_queue() function does not
block threads.
But for this, it is necessary that the IOCTL for the block device and
its removal are performed simultaneously. Is this possible?

I suppose that using mutex bdev->bd_disk->open_mutex in
blkfilter_ioctl_attach(), blkfilter_ioctl_detach() and
blkfilter_ioctl_ctl() can fix the problem. What do you think?


> Thanks,
> Kuai
>>
>> Thanks,
>> Kuai
>>
>> .
>>
>
Yu Kuai July 18, 2023, 1:21 a.m. UTC | #7
Hi,

在 2023/07/18 1:39, Sergei Shtepa 写道:
> Hi.
> 
> On 7/12/23 14:34, Yu Kuai wrote:
>> Subject:
>> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
>> From:
>> Yu Kuai <yukuai1@huaweicloud.com>
>> Date:
>> 7/12/23, 14:34
>>
>> To:
>> Yu Kuai <yukuai1@huaweicloud.com>, Sergei Shtepa <sergei.shtepa@veeam.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
>> CC:
>> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
>>
>>
>> Hi,
>>
>> 在 2023/07/12 18:04, Yu Kuai 写道:
>>> Hi,
>>>
>>> 在 2023/07/11 10:02, Yu Kuai 写道:
>>>
>>>>> +static bool submit_bio_filter(struct bio *bio)
>>>>> +{
>>>>> +    if (bio_flagged(bio, BIO_FILTERED))
>>>>> +        return false;
>>>>> +
>>>>> +    bio_set_flag(bio, BIO_FILTERED);
>>>>> +    return bio->bi_bdev->bd_filter->ops->submit_bio(bio);
>>>>> +}
>>>>> +
>>>>>    static void __submit_bio(struct bio *bio)
>>>>>    {
>>>>> +    /*
>>>>> +     * If there is a filter driver attached, check if the BIO needs to go to
>>>>> +     * the filter driver first, which can then pass on the bio or consume it.
>>>>> +     */
>>>>> +    if (bio->bi_bdev->bd_filter && submit_bio_filter(bio))
>>>>> +        return;
>>>>> +
>>>>>        if (unlikely(!blk_crypto_bio_prep(&bio)))
>>>>>            return;
>>>
>>> ...
>>>
>>>>> +static void __blkfilter_detach(struct block_device *bdev)
>>>>> +{
>>>>> +    struct blkfilter *flt = bdev->bd_filter;
>>>>> +    const struct blkfilter_operations *ops = flt->ops;
>>>>> +
>>>>> +    bdev->bd_filter = NULL;
>>>>> +    ops->detach(flt);
>>>>> +    module_put(ops->owner);
>>>>> +}
>>>>> +
>>>>> +void blkfilter_detach(struct block_device *bdev)
>>>>> +{
>>>>> +    if (bdev->bd_filter) {
>>>>> +        blk_mq_freeze_queue(bdev->bd_queue);
>>>
>>> And this is not sate as well, for bio-based device, q_usage_counter is
>>> not grabbed while submit_bio_filter() is called, hence there is a risk
>>> of uaf from submit_bio_filter().
>>
>> And there is another question, can blkfilter_detach() from
>> del_gendisk/delete_partiton and ioctl concurrent? I think it's a
>> problem.
>>
> 
> Yes, it looks like if two threads execute the blkfilter_detach() function,
> then a problem is possible. The blk_mq_freeze_queue() function does not
> block threads.
> But for this, it is necessary that the IOCTL for the block device and
> its removal are performed simultaneously. Is this possible?

I think it's possible, ioctl only requires to open the disk/partition,
and it won't block disk removal:

t1:			t2:

open dev
ioctl
			remove dev
			 del_gendisk
  blkfilter_detach	  blkfilter_detach

> 
> I suppose that using mutex bdev->bd_disk->open_mutex in
> blkfilter_ioctl_attach(), blkfilter_ioctl_detach() and
> blkfilter_ioctl_ctl() can fix the problem. What do you think?

I think it's ok, and blkfilter ioctl must check disk_live() while
holding the lock.

Thanks,
Kuai

> 
> 
>> Thanks,
>> Kuai
>>>
>>> Thanks,
>>> Kuai
>>>
>>> .
>>>
>>
> .
>
Yu Kuai July 18, 2023, 1:37 a.m. UTC | #8
Hi,

在 2023/07/17 22:39, Sergei Shtepa 写道:
> 
> 
> On 7/11/23 04:02, Yu Kuai wrote:
>> bdev_disk_changed() is not handled, where delete_partition() and
>> add_partition() will be called, this means blkfilter for partiton will
>> be removed after partition rescan. Am I missing something?
> 
> Yes, when the bdev_disk_changed() is called, all disk block devices
> are deleted and new ones are re-created. Therefore, the information
> about the attached filters will be lost. This is equivalent to
> removing the disk and adding it back.
> 
> For the blksnap module, partition rescan will mean the loss of the
> change trackers data. If a snapshot was created, then such
> a partition rescan will cause the snapshot to be corrupted.
> 

I haven't review blksnap code yet, but this sounds like a problem.

possible solutions I have in mind:

1. Store blkfilter for each partition from bdev_disk_changed() before
delete_partition(), and add blkfilter back after add_partition().

2. Store blkfilter from gendisk as a xarray, and protect it by
'open_mutex' like 'part_tbl', block_device can keep the pointer to
reference blkfilter so that performance from fast path is ok, and the
lifetime of blkfiter can be managed separately.

> There was an idea to do filtering at the disk level,
> but I abandoned it.
> .
> 
I think it's better to do filtering at the partition level as well.

Thanks,
Kuai
Sergei Shtepa July 18, 2023, 11:25 a.m. UTC | #9
Hi.

On 7/18/23 03:37, Yu Kuai wrote:
> Subject:
> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
> From:
> Yu Kuai <yukuai1@huaweicloud.com>
> Date:
> 7/18/23, 03:37
> 
> To:
> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
> CC:
> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
> 
> 
> Hi,
> 
> 在 2023/07/17 22:39, Sergei Shtepa 写道:
>>
>>
>> On 7/11/23 04:02, Yu Kuai wrote:
>>> bdev_disk_changed() is not handled, where delete_partition() and
>>> add_partition() will be called, this means blkfilter for partiton will
>>> be removed after partition rescan. Am I missing something?
>>
>> Yes, when the bdev_disk_changed() is called, all disk block devices
>> are deleted and new ones are re-created. Therefore, the information
>> about the attached filters will be lost. This is equivalent to
>> removing the disk and adding it back.
>>
>> For the blksnap module, partition rescan will mean the loss of the
>> change trackers data. If a snapshot was created, then such
>> a partition rescan will cause the snapshot to be corrupted.
>>
> 
> I haven't review blksnap code yet, but this sounds like a problem.

I can't imagine a case where this could be a problem.
Partition rescan is possible only if the file system has not been
mounted on any of the disk partitions. Ioctl BLKRRPART will return
-EBUSY. Therefore, during normal operation of the system, rescan is
not performed.
And if the file systems have not been mounted, it is possible that
the disk partition structure has changed or the disk in the media
device has changed. In this case, it is better to detach the
filter, otherwise it may lead to incorrect operation of the module.

We can add prechange/postchange callback functions so that the
filter can track rescan process. But at the moment, this is not
necessary for the blksnap module. 

Therefore, I will refrain from making changes for now.

> 
> possible solutions I have in mind:
> 
> 1. Store blkfilter for each partition from bdev_disk_changed() before
> delete_partition(), and add blkfilter back after add_partition().
> 
> 2. Store blkfilter from gendisk as a xarray, and protect it by
> 'open_mutex' like 'part_tbl', block_device can keep the pointer to
> reference blkfilter so that performance from fast path is ok, and the
> lifetime of blkfiter can be managed separately.
> 
>> There was an idea to do filtering at the disk level,
>> but I abandoned it.
>> .
>>
> I think it's better to do filtering at the partition level as well.
> 
> Thanks,
> Kuai
>
Yu Kuai July 18, 2023, 12:32 p.m. UTC | #10
Hi,

在 2023/07/18 19:25, Sergei Shtepa 写道:
> Hi.
> 
> On 7/18/23 03:37, Yu Kuai wrote:
>> Subject:
>> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
>> From:
>> Yu Kuai <yukuai1@huaweicloud.com>
>> Date:
>> 7/18/23, 03:37
>>
>> To:
>> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
>> CC:
>> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
>>
>>
>> Hi,
>>
>> 在 2023/07/17 22:39, Sergei Shtepa 写道:
>>>
>>>
>>> On 7/11/23 04:02, Yu Kuai wrote:
>>>> bdev_disk_changed() is not handled, where delete_partition() and
>>>> add_partition() will be called, this means blkfilter for partiton will
>>>> be removed after partition rescan. Am I missing something?
>>>
>>> Yes, when the bdev_disk_changed() is called, all disk block devices
>>> are deleted and new ones are re-created. Therefore, the information
>>> about the attached filters will be lost. This is equivalent to
>>> removing the disk and adding it back.
>>>
>>> For the blksnap module, partition rescan will mean the loss of the
>>> change trackers data. If a snapshot was created, then such
>>> a partition rescan will cause the snapshot to be corrupted.
>>>
>>
>> I haven't review blksnap code yet, but this sounds like a problem.
> 
> I can't imagine a case where this could be a problem.
> Partition rescan is possible only if the file system has not been
> mounted on any of the disk partitions. Ioctl BLKRRPART will return
> -EBUSY. Therefore, during normal operation of the system, rescan is
> not performed.
> And if the file systems have not been mounted, it is possible that
> the disk partition structure has changed or the disk in the media
> device has changed. In this case, it is better to detach the
> filter, otherwise it may lead to incorrect operation of the module.
> 
> We can add prechange/postchange callback functions so that the
> filter can track rescan process. But at the moment, this is not
> necessary for the blksnap module.

So you mean that blkfilter is only used for the case that partition
is mounted? (Or you mean that partition is opened)

Then, I think you mean that filter should only be used for the partition
that is opended? Otherwise, filter can be gone at any time since
partition rescan can be gone.

//user
1. attach filter
		// other context rescan partition
2. mount fs
// user will found filter is gone.

Thanks,
Kuai

> 
> Therefore, I will refrain from making changes for now.
> 
>>
>> possible solutions I have in mind:
>>
>> 1. Store blkfilter for each partition from bdev_disk_changed() before
>> delete_partition(), and add blkfilter back after add_partition().
>>
>> 2. Store blkfilter from gendisk as a xarray, and protect it by
>> 'open_mutex' like 'part_tbl', block_device can keep the pointer to
>> reference blkfilter so that performance from fast path is ok, and the
>> lifetime of blkfiter can be managed separately.
>>
>>> There was an idea to do filtering at the disk level,
>>> but I abandoned it.
>>> .
>>>
>> I think it's better to do filtering at the partition level as well.
>>
>> Thanks,
>> Kuai
>>
> .
>
Sergei Shtepa July 18, 2023, 4:33 p.m. UTC | #11
On 7/18/23 14:32, Yu Kuai wrote:
> Subject:
> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
> From:
> Yu Kuai <yukuai1@huaweicloud.com>
> Date:
> 7/18/23, 14:32
> 
> To:
> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
> CC:
> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
> 
> 
> Hi,
> 
> 在 2023/07/18 19:25, Sergei Shtepa 写道:
>> Hi.
>>
>> On 7/18/23 03:37, Yu Kuai wrote:
>>> Subject:
>>> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
>>> From:
>>> Yu Kuai <yukuai1@huaweicloud.com>
>>> Date:
>>> 7/18/23, 03:37
>>>
>>> To:
>>> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
>>> CC:
>>> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
>>>
>>>
>>> Hi,
>>>
>>> 在 2023/07/17 22:39, Sergei Shtepa 写道:
>>>>
>>>>
>>>> On 7/11/23 04:02, Yu Kuai wrote:
>>>>> bdev_disk_changed() is not handled, where delete_partition() and
>>>>> add_partition() will be called, this means blkfilter for partiton will
>>>>> be removed after partition rescan. Am I missing something?
>>>>
>>>> Yes, when the bdev_disk_changed() is called, all disk block devices
>>>> are deleted and new ones are re-created. Therefore, the information
>>>> about the attached filters will be lost. This is equivalent to
>>>> removing the disk and adding it back.
>>>>
>>>> For the blksnap module, partition rescan will mean the loss of the
>>>> change trackers data. If a snapshot was created, then such
>>>> a partition rescan will cause the snapshot to be corrupted.
>>>>
>>>
>>> I haven't review blksnap code yet, but this sounds like a problem.
>>
>> I can't imagine a case where this could be a problem.
>> Partition rescan is possible only if the file system has not been
>> mounted on any of the disk partitions. Ioctl BLKRRPART will return
>> -EBUSY. Therefore, during normal operation of the system, rescan is
>> not performed.
>> And if the file systems have not been mounted, it is possible that
>> the disk partition structure has changed or the disk in the media
>> device has changed. In this case, it is better to detach the
>> filter, otherwise it may lead to incorrect operation of the module.
>>
>> We can add prechange/postchange callback functions so that the
>> filter can track rescan process. But at the moment, this is not
>> necessary for the blksnap module.
> 
> So you mean that blkfilter is only used for the case that partition
> is mounted? (Or you mean that partition is opened)
> 
> Then, I think you mean that filter should only be used for the partition
> that is opended? Otherwise, filter can be gone at any time since
> partition rescan can be gone.
> 
> //user
> 1. attach filter
>         // other context rescan partition
> 2. mount fs
> // user will found filter is gone.

Mmm...  The fact is that at the moment the user of the filter is the
blksnap module. There are no other filter users yet. The blksnap module
solves the problem of creating snapshots, primarily for backup purposes.
Therefore, the main use case is to attach a filter for an already running
system, where all partitions are marked up, file systems are mounted.

If the server is being serviced, during which the disk is being
re-partitioned, then disabling the filter is normal. In this case, the
change tracker will be reset, and at the next backup, the filter will be
attached again.

But if I were still solving the problem of saving the filter when rescanning,
then it is necessary to take into account the UUID and name of the partition
(struct partition_meta_info). It is unacceptable that due to a change in the
structure of partitions, the filter is attached to another partition by mistake.
The changed() callback would also be good to add so that the filter receives
a notification that the block device has been updated.

But I'm not sure that this should be done, since if some code is not used in
the kernel, then it should not be in the kernel.

> 
> Thanks,
> Kuai
> 
>>
>> Therefore, I will refrain from making changes for now.
>>
>>>
>>> possible solutions I have in mind:
>>>
>>> 1. Store blkfilter for each partition from bdev_disk_changed() before
>>> delete_partition(), and add blkfilter back after add_partition().
>>>
>>> 2. Store blkfilter from gendisk as a xarray, and protect it by
>>> 'open_mutex' like 'part_tbl', block_device can keep the pointer to
>>> reference blkfilter so that performance from fast path is ok, and the
>>> lifetime of blkfiter can be managed separately.
>>>
>>>> There was an idea to do filtering at the disk level,
>>>> but I abandoned it.
>>>> .
>>>>
>>> I think it's better to do filtering at the partition level as well.
>>>
>>> Thanks,
>>> Kuai
>>>
>> .
>>
>
Yu Kuai July 19, 2023, 7:28 a.m. UTC | #12
Hi,

在 2023/07/19 0:33, Sergei Shtepa 写道:
> 
> 
> On 7/18/23 14:32, Yu Kuai wrote:
>> Subject:
>> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
>> From:
>> Yu Kuai <yukuai1@huaweicloud.com>
>> Date:
>> 7/18/23, 14:32
>>
>> To:
>> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
>> CC:
>> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
>>
>>
>> Hi,
>>
>> 在 2023/07/18 19:25, Sergei Shtepa 写道:
>>> Hi.
>>>
>>> On 7/18/23 03:37, Yu Kuai wrote:
>>>> Subject:
>>>> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
>>>> From:
>>>> Yu Kuai <yukuai1@huaweicloud.com>
>>>> Date:
>>>> 7/18/23, 03:37
>>>>
>>>> To:
>>>> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
>>>> CC:
>>>> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
>>>>
>>>>
>>>> Hi,
>>>>
>>>> 在 2023/07/17 22:39, Sergei Shtepa 写道:
>>>>>
>>>>>
>>>>> On 7/11/23 04:02, Yu Kuai wrote:
>>>>>> bdev_disk_changed() is not handled, where delete_partition() and
>>>>>> add_partition() will be called, this means blkfilter for partiton will
>>>>>> be removed after partition rescan. Am I missing something?
>>>>>
>>>>> Yes, when the bdev_disk_changed() is called, all disk block devices
>>>>> are deleted and new ones are re-created. Therefore, the information
>>>>> about the attached filters will be lost. This is equivalent to
>>>>> removing the disk and adding it back.
>>>>>
>>>>> For the blksnap module, partition rescan will mean the loss of the
>>>>> change trackers data. If a snapshot was created, then such
>>>>> a partition rescan will cause the snapshot to be corrupted.
>>>>>
>>>>
>>>> I haven't review blksnap code yet, but this sounds like a problem.
>>>
>>> I can't imagine a case where this could be a problem.
>>> Partition rescan is possible only if the file system has not been
>>> mounted on any of the disk partitions. Ioctl BLKRRPART will return
>>> -EBUSY. Therefore, during normal operation of the system, rescan is
>>> not performed.
>>> And if the file systems have not been mounted, it is possible that
>>> the disk partition structure has changed or the disk in the media
>>> device has changed. In this case, it is better to detach the
>>> filter, otherwise it may lead to incorrect operation of the module.
>>>
>>> We can add prechange/postchange callback functions so that the
>>> filter can track rescan process. But at the moment, this is not
>>> necessary for the blksnap module.
>>
>> So you mean that blkfilter is only used for the case that partition
>> is mounted? (Or you mean that partition is opened)
>>
>> Then, I think you mean that filter should only be used for the partition
>> that is opended? Otherwise, filter can be gone at any time since
>> partition rescan can be gone.
>>
>> //user
>> 1. attach filter
>>          // other context rescan partition
>> 2. mount fs
>> // user will found filter is gone.
> 
> Mmm...  The fact is that at the moment the user of the filter is the
> blksnap module. There are no other filter users yet. The blksnap module
> solves the problem of creating snapshots, primarily for backup purposes.
> Therefore, the main use case is to attach a filter for an already running
> system, where all partitions are marked up, file systems are mounted.
> 
> If the server is being serviced, during which the disk is being
> re-partitioned, then disabling the filter is normal. In this case, the
> change tracker will be reset, and at the next backup, the filter will be
> attached again.

Thanks for the explanation, I was thinking that blkshap can replace
dm-snapshot.

Thanks,
Kuai

> 
> But if I were still solving the problem of saving the filter when rescanning,
> then it is necessary to take into account the UUID and name of the partition
> (struct partition_meta_info). It is unacceptable that due to a change in the
> structure of partitions, the filter is attached to another partition by mistake.
> The changed() callback would also be good to add so that the filter receives
> a notification that the block device has been updated.
> 
> But I'm not sure that this should be done, since if some code is not used in
> the kernel, then it should not be in the kernel.
> 
>>
>> Thanks,
>> Kuai
>>
>>>
>>> Therefore, I will refrain from making changes for now.
>>>
>>>>
>>>> possible solutions I have in mind:
>>>>
>>>> 1. Store blkfilter for each partition from bdev_disk_changed() before
>>>> delete_partition(), and add blkfilter back after add_partition().
>>>>
>>>> 2. Store blkfilter from gendisk as a xarray, and protect it by
>>>> 'open_mutex' like 'part_tbl', block_device can keep the pointer to
>>>> reference blkfilter so that performance from fast path is ok, and the
>>>> lifetime of blkfiter can be managed separately.
>>>>
>>>>> There was an idea to do filtering at the disk level,
>>>>> but I abandoned it.
>>>>> .
>>>>>
>>>> I think it's better to do filtering at the partition level as well.
>>>>
>>>> Thanks,
>>>> Kuai
>>>>
>>> .
>>>
>>
> .
>
Sergei Shtepa July 19, 2023, 8:36 a.m. UTC | #13
On 7/19/23 09:28, Yu Kuai wrote:
> Subject:
> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
> From:
> Yu Kuai <yukuai1@huaweicloud.com>
> Date:
> 7/19/23, 09:28
> 
> To:
> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
> CC:
> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
> 
> 
> Hi,
> 
> 在 2023/07/19 0:33, Sergei Shtepa 写道:
>>
>>
>> On 7/18/23 14:32, Yu Kuai wrote:
>>> Subject:
>>> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
>>> From:
>>> Yu Kuai <yukuai1@huaweicloud.com>
>>> Date:
>>> 7/18/23, 14:32
>>>
>>> To:
>>> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
>>> CC:
>>> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
>>>
>>>
>>> Hi,
>>>
>>> 在 2023/07/18 19:25, Sergei Shtepa 写道:
>>>> Hi.
>>>>
>>>> On 7/18/23 03:37, Yu Kuai wrote:
>>>>> Subject:
>>>>> Re: [PATCH v5 02/11] block: Block Device Filtering Mechanism
>>>>> From:
>>>>> Yu Kuai <yukuai1@huaweicloud.com>
>>>>> Date:
>>>>> 7/18/23, 03:37
>>>>>
>>>>> To:
>>>>> Sergei Shtepa <sergei.shtepa@veeam.com>, Yu Kuai <yukuai1@huaweicloud.com>, axboe@kernel.dk, hch@infradead.org, corbet@lwn.net, snitzer@kernel.org
>>>>> CC:
>>>>> viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com, willy@infradead.org, dlemoal@kernel.org, linux@weissschuh.net, jack@suse.cz, ming.lei@redhat.com, linux-block@vger.kernel.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Donald Buczek <buczek@molgen.mpg.de>, "yukuai (C)" <yukuai3@huawei.com>
>>>>>
>>>>>
>>>>> Hi,
>>>>>
>>>>> 在 2023/07/17 22:39, Sergei Shtepa 写道:
>>>>>>
>>>>>>
>>>>>> On 7/11/23 04:02, Yu Kuai wrote:
>>>>>>> bdev_disk_changed() is not handled, where delete_partition() and
>>>>>>> add_partition() will be called, this means blkfilter for partiton will
>>>>>>> be removed after partition rescan. Am I missing something?
>>>>>>
>>>>>> Yes, when the bdev_disk_changed() is called, all disk block devices
>>>>>> are deleted and new ones are re-created. Therefore, the information
>>>>>> about the attached filters will be lost. This is equivalent to
>>>>>> removing the disk and adding it back.
>>>>>>
>>>>>> For the blksnap module, partition rescan will mean the loss of the
>>>>>> change trackers data. If a snapshot was created, then such
>>>>>> a partition rescan will cause the snapshot to be corrupted.
>>>>>>
>>>>>
>>>>> I haven't review blksnap code yet, but this sounds like a problem.
>>>>
>>>> I can't imagine a case where this could be a problem.
>>>> Partition rescan is possible only if the file system has not been
>>>> mounted on any of the disk partitions. Ioctl BLKRRPART will return
>>>> -EBUSY. Therefore, during normal operation of the system, rescan is
>>>> not performed.
>>>> And if the file systems have not been mounted, it is possible that
>>>> the disk partition structure has changed or the disk in the media
>>>> device has changed. In this case, it is better to detach the
>>>> filter, otherwise it may lead to incorrect operation of the module.
>>>>
>>>> We can add prechange/postchange callback functions so that the
>>>> filter can track rescan process. But at the moment, this is not
>>>> necessary for the blksnap module.
>>>
>>> So you mean that blkfilter is only used for the case that partition
>>> is mounted? (Or you mean that partition is opened)
>>>
>>> Then, I think you mean that filter should only be used for the partition
>>> that is opended? Otherwise, filter can be gone at any time since
>>> partition rescan can be gone.
>>>
>>> //user
>>> 1. attach filter
>>>          // other context rescan partition
>>> 2. mount fs
>>> // user will found filter is gone.
>>
>> Mmm...  The fact is that at the moment the user of the filter is the
>> blksnap module. There are no other filter users yet. The blksnap module
>> solves the problem of creating snapshots, primarily for backup purposes.
>> Therefore, the main use case is to attach a filter for an already running
>> system, where all partitions are marked up, file systems are mounted.
>>
>> If the server is being serviced, during which the disk is being
>> re-partitioned, then disabling the filter is normal. In this case, the
>> change tracker will be reset, and at the next backup, the filter will be
>> attached again.
> 
> Thanks for the explanation, I was thinking that blkshap can replace
> dm-snapshot.

Thanks!
At the moment I am creating blksnap with the Veeam product needs in mind.
I would be glad if blksnap would be useful in other products as well.
If you have any thoughts/questions/suggestions/comments, then write to me
directly. I'll be happy to discuss everything.
To work on the patch, I use the branch here
Link: https://github.com/SergeiShtepa/linux/tree/blksnap-master
The user-space libs, tools and tests, compatible with the upstream is here
Link: https://github.com/veeam/blksnap/tree/stable-v2.0
Perhaps it will be useful to you.

> 
> Thanks,
> Kuai
> 
>>
>> But if I were still solving the problem of saving the filter when rescanning,
>> then it is necessary to take into account the UUID and name of the partition
>> (struct partition_meta_info). It is unacceptable that due to a change in the
>> structure of partitions, the filter is attached to another partition by mistake.
>> The changed() callback would also be good to add so that the filter receives
>> a notification that the block device has been updated.
>>
>> But I'm not sure that this should be done, since if some code is not used in
>> the kernel, then it should not be in the kernel.
>>
>>>
>>> Thanks,
>>> Kuai
>>>
>>>>
>>>> Therefore, I will refrain from making changes for now.
>>>>
>>>>>
>>>>> possible solutions I have in mind:
>>>>>
>>>>> 1. Store blkfilter for each partition from bdev_disk_changed() before
>>>>> delete_partition(), and add blkfilter back after add_partition().
>>>>>
>>>>> 2. Store blkfilter from gendisk as a xarray, and protect it by
>>>>> 'open_mutex' like 'part_tbl', block_device can keep the pointer to
>>>>> reference blkfilter so that performance from fast path is ok, and the
>>>>> lifetime of blkfiter can be managed separately.
>>>>>
>>>>>> There was an idea to do filtering at the disk level,
>>>>>> but I abandoned it.
>>>>>> .
>>>>>>
>>>>> I think it's better to do filtering at the partition level as well.
>>>>>
>>>>> Thanks,
>>>>> Kuai
>>>>>
>>>> .
>>>>
>>>
>> .
>>
>
Christoph Hellwig July 20, 2023, 6:14 a.m. UTC | #14
On Tue, Jul 18, 2023 at 09:37:33AM +0800, Yu Kuai wrote:
> I haven't review blksnap code yet, but this sounds like a problem.
> 
> possible solutions I have in mind:
> 
> 1. Store blkfilter for each partition from bdev_disk_changed() before
> delete_partition(), and add blkfilter back after add_partition().
> 
> 2. Store blkfilter from gendisk as a xarray, and protect it by
> 'open_mutex' like 'part_tbl', block_device can keep the pointer to
> reference blkfilter so that performance from fast path is ok, and the
> lifetime of blkfiter can be managed separately.

The whole point of bdev_disk_changed is that the partitions might not
be the same ones as before..
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index d801b8985b43..8336b6143a71 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3585,6 +3585,9 @@  M:	Sergei Shtepa <sergei.shtepa@veeam.com>
 L:	linux-block@vger.kernel.org
 S:	Supported
 F:	Documentation/block/blkfilter.rst
+F:	block/blk-filter.c
+F:	include/linux/blk-filter.h
+F:	include/uapi/linux/blk-filter.h
 
 BLOCK LAYER
 M:	Jens Axboe <axboe@kernel.dk>
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..041c54eb0240 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,8 @@  obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
-			disk-events.o blk-ia-ranges.o early-lookup.o
+			disk-events.o blk-ia-ranges.o early-lookup.o \
+			blk-filter.o
 
 obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
diff --git a/block/bdev.c b/block/bdev.c
index 5c46ff107706..369f73b6097a 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -429,6 +429,7 @@  struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+	bdev->bd_filter = NULL;
 	return bdev;
 }
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 2ae22bebeb3e..ede04c6ad021 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -18,6 +18,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/blk-pm.h>
 #include <linux/blk-integrity.h>
+#include <linux/blk-filter.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -586,8 +587,24 @@  static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 	return BLK_STS_OK;
 }
 
+static bool submit_bio_filter(struct bio *bio)
+{
+	if (bio_flagged(bio, BIO_FILTERED))
+		return false;
+
+	bio_set_flag(bio, BIO_FILTERED);
+	return bio->bi_bdev->bd_filter->ops->submit_bio(bio);
+}
+
 static void __submit_bio(struct bio *bio)
 {
+	/*
+	 * If there is a filter driver attached, check if the BIO needs to go to
+	 * the filter driver first, which can then pass on the bio or consume it.
+	 */
+	if (bio->bi_bdev->bd_filter && submit_bio_filter(bio))
+		return;
+
 	if (unlikely(!blk_crypto_bio_prep(&bio)))
 		return;
 
@@ -677,6 +694,15 @@  static void __submit_bio_noacct_mq(struct bio *bio)
 	current->bio_list = NULL;
 }
 
+/**
+ * submit_bio_noacct_nocheck - re-submit a bio to the block device layer for I/O
+ *	from block device filter.
+ * @bio:  The bio describing the location in memory and on the device.
+ *
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level by block device filters.  All file  systems and
+ * other upper level users of the block layer should use submit_bio() instead.
+ */
 void submit_bio_noacct_nocheck(struct bio *bio)
 {
 	blk_cgroup_bio_start(bio);
@@ -704,6 +730,7 @@  void submit_bio_noacct_nocheck(struct bio *bio)
 	else
 		__submit_bio_noacct(bio);
 }
+EXPORT_SYMBOL_GPL(submit_bio_noacct_nocheck);
 
 /**
  * submit_bio_noacct - re-submit a bio to the block device layer for I/O
diff --git a/block/blk-filter.c b/block/blk-filter.c
new file mode 100644
index 000000000000..bf31da6acf67
--- /dev/null
+++ b/block/blk-filter.c
@@ -0,0 +1,213 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#include <linux/blk-filter.h>
+#include <linux/blk-mq.h>
+#include <linux/module.h>
+
+#include "blk.h"
+
+static LIST_HEAD(blkfilters);
+static DEFINE_SPINLOCK(blkfilters_lock);
+
+static inline struct blkfilter_operations *__blkfilter_find(const char *name)
+{
+	struct blkfilter_operations *ops;
+
+	list_for_each_entry(ops, &blkfilters, link)
+		if (strncmp(ops->name, name, BLKFILTER_NAME_LENGTH) == 0)
+			return ops;
+
+	return NULL;
+}
+
+static inline struct blkfilter_operations *blkfilter_find_get(const char *name)
+{
+	struct blkfilter_operations *ops;
+
+	spin_lock(&blkfilters_lock);
+	ops = __blkfilter_find(name);
+	if (ops && !try_module_get(ops->owner))
+		ops = NULL;
+	spin_unlock(&blkfilters_lock);
+
+	return ops;
+}
+
+int blkfilter_ioctl_attach(struct block_device *bdev,
+		    struct blkfilter_name __user *argp)
+{
+	struct blkfilter_name name;
+	struct blkfilter_operations *ops;
+	struct blkfilter *flt;
+	int ret;
+
+	if (copy_from_user(&name, argp, sizeof(name)))
+		return -EFAULT;
+
+	ops = blkfilter_find_get(name.name);
+	if (!ops)
+		return -ENOENT;
+
+	ret = freeze_bdev(bdev);
+	if (ret)
+		goto out_put_module;
+	blk_mq_freeze_queue(bdev->bd_queue);
+
+	if (bdev->bd_filter) {
+		if (bdev->bd_filter->ops == ops)
+			ret = -EALREADY;
+		else
+			ret = -EBUSY;
+		goto out_unfreeze;
+	}
+
+	flt = ops->attach(bdev);
+	if (IS_ERR(flt)) {
+		ret = PTR_ERR(flt);
+		goto out_unfreeze;
+	}
+
+	flt->ops = ops;
+	bdev->bd_filter = flt;
+
+out_unfreeze:
+	blk_mq_unfreeze_queue(bdev->bd_queue);
+	thaw_bdev(bdev);
+out_put_module:
+	if (ret)
+		module_put(ops->owner);
+	return ret;
+}
+
+static void __blkfilter_detach(struct block_device *bdev)
+{
+	struct blkfilter *flt = bdev->bd_filter;
+	const struct blkfilter_operations *ops = flt->ops;
+
+	bdev->bd_filter = NULL;
+	ops->detach(flt);
+	module_put(ops->owner);
+}
+
+void blkfilter_detach(struct block_device *bdev)
+{
+	if (bdev->bd_filter) {
+		blk_mq_freeze_queue(bdev->bd_queue);
+		__blkfilter_detach(bdev);
+		blk_mq_unfreeze_queue(bdev->bd_queue);
+	}
+}
+
+int blkfilter_ioctl_detach(struct block_device *bdev,
+		    struct blkfilter_name __user *argp)
+{
+	struct blkfilter_name name;
+	int error = 0;
+
+	if (copy_from_user(&name, argp, sizeof(name)))
+		return -EFAULT;
+
+	blk_mq_freeze_queue(bdev->bd_queue);
+	if (!bdev->bd_filter) {
+		error = -ENOENT;
+		goto out_unfreeze;
+	}
+	if (strncmp(bdev->bd_filter->ops->name, name.name,
+			BLKFILTER_NAME_LENGTH)) {
+		error = -EINVAL;
+		goto out_unfreeze;
+	}
+
+	__blkfilter_detach(bdev);
+out_unfreeze:
+	blk_mq_unfreeze_queue(bdev->bd_queue);
+	return error;
+}
+
+int blkfilter_ioctl_ctl(struct block_device *bdev,
+		    struct blkfilter_ctl __user *argp)
+{
+	struct blkfilter_ctl ctl;
+	struct blkfilter *flt;
+	int ret;
+
+	if (copy_from_user(&ctl, argp, sizeof(ctl)))
+		return -EFAULT;
+
+	ret = blk_queue_enter(bdev_get_queue(bdev), 0);
+	if (ret)
+		return ret;
+
+	flt = bdev->bd_filter;
+	if (!flt || strncmp(flt->ops->name, ctl.name, BLKFILTER_NAME_LENGTH)) {
+		ret = -ENOENT;
+		goto out_queue_exit;
+	}
+
+	if (!flt->ops->ctl) {
+		ret = -ENOTTY;
+		goto out_queue_exit;
+	}
+
+	ret = flt->ops->ctl(flt, ctl.cmd, u64_to_user_ptr(ctl.opt),
+			    &ctl.optlen);
+out_queue_exit:
+	blk_queue_exit(bdev_get_queue(bdev));
+	return ret;
+}
+
+ssize_t blkfilter_show(struct block_device *bdev, char *buf)
+{
+	ssize_t ret = 0;
+
+	blk_mq_freeze_queue(bdev->bd_queue);
+	if (bdev->bd_filter)
+		ret = sprintf(buf, "%s\n", bdev->bd_filter->ops->name);
+	else
+		ret = sprintf(buf, "\n");
+	blk_mq_unfreeze_queue(bdev->bd_queue);
+
+	return ret;
+}
+
+/**
+ * blkfilter_register() - Register block device filter operations
+ * @ops:	The operations to register.
+ *
+ * Return:
+ *	0 if succeeded,
+ *	-EBUSY if a block device filter with the same name is already
+ *	registered.
+ */
+int blkfilter_register(struct blkfilter_operations *ops)
+{
+	struct blkfilter_operations *found;
+	int ret = 0;
+
+	spin_lock(&blkfilters_lock);
+	found = __blkfilter_find(ops->name);
+	if (found)
+		ret = -EBUSY;
+	else
+		list_add_tail(&ops->link, &blkfilters);
+	spin_unlock(&blkfilters_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkfilter_register);
+
+/**
+ * blkfilter_unregister() - Unregister block device filter operations
+ * @ops:	The operations to unregister.
+ *
+ * Important: before unloading, it is necessary to detach the filter from all
+ * block devices.
+ *
+ */
+void blkfilter_unregister(struct blkfilter_operations *ops)
+{
+	spin_lock(&blkfilters_lock);
+	list_del(&ops->link);
+	spin_unlock(&blkfilters_lock);
+}
+EXPORT_SYMBOL_GPL(blkfilter_unregister);
diff --git a/block/blk.h b/block/blk.h
index 9582fcd0df41..2c14fa938d8c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -7,6 +7,8 @@ 
 #include <xen/xen.h>
 #include "blk-crypto-internal.h"
 
+struct blkfilter_ctl;
+struct blkfilter_name;
 struct elevator_type;
 
 /* Max future timer expiry for timeouts */
@@ -454,6 +456,15 @@  long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
 extern const struct address_space_operations def_blk_aops;
 
+int blkfilter_ioctl_attach(struct block_device *bdev,
+		    struct blkfilter_name __user *argp);
+int blkfilter_ioctl_detach(struct block_device *bdev,
+		    struct blkfilter_name __user *argp);
+int blkfilter_ioctl_ctl(struct block_device *bdev,
+		    struct blkfilter_ctl __user *argp);
+void blkfilter_detach(struct block_device *bdev);
+ssize_t blkfilter_show(struct block_device *bdev, char *buf);
+
 int disk_register_independent_access_ranges(struct gendisk *disk);
 void disk_unregister_independent_access_ranges(struct gendisk *disk);
 
diff --git a/block/genhd.c b/block/genhd.c
index 4e5fd6aaa883..d9aca0797886 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -25,6 +25,7 @@ 
 #include <linux/pm_runtime.h>
 #include <linux/badblocks.h>
 #include <linux/part_stat.h>
+#include <linux/blk-filter.h>
 #include "blk-throttle.h"
 
 #include "blk.h"
@@ -648,6 +649,7 @@  void del_gendisk(struct gendisk *disk)
 		remove_inode_hash(part->bd_inode);
 		fsync_bdev(part);
 		__invalidate_device(part, true);
+		blkfilter_detach(part);
 	}
 	mutex_unlock(&disk->open_mutex);
 
@@ -1033,6 +1035,12 @@  static ssize_t diskseq_show(struct device *dev,
 	return sprintf(buf, "%llu\n", disk->diskseq);
 }
 
+static ssize_t disk_filter_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	return blkfilter_show(dev_to_bdev(dev), buf);
+}
+
 static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1046,6 +1054,7 @@  static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
 static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
 static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
+static DEVICE_ATTR(filter, 0444, disk_filter_show, NULL);
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
@@ -1092,6 +1101,7 @@  static struct attribute *disk_attrs[] = {
 	&dev_attr_events_async.attr,
 	&dev_attr_events_poll_msecs.attr,
 	&dev_attr_diskseq.attr,
+	&dev_attr_filter.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
diff --git a/block/ioctl.c b/block/ioctl.c
index c7d7d4345edb..170020d1ce0e 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -2,6 +2,7 @@ 
 #include <linux/capability.h>
 #include <linux/compat.h>
 #include <linux/blkdev.h>
+#include <linux/blk-filter.h>
 #include <linux/export.h>
 #include <linux/gfp.h>
 #include <linux/blkpg.h>
@@ -546,6 +547,12 @@  static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
 		return blkdev_pr_preempt(bdev, argp, true);
 	case IOC_PR_CLEAR:
 		return blkdev_pr_clear(bdev, argp);
+	case BLKFILTER_ATTACH:
+		return blkfilter_ioctl_attach(bdev, argp);
+	case BLKFILTER_DETACH:
+		return blkfilter_ioctl_detach(bdev, argp);
+	case BLKFILTER_CTL:
+		return blkfilter_ioctl_ctl(bdev, argp);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 87a21942d606..8e2834566c38 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -10,6 +10,7 @@ 
 #include <linux/ctype.h>
 #include <linux/vmalloc.h>
 #include <linux/raid/detect.h>
+#include <linux/blk-filter.h>
 #include "check.h"
 
 static int (*const check_part[])(struct parsed_partitions *) = {
@@ -200,6 +201,12 @@  static ssize_t part_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
 }
 
+static ssize_t part_filter_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	return blkfilter_show(dev_to_bdev(dev), buf);
+}
+
 static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
 static DEVICE_ATTR(start, 0444, part_start_show, NULL);
 static DEVICE_ATTR(size, 0444, part_size_show, NULL);
@@ -208,6 +215,7 @@  static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
 static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
+static DEVICE_ATTR(filter, 0444, part_filter_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
@@ -222,6 +230,7 @@  static struct attribute *part_attrs[] = {
 	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
+	&dev_attr_filter.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -284,6 +293,7 @@  static void delete_partition(struct block_device *part)
 
 	fsync_bdev(part);
 	__invalidate_device(part, true);
+	blkfilter_detach(part);
 
 	drop_partition(part);
 }
diff --git a/include/linux/blk-filter.h b/include/linux/blk-filter.h
new file mode 100644
index 000000000000..0afdb40f3bab
--- /dev/null
+++ b/include/linux/blk-filter.h
@@ -0,0 +1,51 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#ifndef _LINUX_BLK_FILTER_H
+#define _LINUX_BLK_FILTER_H
+
+#include <uapi/linux/blk-filter.h>
+
+struct bio;
+struct block_device;
+struct blkfilter_operations;
+
+/**
+ * struct blkfilter - Block device filter.
+ *
+ * @ops:	Block device filter operations.
+ *
+ * For each filtered block device, the filter creates a data structure
+ * associated with this device. The data in this structure is specific to the
+ * filter, but it must contain a pointer to the block device filter account.
+ */
+struct blkfilter {
+	const struct blkfilter_operations *ops;
+};
+
+/**
+ * struct blkfilter_operations - Block device filter operations.
+ *
+ * @link:	Entry in the global list of filter drivers
+ *		(must not be accessed by the driver).
+ * @owner:	Module implementing the filter driver.
+ * @name:	Name of the filter driver.
+ * @attach:	Attach the filter driver to the block device.
+ * @detach:	Detach the filter driver from the block device.
+ * @ctl:	Send a control command to the filter driver.
+ * @submit_bio:	Handle bio submissions to the filter driver.
+ */
+struct blkfilter_operations {
+	struct list_head link;
+	struct module *owner;
+	const char *name;
+	struct blkfilter *(*attach)(struct block_device *bdev);
+	void (*detach)(struct blkfilter *flt);
+	int (*ctl)(struct blkfilter *flt, const unsigned int cmd,
+		   __u8 __user *buf, __u32 *plen);
+	bool (*submit_bio)(struct bio *bio);
+};
+
+int blkfilter_register(struct blkfilter_operations *ops);
+void blkfilter_unregister(struct blkfilter_operations *ops);
+
+#endif /* _UAPI_LINUX_BLK_FILTER_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index deb69eeab6bd..5ba313b3b11d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -75,6 +75,7 @@  struct block_device {
 	 * path
 	 */
 	struct device		bd_device;
+	struct blkfilter	*bd_filter;
 } __randomize_layout;
 
 #define bdev_whole(_bdev) \
@@ -341,6 +342,7 @@  enum {
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
+	BIO_FILTERED,		/* bio has already been filtered */
 	BIO_FLAG_LAST
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f4c339d9dd03..e7a4a4866792 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -842,6 +842,7 @@  void blk_request_module(dev_t devt);
 
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
+void submit_bio_noacct_nocheck(struct bio *bio);
 void submit_bio_noacct(struct bio *bio);
 struct bio *bio_split_to_limits(struct bio *bio);
 
diff --git a/include/uapi/linux/blk-filter.h b/include/uapi/linux/blk-filter.h
new file mode 100644
index 000000000000..18885dc1b717
--- /dev/null
+++ b/include/uapi/linux/blk-filter.h
@@ -0,0 +1,35 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (C) 2023 Veeam Software Group GmbH */
+#ifndef _UAPI_LINUX_BLK_FILTER_H
+#define _UAPI_LINUX_BLK_FILTER_H
+
+#include <linux/types.h>
+
+#define BLKFILTER_NAME_LENGTH	32
+
+/**
+ * struct blkfilter_name - parameter for BLKFILTER_ATTACH and BLKFILTER_DETACH
+ *      ioctl.
+ *
+ * @name:       Name of block device filter.
+ */
+struct blkfilter_name {
+	__u8 name[BLKFILTER_NAME_LENGTH];
+};
+
+/**
+ * struct blkfilter_ctl - parameter for BLKFILTER_CTL ioctl
+ *
+ * @name:	Name of block device filter.
+ * @cmd:	The filter-specific operation code of the command.
+ * @optlen:	Size of data at @opt.
+ * @opt:	Userspace buffer with options.
+ */
+struct blkfilter_ctl {
+	__u8 name[BLKFILTER_NAME_LENGTH];
+	__u32 cmd;
+	__u32 optlen;
+	__u64 opt;
+};
+
+#endif /* _UAPI_LINUX_BLK_FILTER_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..7904f157b245 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -189,6 +189,9 @@  struct fsxattr {
  * A jump here: 130-136 are reserved for zoned block devices
  * (see uapi/linux/blkzoned.h)
  */
+#define BLKFILTER_ATTACH	_IOWR(0x12, 140, struct blkfilter_name)
+#define BLKFILTER_DETACH	_IOWR(0x12, 141, struct blkfilter_name)
+#define BLKFILTER_CTL		_IOWR(0x12, 142, struct blkfilter_ctl)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */