diff mbox series

[01/20] block, blk_filter: enable block device filters

Message ID 1655135593-1900-2-git-send-email-sergei.shtepa@veeam.com (mailing list archive)
State New, archived
Headers show
Series blksnap - creating non-persistent snapshots for backup | expand

Commit Message

Sergei Shtepa June 13, 2022, 3:52 p.m. UTC
Allows to attach block device filters to the block devices.
Kernel modules can use this functionality to extend the
capabilities of the block layer.

Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
---
 block/Kconfig             |   8 +++
 block/bdev.c              | 129 ++++++++++++++++++++++++++++++++++++++
 block/blk-core.c          |  88 ++++++++++++++++++++++++++
 include/linux/blk_types.h |  22 +++++++
 include/linux/blkdev.h    |  81 ++++++++++++++++++++++++
 5 files changed, 328 insertions(+)

Comments

Randy Dunlap June 13, 2022, 9:50 p.m. UTC | #1
Hi--

On 6/13/22 08:52, Sergei Shtepa wrote:
> Allows to attach block device filters to the block devices.
> Kernel modules can use this functionality to extend the
> capabilities of the block layer.
> 
> Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
> ---
>  block/Kconfig             |   8 +++
>  block/bdev.c              | 129 ++++++++++++++++++++++++++++++++++++++
>  block/blk-core.c          |  88 ++++++++++++++++++++++++++
>  include/linux/blk_types.h |  22 +++++++
>  include/linux/blkdev.h    |  81 ++++++++++++++++++++++++
>  5 files changed, 328 insertions(+)
> 

> diff --git a/block/bdev.c b/block/bdev.c
> index 5fe06c1f2def..4bcd9f4378e3 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -426,8 +426,15 @@ static void init_once(void *data)
>  	inode_init_once(&ei->vfs_inode);
>  }
>  
> +#ifdef CONFIG_BLK_FILTER
> +static void bdev_filter_cleanup(struct block_device *bdev);
> +#endif
> +
>  static void bdev_evict_inode(struct inode *inode)
>  {
> +#ifdef CONFIG_BLK_FILTER
> +	bdev_filter_cleanup(I_BDEV(inode));
> +#endif
>  	truncate_inode_pages_final(&inode->i_data);
>  	invalidate_inode_buffers(inode); /* is it needed here? */
>  	clear_inode(inode);
> @@ -503,6 +510,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
>  		return NULL;
>  	}
>  	bdev->bd_disk = disk;
> +
> +#ifdef CONFIG_BLK_FILTER
> +	memset(bdev->bd_filters, 0, sizeof(bdev->bd_filters));
> +	spin_lock_init(&bdev->bd_filters_lock);
> +#endif
>  	return bdev;
>  }
>  
> @@ -1071,3 +1083,120 @@ void sync_bdevs(bool wait)
>  	spin_unlock(&blockdev_superblock->s_inode_list_lock);
>  	iput(old_inode);
>  }
> +
> +#ifdef CONFIG_BLK_FILTER
> +static void bdev_filter_cleanup(struct block_device *bdev)
> +{
> +	int altitude;
> +	struct bdev_filter *flt;
> +
> +	for (altitude = 0; altitude < bdev_filter_alt_end; altitude++) {
> +		spin_lock(&bdev->bd_filters_lock);
> +		flt = bdev->bd_filters[altitude];
> +		bdev->bd_filters[altitude] = NULL;
> +		spin_unlock(&bdev->bd_filters_lock);
> +
> +		bdev_filter_put(flt);
> +	}
> +}
> +
> +/**
> + * bdev_filter_attach - Attach a filter to the original block device.
> + * @bdev:
> + *	Block device.
> + * @name:
> + *	Name of the block device filter.
> + * @altitude:
> + *	Altituda number of the block device filter.

What is "Altituda"?  Just a typo?

> + * @flt:
> + *	Pointer to the filter structure.
> + *
> + * Before adding a filter, it is necessary to initialize &struct bdev_filter.
> + *
> + * The bdev_filter_detach() function allows to detach the filter from the block
> + * device.
> + *
> + * Return:
> + * 0 - OK
> + * -EALREADY - a filter with this name already exists
> + */
> +int bdev_filter_attach(struct block_device *bdev, const char *name,
> +		       const enum bdev_filter_altitudes altitude,
> +		       struct bdev_filter *flt)
> +{
> +	int ret = 0;
> +
> +	spin_lock(&bdev->bd_filters_lock);
> +	if (bdev->bd_filters[altitude])
> +		ret = -EALREADY;
> +	else
> +		bdev->bd_filters[altitude] = flt;
> +	spin_unlock(&bdev->bd_filters_lock);
> +
> +	if (!ret)
> +		pr_info("block device filter '%s' has been attached to %d:%d",
> +			name, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(bdev_filter_attach);
> +
> +/**
> + * bdev_filter_detach - Detach a filter from the block device.
> + * @bdev:
> + *	Block device.
> + * @name:
> + *	Name of the block device filter.
> + * @altitude:
> + *	Altituda number of the block device filter.

Ditto.

> + *
> + * The filter should be added using the bdev_filter_attach() function.
> + *
> + * Return:
> + * 0 - OK
> + * -ENOENT - the filter was not found in the linked list
> + */
> +int bdev_filter_detach(struct block_device *bdev, const char *name,
> +		       const enum bdev_filter_altitudes altitude)
> +{
> +	struct bdev_filter *flt = NULL;
> +
> +	spin_lock(&bdev->bd_filters_lock);
> +	flt = bdev->bd_filters[altitude];
> +	bdev->bd_filters[altitude] = NULL;
> +	spin_unlock(&bdev->bd_filters_lock);
> +
> +	if (!flt)
> +		return -ENOENT;
> +
> +	bdev_filter_put(flt);
> +	pr_info("block device filter '%s' has been detached from %d:%d",
> +		name, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(bdev_filter_detach);
> +
> +/**
> + * bdev_filter_get_by_altitude - Get filter by altitude.
> + * @bdev:
> + *	Pointer to the block device structure.

kernel-doc says:
bdev.c:1190: warning: Function parameter or member 'altitude' not described in '
bdev_filter_get_by_altitude'

> + *
> + * Return:
> + * pointer - pointer to filters structure from &struct blk_filter
> + * NULL - no filter has been set
> + */
> +struct bdev_filter *bdev_filter_get_by_altitude(struct block_device *bdev,
> +				const enum bdev_filter_altitudes altitude)
> +{
> +	struct bdev_filter *flt;
> +
> +	spin_lock(&bdev->bd_filters_lock);
> +	flt = bdev->bd_filters[altitude];
> +	if (flt)
> +		bdev_filter_get(flt);
> +	spin_unlock(&bdev->bd_filters_lock);
> +
> +	return flt;
> +}
> +EXPORT_SYMBOL_GPL(bdev_filter_get_by_altitude);
> +#endif



> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 608d577734c2..24cb5293897f 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1573,4 +1573,85 @@ struct io_comp_batch {
>  
>  #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
>  
> +#ifdef CONFIG_BLK_FILTER
> +/**
> + * enum bdev_filter_result - The result of bio processing by
> + *	the block device filter.
> + *
> + * @bdev_filter_skip:
> + *	Original bio does not need to be submitted.
> + * @bdev_filter_pass:
> + *	It is necessary to submit the original request.
> + * @bdev_filter_repeat:
> + *	Bio processing has not been completed, a second call is required.
> + * @bdev_filter_redirect:
> + *	Original bio was redirected to another block device. The set
> + *	of filters on it is different, so processing must be repeated.
> + */
> +enum bdev_filter_result {
> +	bdev_filter_skip = 0,
> +	bdev_filter_pass,
> +	bdev_filter_repeat,
> +	bdev_filter_redirect
> +};
> +struct bdev_filter;
> +/**
> + * bdev_filter_operations - List of callback functions for the filter.

blkdev.h:1607: warning: cannot understand function prototype: 'struct bdev_filter_operations '

Missing "struct " before the struct name above.

> + *
> + * @submit_bio_cb:
> + *	A callback function for bio processing.
> + * @detach_cb:
> + *	A callback function to disable the filter when removing a block
> + *	device from the system.
> + */
> +struct bdev_filter_operations {
> +	enum bdev_filter_result (*submit_bio_cb)(struct bio *bio,
> +						 struct bdev_filter *flt);
> +	void (*detach_cb)(struct kref *kref);
> +};
> +/**
> + * struct bdev_filter - Block device filter.
> + *
> + * @kref:
> + *	Kernel reference counter.
> + * @fops:
> + *	The pointer to &struct bdev_filter_operations with callback
> + *	functions for the filter.
> + */
> +struct bdev_filter {
> +	struct kref kref;
> +	const struct bdev_filter_operations *fops;
> +};


thanks.
kernel test robot June 14, 2022, 9:01 a.m. UTC | #2
Hi Sergei,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on v5.19-rc2 next-20220614]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Sergei-Shtepa/blksnap-creating-non-persistent-snapshots-for-backup/20220614-025950
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: openrisc-randconfig-c023-20220613 (https://download.01.org/0day-ci/archive/20220614/202206141610.QsI0Qgur-lkp@intel.com/config)
compiler: or1k-linux-gcc (GCC) 11.3.0

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>


cocci warnings: (new ones prefixed by >>)
>> block/blk-core.c:792:3-4: Unneeded semicolon
Sergei Shtepa June 14, 2022, 9:19 a.m. UTC | #3
Hi Randy.

Thanks for the review.
I agree with all the comments and will pay more attention to the documentation.
Sergei Shtepa June 14, 2022, 9:21 a.m. UTC | #4
Hi Randy.

Thanks for the review.
I agree with all the comments and will pay more attention to the documentation.
Christoph Hellwig July 6, 2022, 12:59 p.m. UTC | #5
Hi Sergei,

thanks for picking up this work again.

I think we can simply the blk filter concept quite a bit:

 a) we only need one filter driver, at last for now
 b) we don't need an extra lock and can just rely on freezing
    the queue and a cmpxchg for attach/detach

The benefit is that there is basically no I/O path overhead now, and
the cod is so simple that we don't need a new config option.

This is untested, and actually breaks blksnap, although I've left
comments in there to fix it after adjusting it enough to at least
compile.  Let me know if this makes sense to you, and I'll try to
come up with comments on the blksnap code as well.

diff --git a/block/Kconfig b/block/Kconfig
index 83472a07822ab..444c5ab3b67e2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -224,14 +224,6 @@ config BLK_MQ_RDMA
 config BLK_PM
 	def_bool PM
 
-config BLK_FILTER
-	bool "Enable block device filters"
-	default n
-	help
-	  Enabling this lets the block layer filters handle bio requests.
-	  Kernel modules can use this feature to extend the functionality
-	  of the block layer.
-
 # do not use in new code
 config BLOCK_HOLDER_DEPRECATED
 	bool
diff --git a/block/bdev.c b/block/bdev.c
index 4bcd9f4378e3b..2db19ed899baa 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -12,6 +12,7 @@
 #include <linux/major.h>
 #include <linux/device_cgroup.h>
 #include <linux/blkdev.h>
+#include <linux/blk-filter.h>
 #include <linux/blk-integrity.h>
 #include <linux/backing-dev.h>
 #include <linux/module.h>
@@ -426,15 +427,13 @@ static void init_once(void *data)
 	inode_init_once(&ei->vfs_inode);
 }
 
-#ifdef CONFIG_BLK_FILTER
-static void bdev_filter_cleanup(struct block_device *bdev);
-#endif
-
 static void bdev_evict_inode(struct inode *inode)
 {
-#ifdef CONFIG_BLK_FILTER
-	bdev_filter_cleanup(I_BDEV(inode));
-#endif
+	struct bdev_filter *flt = I_BDEV(inode)->bd_filter;
+
+	if (flt)
+		flt->ops->detach(flt);
+
 	truncate_inode_pages_final(&inode->i_data);
 	invalidate_inode_buffers(inode); /* is it needed here? */
 	clear_inode(inode);
@@ -510,11 +509,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
-
-#ifdef CONFIG_BLK_FILTER
-	memset(bdev->bd_filters, 0, sizeof(bdev->bd_filters));
-	spin_lock_init(&bdev->bd_filters_lock);
-#endif
 	return bdev;
 }
 
@@ -1084,119 +1078,45 @@ void sync_bdevs(bool wait)
 	iput(old_inode);
 }
 
-#ifdef CONFIG_BLK_FILTER
-static void bdev_filter_cleanup(struct block_device *bdev)
-{
-	int altitude;
-	struct bdev_filter *flt;
-
-	for (altitude = 0; altitude < bdev_filter_alt_end; altitude++) {
-		spin_lock(&bdev->bd_filters_lock);
-		flt = bdev->bd_filters[altitude];
-		bdev->bd_filters[altitude] = NULL;
-		spin_unlock(&bdev->bd_filters_lock);
-
-		bdev_filter_put(flt);
-	}
-}
-
 /**
- * bdev_filter_attach - Attach a filter to the original block device.
- * @bdev:
- *	Block device.
- * @name:
- *	Name of the block device filter.
- * @altitude:
- *	Altituda number of the block device filter.
- * @flt:
- *	Pointer to the filter structure.
- *
- * Before adding a filter, it is necessary to initialize &struct bdev_filter.
- *
- * The bdev_filter_detach() function allows to detach the filter from the block
- * device.
- *
- * Return:
- * 0 - OK
- * -EALREADY - a filter with this name already exists
+ * bdev_filter_attach - attach a filter to a block device
+ * @bdev:	block device to attach to
+ * @flt:	filter to attach
  */
-int bdev_filter_attach(struct block_device *bdev, const char *name,
-		       const enum bdev_filter_altitudes altitude,
-		       struct bdev_filter *flt)
+int bdev_filter_attach(struct block_device *bdev, struct bdev_filter *flt)
 {
-	int ret = 0;
+	struct bdev_filter *old;
 
-	spin_lock(&bdev->bd_filters_lock);
-	if (bdev->bd_filters[altitude])
-		ret = -EALREADY;
-	else
-		bdev->bd_filters[altitude] = flt;
-	spin_unlock(&bdev->bd_filters_lock);
+	blk_mq_freeze_queue(bdev_get_queue(bdev));
+	old = cmpxchg(&bdev->bd_filter, NULL, flt);
+	blk_mq_unfreeze_queue(bdev_get_queue(bdev));
 
-	if (!ret)
-		pr_info("block device filter '%s' has been attached to %d:%d",
-			name, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+	if (old)
+		return -EALREADY;
 
-	return ret;
+	pr_info("block device filter '%s' has been attached to %pg",
+		flt->ops->name, bdev->bd_disk);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(bdev_filter_attach);
 
 /**
- * bdev_filter_detach - Detach a filter from the block device.
- * @bdev:
- *	Block device.
- * @name:
- *	Name of the block device filter.
- * @altitude:
- *	Altituda number of the block device filter.
- *
- * The filter should be added using the bdev_filter_attach() function.
- *
- * Return:
- * 0 - OK
- * -ENOENT - the filter was not found in the linked list
+ * bdev_filter_detach - detach a filter a the block device.
+ * @bdev:	block device to detach from
+ * @flt:	filter to detach
  */
-int bdev_filter_detach(struct block_device *bdev, const char *name,
-		       const enum bdev_filter_altitudes altitude)
+void bdev_filter_detach(struct block_device *bdev, struct bdev_filter *flt)
 {
-	struct bdev_filter *flt = NULL;
+	struct bdev_filter *old;
 
-	spin_lock(&bdev->bd_filters_lock);
-	flt = bdev->bd_filters[altitude];
-	bdev->bd_filters[altitude] = NULL;
-	spin_unlock(&bdev->bd_filters_lock);
+	blk_mq_freeze_queue(bdev_get_queue(bdev));
+	old = cmpxchg(&bdev->bd_filter, flt, NULL);
+	blk_mq_unfreeze_queue(bdev_get_queue(bdev));
 
-	if (!flt)
-		return -ENOENT;
+	if (old != flt)
+		return;
 
-	bdev_filter_put(flt);
-	pr_info("block device filter '%s' has been detached from %d:%d",
-		name, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
-	return 0;
+	pr_info("block device filter '%s' has been detached from %pg",
+		flt->ops->name, bdev->bd_disk);
 }
 EXPORT_SYMBOL_GPL(bdev_filter_detach);
-
-/**
- * bdev_filter_get_by_altitude - Get filter by altitude.
- * @bdev:
- *	Pointer to the block device structure.
- *
- * Return:
- * pointer - pointer to filters structure from &struct blk_filter
- * NULL - no filter has been set
- */
-struct bdev_filter *bdev_filter_get_by_altitude(struct block_device *bdev,
-				const enum bdev_filter_altitudes altitude)
-{
-	struct bdev_filter *flt;
-
-	spin_lock(&bdev->bd_filters_lock);
-	flt = bdev->bd_filters[altitude];
-	if (flt)
-		bdev_filter_get(flt);
-	spin_unlock(&bdev->bd_filters_lock);
-
-	return flt;
-}
-EXPORT_SYMBOL_GPL(bdev_filter_get_by_altitude);
-#endif
diff --git a/block/blk-core.c b/block/blk-core.c
index cbe004a2b8bdf..b2b55a26ad682 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-filter.h>
 #include <linux/blk-pm.h>
 #include <linux/blk-integrity.h>
 #include <linux/highmem.h>
@@ -701,85 +702,31 @@ void submit_bio_noacct_nocheck(struct bio *bio)
 		__submit_bio_noacct(bio);
 }
 
-#ifdef CONFIG_BLK_FILTER
-
-/**
- * __filter_bio() - Process bio by the block device filter.
- * @flt:
- *	Block device filter.
- * @bio:
- *	Original I/O unit.
- *
- * Return:
- * bdev_filter_pass - original bio should be submitted
- * bdev_filter_skip - do not submit original bio
- * bdev_filter_redirect - repeat bio processing for another block device
- */
-static inline enum bdev_filter_result __filter_bio(struct bdev_filter *flt,
-						   struct bio *bio)
+static bool bio_call_filter(struct bio *bio)
 {
+	struct block_device *bdev = bio->bi_bdev;
+	struct bdev_filter *flt = bdev->bd_filter;
 	enum bdev_filter_result result;
-	struct bio *new_bio;
-	struct bio_list bio_list[2] = { };
 
-	do {
-		bio_list_init(&bio_list[0]);
-		current->bio_list = bio_list;
+	if (bio_flagged(bio, BIO_FILTERED))
+		return true;
 
-		result = flt->fops->submit_bio_cb(bio, flt);
+	do {
+		struct bio_list bio_list[2] = { };
+		struct bio *new_bio;
 
+		current->bio_list = bio_list;
+		result = flt->ops->submit_bio(bio, flt);
 		current->bio_list = NULL;
 
 		while ((new_bio = bio_list_pop(&bio_list[0]))) {
 			bio_set_flag(new_bio, BIO_FILTERED);
 			submit_bio_noacct(new_bio);
 		};
-	} while (result == bdev_filter_repeat);
-
-	return result;
-}
-
-/**
- * filter_bio() - Pass bio to the block device filters.
- * @bio:
- *	Original I/O unit.
- *
- * Return:
- * true - original bio should be submitted
- * false - do not submit original bio
- */
-static bool filter_bio(struct bio *bio)
-{
-	enum bdev_filter_result result = bdev_filter_pass;
+	} while (result == BIO_FILTER_REPEAT);
 
-	if (bio_flagged(bio, BIO_FILTERED))
-		return true;
-	do {
-		struct block_device *bdev = bio->bi_bdev;
-		unsigned int altitude = 0;
-
-		while (altitude < bdev_filter_alt_end) {
-			struct bdev_filter *flt;
-
-			spin_lock(&bdev->bd_filters_lock);
-			flt = bdev->bd_filters[altitude];
-			if (flt)
-				bdev_filter_get(flt);
-			spin_unlock(&bdev->bd_filters_lock);
-
-			if (flt) {
-				result = __filter_bio(flt, bio);
-				bdev_filter_put(flt);
-				if (result != bdev_filter_pass)
-					break;
-			}
-			altitude++;
-		}
-	} while (result == bdev_filter_redirect);
-
-	return (result == bdev_filter_pass);
+	return result == BIO_FILTER_PASS;
 }
-#endif
 
 /**
  * submit_bio_noacct - re-submit a bio to the block device layer for I/O
@@ -814,14 +761,14 @@ void submit_bio_noacct(struct bio *bio)
 		goto end_io;
 	if (unlikely(bio_check_ro(bio)))
 		goto end_io;
-#ifdef CONFIG_BLK_FILTER
+
 	/*
-	 * It looks like should_fail_bio() and bio_check_ro() can be placed
-	 * in a separate block device filter for debugging.
+	 * Call the filter driver if there is one, and return if it consumed the
+	 * bio.
 	 */
-	if (!filter_bio(bio))
+	if (unlikely(bdev->bd_filter && !bio_call_filter(bio)))
 		goto end_io;
-#endif
+
 	if (!bio_flagged(bio, BIO_REMAPPED)) {
 		if (unlikely(bio_check_eod(bio)))
 			goto end_io;
diff --git a/drivers/block/blksnap/Kconfig b/drivers/block/blksnap/Kconfig
index 8588a89e30ad3..6f5ead17d4fac 100644
--- a/drivers/block/blksnap/Kconfig
+++ b/drivers/block/blksnap/Kconfig
@@ -6,7 +6,6 @@
 
 config BLK_SNAP
 	tristate "Block device snapshot and change tracker module"
-	depends on BLK_FILTER
 	help
 	  Allow to create snapshots and track block changes for a block
 	  devices. Designed for creating backups for any block devices
diff --git a/drivers/block/blksnap/tracker.c b/drivers/block/blksnap/tracker.c
index 705e64321cb29..aefac9e369ef5 100644
--- a/drivers/block/blksnap/tracker.c
+++ b/drivers/block/blksnap/tracker.c
@@ -58,11 +58,18 @@ void tracker_free(struct tracker *tracker)
 	refcount_dec(&trackers_counter);
 }
 
+/*
+ * XXX: this API needs to go away, filter drivers need to keep track of their
+ * instances.  Even if blksnap wants to index by the dev_t it could trivially
+ * do thatwith e.g. an xarray.
+ */
+extern struct bdev_filter *bdev_filter_get_by_bdev(struct block_device *bdev);
+
 struct tracker *tracker_get_by_dev(struct block_device *bdev)
 {
 	struct bdev_filter *flt;
 
-	flt = bdev_filter_get_by_altitude(bdev, bdev_filter_alt_blksnap);
+	flt = bdev_filter_get_by_bdev(bdev);
 	if (IS_ERR(flt))
 		return ERR_PTR(PTR_ERR(flt));
 	if (!flt)
@@ -70,10 +77,10 @@ struct tracker *tracker_get_by_dev(struct block_device *bdev)
 	return container_of(flt, struct tracker, flt);
 }
 
-static enum bdev_filter_result tracker_submit_bio_cb(struct bio *bio,
+static enum bdev_filter_result tracker_submit_bio(struct bio *bio,
 		struct bdev_filter *flt)
 {
-	enum bdev_filter_result ret = bdev_filter_pass;
+	enum bdev_filter_result ret = BIO_FILTER_PASS;
 	struct tracker *tracker = container_of(flt, struct tracker, flt);
 	int err;
 	sector_t sector;
@@ -83,7 +90,7 @@ static enum bdev_filter_result tracker_submit_bio_cb(struct bio *bio,
 	if (bio->bi_opf & REQ_NOWAIT) {
 		if (!percpu_down_read_trylock(&tracker_submit_lock)) {
 			bio_wouldblock_error(bio);
-			return bdev_filter_skip;
+			return BIO_FILTER_SKIP;
 		}
 	} else
 		percpu_down_read(&tracker_submit_lock);
@@ -118,7 +125,7 @@ static enum bdev_filter_result tracker_submit_bio_cb(struct bio *bio,
 	if (unlikely(err)) {
 		if (err == -EAGAIN) {
 			bio_wouldblock_error(bio);
-			ret = bdev_filter_skip;
+			ret = BIO_FILTER_SKIP;
 		} else
 			pr_err("Failed to copy data to diff storage with error %d\n", abs(err));
 
@@ -131,7 +138,7 @@ static enum bdev_filter_result tracker_submit_bio_cb(struct bio *bio,
 	 * be sent and returned to complete the processing of the original bio.
 	 */
 	if (!bio_list_empty(current->bio_list) && (bio->bi_opf & REQ_SYNC))
-		ret = bdev_filter_repeat;
+		ret = BIO_FILTER_REPEAT;
 out:
 	percpu_up_read(&tracker_submit_lock);
 	return ret;
@@ -157,9 +164,8 @@ static void tracker_release_work(struct work_struct *work)
 	} while (tracker);
 }
 
-static void tracker_detach_cb(struct kref *kref)
+static void tracker_detach(struct bdev_filter *flt)
 {
-	struct bdev_filter *flt = container_of(kref, struct bdev_filter, kref);
 	struct tracker *tracker = container_of(flt, struct tracker, flt);
 
 	spin_lock(&tracker_release_worker.lock);
@@ -170,8 +176,9 @@ static void tracker_detach_cb(struct kref *kref)
 }
 
 static const struct bdev_filter_operations tracker_fops = {
-	.submit_bio_cb = tracker_submit_bio_cb,
-	.detach_cb = tracker_detach_cb
+	.name		= KBUILD_MODNAME,
+	.submit_bio	= tracker_submit_bio,
+	.detach		= tracker_detach
 };
 
 static int tracker_filter_attach(struct block_device *bdev,
@@ -191,8 +198,7 @@ static int tracker_filter_attach(struct block_device *bdev,
 			 MINOR(bdev->bd_dev));
 	}
 
-	ret = bdev_filter_attach(bdev, KBUILD_MODNAME, bdev_filter_alt_blksnap,
-				 &tracker->flt);
+	ret = bdev_filter_attach(bdev, &tracker->flt);
 	if (is_frozen) {
 		if (thaw_bdev(bdev))
 			pr_err("Failed to thaw device [%u:%u]\n",
@@ -209,9 +215,9 @@ static int tracker_filter_attach(struct block_device *bdev,
 	return ret;
 }
 
-static int tracker_filter_detach(struct block_device *bdev)
+static int tracker_filter_detach(struct block_device *bdev,
+				 struct tracker *tracker)
 {
-	int ret;
 	bool is_frozen = false;
 
 	pr_debug("Tracker delete filter\n");
@@ -224,7 +230,8 @@ static int tracker_filter_detach(struct block_device *bdev)
 			 MINOR(bdev->bd_dev));
 	}
 
-	ret = bdev_filter_detach(bdev, KBUILD_MODNAME, bdev_filter_alt_blksnap);
+	bdev_filter_detach(bdev, &tracker->flt);
+	// XXX: this now manually need to call ->detach
 
 	if (is_frozen) {
 		if (thaw_bdev(bdev))
@@ -235,10 +242,7 @@ static int tracker_filter_detach(struct block_device *bdev)
 				 MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
 	}
 
-	if (ret)
-		pr_err("Failed to detach filter from device [%u:%u]\n",
-		       MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
-	return ret;
+	return 0;
 }
 
 static struct tracker *tracker_new(struct block_device *bdev)
@@ -257,7 +261,7 @@ static struct tracker *tracker_new(struct block_device *bdev)
 	memory_object_inc(memory_object_tracker);
 #endif
 	refcount_inc(&trackers_counter);
-	bdev_filter_init(&tracker->flt, &tracker_fops);
+	tracker->flt.ops = &tracker_fops;
 	INIT_LIST_HEAD(&tracker->link);
 	atomic_set(&tracker->snapshot_is_taken, false);
 	tracker->dev_id = bdev->bd_dev;
@@ -447,7 +451,8 @@ struct tracker *tracker_create_or_get(dev_t dev_id)
 		 * a ref counter value of 2. This allows not to detach
 		 * the filter when the snapshot is released.
 		 */
-		bdev_filter_get(&tracker->flt);
+//		XXX: do manual refcouting here
+//		bdev_filter_get(&tracker->flt);
 
 		spin_lock(&tracked_device_lock);
 		list_add_tail(&tr_dev->link, &tracked_device_list);
@@ -497,7 +502,7 @@ int tracker_remove(dev_t dev_id)
 		goto put_tracker;
 	}
 
-	ret = tracker_filter_detach(bdev);
+	ret = tracker_filter_detach(bdev, tracker);
 	if (ret)
 		pr_err("Failed to remove tracker from device [%u:%u]\n",
 		       MAJOR(dev_id), MINOR(dev_id));
diff --git a/drivers/block/blksnap/tracker.h b/drivers/block/blksnap/tracker.h
index a9b0bec7b6016..9567e06a646c6 100644
--- a/drivers/block/blksnap/tracker.h
+++ b/drivers/block/blksnap/tracker.h
@@ -5,6 +5,7 @@
 #include <linux/list.h>
 #include <linux/rwsem.h>
 #include <linux/blkdev.h>
+#include <linux/blk-filter.h>
 #include <linux/fs.h>
 
 struct cbt_map;
@@ -48,8 +49,9 @@ void tracker_unlock(void);
 
 static inline void tracker_put(struct tracker *tracker)
 {
-	if (likely(tracker))
-		bdev_filter_put(&tracker->flt);
+//	XXX: do manual refcountig here
+//	if (likely(tracker))
+//		bdev_filter_put(&tracker->flt);
 };
 
 struct tracker *tracker_get_by_dev(struct block_device *bdev);
diff --git a/include/linux/blk-filter.h b/include/linux/blk-filter.h
new file mode 100644
index 0000000000000..868be7be0b789
--- /dev/null
+++ b/include/linux/blk-filter.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_BLK_FILTER_H
+#define _LINUX_BLK_FILTER_H
+
+struct bio;
+struct block_device;
+struct bdev_filter;
+
+enum bdev_filter_result {
+	/* Normal bio submission continues: */
+	BIO_FILTER_PASS,
+	/* Bio is consumed by the filter driver: */
+	BIO_FILTER_SKIP,
+	/* Call into the filter driver again: */
+	BIO_FILTER_REPEAT,
+};
+
+struct bdev_filter_operations {
+	const char *name;
+	/* called from submit_bio() */
+	enum bdev_filter_result (*submit_bio)(struct bio *bio,
+			struct bdev_filter *flt);
+	/* called when the block device goes away */
+	void (*detach)(struct bdev_filter *flt);
+};
+
+struct bdev_filter {
+	const struct bdev_filter_operations *ops;
+};
+
+int bdev_filter_attach(struct block_device *bdev, struct bdev_filter *flt);
+void bdev_filter_detach(struct block_device *bdev, struct bdev_filter *flt);
+
+#endif /* _LINUX_BLK_FILTER_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b88f506ea59e1..a8b3833dc1240 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -37,23 +37,6 @@ struct bio_crypt_ctx;
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 #define SECTOR_MASK		(PAGE_SECTORS - 1)
 
-#ifdef CONFIG_BLK_FILTER
-/**
- * enum bdev_filter_altitudes - Set of reserved altitudes for block device
- *	filters.
- *
- * @bdev_filter_alt_blksnap:
- *	An altitude for the blksnap module.
- * @bdev_filter_alt_end:
- *	Indicates the end of the altitude set.
- */
-enum bdev_filter_altitudes {
-	bdev_filter_alt_blksnap = 0,
-	bdev_filter_alt_end
-};
-struct bdev_filter;
-#endif
-
 struct block_device {
 	sector_t		bd_start_sect;
 	sector_t		bd_nr_sectors;
@@ -74,6 +57,7 @@ struct block_device {
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
 	struct gendisk *	bd_disk;
 	struct request_queue *	bd_queue;
+	struct bdev_filter	*bd_filter;
 
 	/* The counter of freeze processes */
 	int			bd_fsfreeze_count;
@@ -85,10 +69,6 @@ struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
-#ifdef CONFIG_BLK_FILTER
-	struct bdev_filter	*bd_filters[bdev_filter_alt_end];
-	spinlock_t		bd_filters_lock;
-#endif
 } __randomize_layout;
 
 #define bdev_whole(_bdev) \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b3705b9a95167..b9a94c53c6cd3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1559,85 +1559,4 @@ struct io_comp_batch {
 
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
-#ifdef CONFIG_BLK_FILTER
-/**
- * enum bdev_filter_result - The result of bio processing by
- *	the block device filter.
- *
- * @bdev_filter_skip:
- *	Original bio does not need to be submitted.
- * @bdev_filter_pass:
- *	It is necessary to submit the original request.
- * @bdev_filter_repeat:
- *	Bio processing has not been completed, a second call is required.
- * @bdev_filter_redirect:
- *	Original bio was redirected to another block device. The set
- *	of filters on it is different, so processing must be repeated.
- */
-enum bdev_filter_result {
-	bdev_filter_skip = 0,
-	bdev_filter_pass,
-	bdev_filter_repeat,
-	bdev_filter_redirect
-};
-struct bdev_filter;
-/**
- * bdev_filter_operations - List of callback functions for the filter.
- *
- * @submit_bio_cb:
- *	A callback function for bio processing.
- * @detach_cb:
- *	A callback function to disable the filter when removing a block
- *	device from the system.
- */
-struct bdev_filter_operations {
-	enum bdev_filter_result (*submit_bio_cb)(struct bio *bio,
-						 struct bdev_filter *flt);
-	void (*detach_cb)(struct kref *kref);
-};
-/**
- * struct bdev_filter - Block device filter.
- *
- * @kref:
- *	Kernel reference counter.
- * @fops:
- *	The pointer to &struct bdev_filter_operations with callback
- *	functions for the filter.
- */
-struct bdev_filter {
-	struct kref kref;
-	const struct bdev_filter_operations *fops;
-};
-/**
- * bdev_filter_init - Initialization of the filter structure.
- * @flt:
- *	Pointer to the &struct bdev_filter to be initialized.
- * @fops:
- *	The callback functions for the filter.
- */
-static inline void bdev_filter_init(struct bdev_filter *flt,
-		const struct bdev_filter_operations *fops)
-{
-	kref_init(&flt->kref);
-	flt->fops = fops;
-};
-int bdev_filter_attach(struct block_device *bdev, const char *name,
-		       const enum bdev_filter_altitudes altitude,
-		       struct bdev_filter *flt);
-int bdev_filter_detach(struct block_device *bdev, const char *name,
-		       const enum bdev_filter_altitudes altitude);
-struct bdev_filter *bdev_filter_get_by_altitude(struct block_device *bdev,
-		       const enum bdev_filter_altitudes altitude);
-static inline void bdev_filter_get(struct bdev_filter *flt)
-{
-	kref_get(&flt->kref);
-}
-static inline void bdev_filter_put(struct bdev_filter *flt)
-{
-	if (flt)
-		kref_put(&flt->kref, flt->fops->detach_cb);
-};
-
-#endif
-
 #endif /* _LINUX_BLKDEV_H */
Sergei Shtepa July 7, 2022, 8:26 a.m. UTC | #6
Thank you, Christoph, for your attention to the patch.

I am preparing the next version of the patch. In it, I planned to
simplify the bdev_filer code.
I will make changes in it, in accordance with your comments, and
will add your code and check it on my test labs.

But I'm not sure if using the blk_mq_freeze_queue() is appropriate.
If I understood the code correctly, it is based on the expectation
that the counter q->q_usage_counter will decrease to zero.
To increase it, a blk_queue_enter() is used. And at the time of
calling the filter_bio() in the submit_bio_noacct(), this counter
has not yet been increased. I will double check this and try to
get rid of the bdev->bd_filter_lock.
Christoph Hellwig July 7, 2022, 5:26 p.m. UTC | #7
On Thu, Jul 07, 2022 at 10:26:55AM +0200, Sergei Shtepa wrote:
> Thank you, Christoph, for your attention to the patch.
> 
> I am preparing the next version of the patch. In it, I planned to
> simplify the bdev_filer code.
> I will make changes in it, in accordance with your comments, and
> will add your code and check it on my test labs.
> 
> But I'm not sure if using the blk_mq_freeze_queue() is appropriate.
> If I understood the code correctly, it is based on the expectation
> that the counter q->q_usage_counter will decrease to zero.
> To increase it, a blk_queue_enter() is used. And at the time of
> calling the filter_bio() in the submit_bio_noacct(), this counter
> has not yet been increased. I will double check this and try to
> get rid of the bdev->bd_filter_lock.

Indeed.  For this to work we'd need to call the filter driver
later.  Which is brings up another question:  Is there a real
need to attach the filter driver to the bdev and thus potentially
partition?  The rest of the block layer operates on the whole disk
after the intial partition remapping, and besides allowing the
filter driver to be called under q_usage_counter, this would
also clean up some concepts.  It would probably also allow to
remove the repeat return value over just using submit_bio_noacct
similar to how normal stacking drivers reinject bios.
Sergei Shtepa July 8, 2022, 10:45 a.m. UTC | #8
On 7/7/22 19:26, Christoph Hellwig wrote:
> 
> On Thu, Jul 07, 2022 at 10:26:55AM +0200, Sergei Shtepa wrote:
>> Thank you, Christoph, for your attention to the patch.
>>
>> I am preparing the next version of the patch. In it, I planned to
>> simplify the bdev_filer code.
>> I will make changes in it, in accordance with your comments, and
>> will add your code and check it on my test labs.
>>
>> But I'm not sure if using the blk_mq_freeze_queue() is appropriate.
>> If I understood the code correctly, it is based on the expectation
>> that the counter q->q_usage_counter will decrease to zero.
>> To increase it, a blk_queue_enter() is used. And at the time of
>> calling the filter_bio() in the submit_bio_noacct(), this counter
>> has not yet been increased. I will double check this and try to
>> get rid of the bdev->bd_filter_lock.
> Indeed.  For this to work we'd need to call the filter driver
> later.  Which is brings up another question:  Is there a real
> need to attach the filter driver to the bdev and thus potentially
> partition?  The rest of the block layer operates on the whole disk
> after the intial partition remapping, and besides allowing the
> filter driver to be called under q_usage_counter, this would
> also clean up some concepts.  It would probably also allow to
> remove the repeat return value over just using submit_bio_noacct
> similar to how normal stacking drivers reinject bios.
> 

Thank you Christoph.
This is the most crucial question for the entire patch.
The filtering location sets restrictions for the filter code and
determines its main algorithm.

1. Work at the partition or disk level?
At the user level, programs operate with block devices.
In fact, the "disk" entity makes sense only for the kernel level. 
When the user chooses which block devices to backup and which not,
he operates with mounting points, which are converted into block
devices, partitions. Therefore, it is better to handle bio before
remapping to disk.
If the filtering is performed after remapping, then we will be
forced to apply a filter to the entire disk, or complicate the
filtering algorithm by calculating which range of sectors bio is
addressed to. And if bio is addressed to the partition boundary...
Filtering at the block device level seems to me a simpler solution.
But this is not the biggest problem.

2. Can the filter sleep or postpone bio processing to the worker thread?
The problem is in the implementation of the COW algorithm.
If I send a bio to read a chunk (one bio), and then pass a write bio,
then with some probability I am reading partially overwritten data.
Writing overtakes reading. And flags REQ_SYNC and REQ_PREFLUSH don't help.
Maybe it's a disk driver issue, or a hypervisor, or a NAS, or a RAID,
or maybe normal behavior. I don't know. Although, maybe I'm not working
correctly with flags. I have seen the comments on patch 11/20, but I am
not sure that the fixes will solve this problem.
But because of this, I have to postpone the write until the read completes.

2.1 The easiest way to solve the problem is to block the writer's thread
with a semaphore. And for bio with a flag REQ_NOWAIT, complete processing
with bio_wouldblock_error(). This is the solution currently being used.

2.2 Another solution is possible without putting the thread into a sleep
state, but with placing a write bio in a queue to another thread.
This solution is used in the veeamsnap out-of-tree module and it has
performance issues. I don't like. But when handling make_request_fn,
which was on kernels before 5.10, there was no choice.

The current implementation, when the filtering is performed before
remapping, allows to handle the bio to the partition, and allows to
switch the writer's thread to the sleep state.
I had to pay for it with a reference counter on the filter and a spinlock.
It may be possible to do better with RCU. I haven't tried it yet.

If I am blocked by the q->q_usage_counter counter, then I will not
be able to execute COW in the context of the current thread due to deadlocks.
I will have to use a scheme with an additional worker thread.
Bio filtering will become much more complicated.

From an architectural point of view, I see the filter as an intermediate
layer between the file system and the block layer. If we lower the filter
deep into the block layer, then restrictions will be imposed on its use.
Christoph Hellwig July 13, 2022, 11:56 a.m. UTC | #9
On Fri, Jul 08, 2022 at 12:45:33PM +0200, Sergei Shtepa wrote:
> 1. Work at the partition or disk level?
> At the user level, programs operate with block devices.
> In fact, the "disk" entity makes sense only for the kernel level. 
> When the user chooses which block devices to backup and which not,
> he operates with mounting points, which are converted into block
> devices, partitions. Therefore, it is better to handle bio before
> remapping to disk.
> If the filtering is performed after remapping, then we will be
> forced to apply a filter to the entire disk, or complicate the
> filtering algorithm by calculating which range of sectors bio is
> addressed to. And if bio is addressed to the partition boundary...
> Filtering at the block device level seems to me a simpler solution.
> But this is not the biggest problem.

Note that bi_bdev stays for the partition things came from.  So we
could still do filtering after blk_partition_remap has been called,
the filter driver just needs to be careful on how to interpret the
sector numbers.

> 2. Can the filter sleep or postpone bio processing to the worker thread?

I think all of te above is fine, just for normal submit_bio based
drivers.

> The problem is in the implementation of the COW algorithm.
> If I send a bio to read a chunk (one bio), and then pass a write bio,
> then with some probability I am reading partially overwritten data.
> Writing overtakes reading. And flags REQ_SYNC and REQ_PREFLUSH don't help.
> Maybe it's a disk driver issue, or a hypervisor, or a NAS, or a RAID,
> or maybe normal behavior. I don't know. Although, maybe I'm not working
> correctly with flags. I have seen the comments on patch 11/20, but I am
> not sure that the fixes will solve this problem.
> But because of this, I have to postpone the write until the read completes.

In the I/O stack there really isn't any ordering.  While a general
reordering looks a bit odd to be, it absolutely it always possible.

> 2.1 The easiest way to solve the problem is to block the writer's thread
> with a semaphore. And for bio with a flag REQ_NOWAIT, complete processing
> with bio_wouldblock_error(). This is the solution currently being used.

This sounds ok.  The other option would be to put the write on hold and
only queue it up from the read completion (or rather a workqueue kicked
off from the read completion).  But this is basically the same, just
without blocking the I/O submitter, so we could do the semaphore first
and optimize later as needed.

> If I am blocked by the q->q_usage_counter counter, then I will not
> be able to execute COW in the context of the current thread due to deadlocks.
> I will have to use a scheme with an additional worker thread.
> Bio filtering will become much more complicated.

q_usage_counter itself doesn't really block you from doing anything.
You can still sleep inside of it, and most driver do that.
Sergei Shtepa July 13, 2022, 1:47 p.m. UTC | #10
On 7/13/22 13:56, Christoph Hellwig wrote:
> 
> On Fri, Jul 08, 2022 at 12:45:33PM +0200, Sergei Shtepa wrote:
>> 1. Work at the partition or disk level?
>> At the user level, programs operate with block devices.
>> In fact, the "disk" entity makes sense only for the kernel level. 
>> When the user chooses which block devices to backup and which not,
>> he operates with mounting points, which are converted into block
>> devices, partitions. Therefore, it is better to handle bio before
>> remapping to disk.
>> If the filtering is performed after remapping, then we will be
>> forced to apply a filter to the entire disk, or complicate the
>> filtering algorithm by calculating which range of sectors bio is
>> addressed to. And if bio is addressed to the partition boundary...
>> Filtering at the block device level seems to me a simpler solution.
>> But this is not the biggest problem.
> Note that bi_bdev stays for the partition things came from.  So we
> could still do filtering after blk_partition_remap has been called,
> the filter driver just needs to be careful on how to interpret the
> sector numbers.

Thanks. I'll check it out.

> 
>> 2. Can the filter sleep or postpone bio processing to the worker thread?
> I think all of te above is fine, just for normal submit_bio based
> drivers.
 
Good. But I'm starting to think that for request-based block devices,
filtering should be different. I need to check it out.

>> The problem is in the implementation of the COW algorithm.
>> If I send a bio to read a chunk (one bio), and then pass a write bio,
>> then with some probability I am reading partially overwritten data.
>> Writing overtakes reading. And flags REQ_SYNC and REQ_PREFLUSH don't help.
>> Maybe it's a disk driver issue, or a hypervisor, or a NAS, or a RAID,
>> or maybe normal behavior. I don't know. Although, maybe I'm not working
>> correctly with flags. I have seen the comments on patch 11/20, but I am
>> not sure that the fixes will solve this problem.
>> But because of this, I have to postpone the write until the read completes.
> In the I/O stack there really isn't any ordering.  While a general
> reordering looks a bit odd to be, it absolutely it always possible.
> 

Thank you!
So this is normal behavior and locking the writing is necessary.
When designing the module, I mistakenly thought that it would be enough
to set the correct order of sending bios.

>> 2.1 The easiest way to solve the problem is to block the writer's thread
>> with a semaphore. And for bio with a flag REQ_NOWAIT, complete processing
>> with bio_wouldblock_error(). This is the solution currently being used.
> This sounds ok.  The other option would be to put the write on hold and
> only queue it up from the read completion (or rather a workqueue kicked
> off from the read completion).  But this is basically the same, just
> without blocking the I/O submitter, so we could do the semaphore first
> and optimize later as needed.
> 
>> If I am blocked by the q->q_usage_counter counter, then I will not
>> be able to execute COW in the context of the current thread due to deadlocks.
>> I will have to use a scheme with an additional worker thread.
>> Bio filtering will become much more complicated.
> q_usage_counter itself doesn't really block you from doing anything.
> You can still sleep inside of it, and most driver do that.
> 
Ok. I will try to lower the handle point under the protection of the
q_usage_counter. Maybe I'm mistaken about deadlocks.

Thank you so much for the review and for the explanatory answers!
I got a lot of useful recommendations.
I have a lot of work to do to improve the patch.
Christoph Hellwig July 14, 2022, 5:12 a.m. UTC | #11
On Wed, Jul 13, 2022 at 03:47:23PM +0200, Sergei Shtepa wrote:
> >> 2. Can the filter sleep or postpone bio processing to the worker thread?
> > I think all of te above is fine, just for normal submit_bio based
> > drivers.
>  
> Good. But I'm starting to think that for request-based block devices,
> filtering should be different. I need to check it out.

As long as you filter in the submit_bio stack you handle both submit_bio
and request based (blk-mq) drivers.  So I don't think we hould need
to handle them any differently.

> I have a lot of work to do to improve the patch.

If you have any questions or want to get feedback on iterations not
ready to post feel free to ask me offlist.
Sergei Shtepa July 14, 2022, 9:22 a.m. UTC | #12
On 7/14/22 07:12, Christoph Hellwig wrote:
> To:
> Sergei Shtepa <sergei.shtepa@veeam.com>
> CC:
> Christoph Hellwig <hch@infradead.org>, "axboe@kernel.dk" <axboe@kernel.dk>, "linux-block@vger.kernel.org" <linux-block@vger.kernel.org>, "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
> 
> 
> On Wed, Jul 13, 2022 at 03:47:23PM +0200, Sergei Shtepa wrote:
>>>> 2. Can the filter sleep or postpone bio processing to the worker thread?
>>> I think all of te above is fine, just for normal submit_bio based
>>> drivers.
>>  
>> Good. But I'm starting to think that for request-based block devices,
>> filtering should be different. I need to check it out.
> As long as you filter in the submit_bio stack you handle both submit_bio
> and request based (blk-mq) drivers.  So I don't think we hould need
> to handle them any differently.
> 
>> I have a lot of work to do to improve the patch.
> If you have any questions or want to get feedback on iterations not
> ready to post feel free to ask me offlist.
> 
Thank you very much.
When a worthy question matures in me, I will take advantage of your offer.
diff mbox series

Patch

diff --git a/block/Kconfig b/block/Kconfig
index 50b17e260fa2..256483e00224 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -225,6 +225,14 @@  config BLK_MQ_RDMA
 config BLK_PM
 	def_bool PM
 
+config BLK_FILTER
+	bool "Enable block device filters"
+	default n
+	help
+	  Enabling this lets the block layer filters handle bio requests.
+	  Kernel modules can use this feature to extend the functionality
+	  of the block layer.
+
 # do not use in new code
 config BLOCK_HOLDER_DEPRECATED
 	bool
diff --git a/block/bdev.c b/block/bdev.c
index 5fe06c1f2def..4bcd9f4378e3 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -426,8 +426,15 @@  static void init_once(void *data)
 	inode_init_once(&ei->vfs_inode);
 }
 
+#ifdef CONFIG_BLK_FILTER
+static void bdev_filter_cleanup(struct block_device *bdev);
+#endif
+
 static void bdev_evict_inode(struct inode *inode)
 {
+#ifdef CONFIG_BLK_FILTER
+	bdev_filter_cleanup(I_BDEV(inode));
+#endif
 	truncate_inode_pages_final(&inode->i_data);
 	invalidate_inode_buffers(inode); /* is it needed here? */
 	clear_inode(inode);
@@ -503,6 +510,11 @@  struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+
+#ifdef CONFIG_BLK_FILTER
+	memset(bdev->bd_filters, 0, sizeof(bdev->bd_filters));
+	spin_lock_init(&bdev->bd_filters_lock);
+#endif
 	return bdev;
 }
 
@@ -1071,3 +1083,120 @@  void sync_bdevs(bool wait)
 	spin_unlock(&blockdev_superblock->s_inode_list_lock);
 	iput(old_inode);
 }
+
+#ifdef CONFIG_BLK_FILTER
+static void bdev_filter_cleanup(struct block_device *bdev)
+{
+	int altitude;
+	struct bdev_filter *flt;
+
+	for (altitude = 0; altitude < bdev_filter_alt_end; altitude++) {
+		spin_lock(&bdev->bd_filters_lock);
+		flt = bdev->bd_filters[altitude];
+		bdev->bd_filters[altitude] = NULL;
+		spin_unlock(&bdev->bd_filters_lock);
+
+		bdev_filter_put(flt);
+	}
+}
+
+/**
+ * bdev_filter_attach - Attach a filter to the original block device.
+ * @bdev:
+ *	Block device.
+ * @name:
+ *	Name of the block device filter.
+ * @altitude:
+ *	Altituda number of the block device filter.
+ * @flt:
+ *	Pointer to the filter structure.
+ *
+ * Before adding a filter, it is necessary to initialize &struct bdev_filter.
+ *
+ * The bdev_filter_detach() function allows to detach the filter from the block
+ * device.
+ *
+ * Return:
+ * 0 - OK
+ * -EALREADY - a filter with this name already exists
+ */
+int bdev_filter_attach(struct block_device *bdev, const char *name,
+		       const enum bdev_filter_altitudes altitude,
+		       struct bdev_filter *flt)
+{
+	int ret = 0;
+
+	spin_lock(&bdev->bd_filters_lock);
+	if (bdev->bd_filters[altitude])
+		ret = -EALREADY;
+	else
+		bdev->bd_filters[altitude] = flt;
+	spin_unlock(&bdev->bd_filters_lock);
+
+	if (!ret)
+		pr_info("block device filter '%s' has been attached to %d:%d",
+			name, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bdev_filter_attach);
+
+/**
+ * bdev_filter_detach - Detach a filter from the block device.
+ * @bdev:
+ *	Block device.
+ * @name:
+ *	Name of the block device filter.
+ * @altitude:
+ *	Altituda number of the block device filter.
+ *
+ * The filter should be added using the bdev_filter_attach() function.
+ *
+ * Return:
+ * 0 - OK
+ * -ENOENT - the filter was not found in the linked list
+ */
+int bdev_filter_detach(struct block_device *bdev, const char *name,
+		       const enum bdev_filter_altitudes altitude)
+{
+	struct bdev_filter *flt = NULL;
+
+	spin_lock(&bdev->bd_filters_lock);
+	flt = bdev->bd_filters[altitude];
+	bdev->bd_filters[altitude] = NULL;
+	spin_unlock(&bdev->bd_filters_lock);
+
+	if (!flt)
+		return -ENOENT;
+
+	bdev_filter_put(flt);
+	pr_info("block device filter '%s' has been detached from %d:%d",
+		name, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_filter_detach);
+
+/**
+ * bdev_filter_get_by_altitude - Get filter by altitude.
+ * @bdev:
+ *	Pointer to the block device structure.
+ *
+ * Return:
+ * pointer - pointer to filters structure from &struct blk_filter
+ * NULL - no filter has been set
+ */
+struct bdev_filter *bdev_filter_get_by_altitude(struct block_device *bdev,
+				const enum bdev_filter_altitudes altitude)
+{
+	struct bdev_filter *flt;
+
+	spin_lock(&bdev->bd_filters_lock);
+	flt = bdev->bd_filters[altitude];
+	if (flt)
+		bdev_filter_get(flt);
+	spin_unlock(&bdev->bd_filters_lock);
+
+	return flt;
+}
+EXPORT_SYMBOL_GPL(bdev_filter_get_by_altitude);
+#endif
diff --git a/block/blk-core.c b/block/blk-core.c
index 06ff5bbfe8f6..a44906fb08aa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -757,6 +757,86 @@  void submit_bio_noacct_nocheck(struct bio *bio)
 		__submit_bio_noacct(bio);
 }
 
+#ifdef CONFIG_BLK_FILTER
+
+/**
+ * __filter_bio() - Process bio by the block device filter.
+ * @flt:
+ *	Block device filter.
+ * @bio:
+ *	Original I/O unit.
+ *
+ * Return:
+ * bdev_filter_pass - original bio should be submitted
+ * bdev_filter_skip - do not submit original bio
+ * bdev_filter_redirect - repeat bio processing for another block device
+ */
+static inline enum bdev_filter_result __filter_bio(struct bdev_filter *flt,
+						   struct bio *bio)
+{
+	enum bdev_filter_result result;
+	struct bio *new_bio;
+	struct bio_list bio_list[2] = { };
+
+	do {
+		bio_list_init(&bio_list[0]);
+		current->bio_list = bio_list;
+
+		result = flt->fops->submit_bio_cb(bio, flt);
+
+		current->bio_list = NULL;
+
+		while ((new_bio = bio_list_pop(&bio_list[0]))) {
+			bio_set_flag(new_bio, BIO_FILTERED);
+			submit_bio_noacct(new_bio);
+		};
+	} while (result == bdev_filter_repeat);
+
+	return result;
+}
+
+/**
+ * filter_bio() - Pass bio to the block device filters.
+ * @bio:
+ *	Original I/O unit.
+ *
+ * Return:
+ * true - original bio should be submitted
+ * false - do not submit original bio
+ */
+static bool filter_bio(struct bio *bio)
+{
+	enum bdev_filter_result result = bdev_filter_pass;
+
+	if (bio_flagged(bio, BIO_FILTERED))
+		return true;
+	do {
+		struct block_device *bdev = bio->bi_bdev;
+		unsigned int altitude = 0;
+
+		while (altitude < bdev_filter_alt_end) {
+			struct bdev_filter *flt;
+
+			spin_lock(&bdev->bd_filters_lock);
+			flt = bdev->bd_filters[altitude];
+			if (flt)
+				bdev_filter_get(flt);
+			spin_unlock(&bdev->bd_filters_lock);
+
+			if (flt) {
+				result = __filter_bio(flt, bio);
+				bdev_filter_put(flt);
+				if (result != bdev_filter_pass)
+					break;
+			}
+			altitude++;
+		}
+	} while (result == bdev_filter_redirect);
+
+	return (result == bdev_filter_pass);
+}
+#endif
+
 /**
  * submit_bio_noacct - re-submit a bio to the block device layer for I/O
  * @bio:  The bio describing the location in memory and on the device.
@@ -790,6 +870,14 @@  void submit_bio_noacct(struct bio *bio)
 		goto end_io;
 	if (unlikely(bio_check_ro(bio)))
 		goto end_io;
+#ifdef CONFIG_BLK_FILTER
+	/*
+	 * It looks like should_fail_bio() and bio_check_ro() can be placed
+	 * in a separate block device filter for debugging.
+	 */
+	if (!filter_bio(bio))
+		goto end_io;
+#endif
 	if (!bio_flagged(bio, BIO_REMAPPED)) {
 		if (unlikely(bio_check_eod(bio)))
 			goto end_io;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a24d4078fb21..b88f506ea59e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -37,6 +37,23 @@  struct bio_crypt_ctx;
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 #define SECTOR_MASK		(PAGE_SECTORS - 1)
 
+#ifdef CONFIG_BLK_FILTER
+/**
+ * enum bdev_filter_altitudes - Set of reserved altitudes for block device
+ *	filters.
+ *
+ * @bdev_filter_alt_blksnap:
+ *	An altitude for the blksnap module.
+ * @bdev_filter_alt_end:
+ *	Indicates the end of the altitude set.
+ */
+enum bdev_filter_altitudes {
+	bdev_filter_alt_blksnap = 0,
+	bdev_filter_alt_end
+};
+struct bdev_filter;
+#endif
+
 struct block_device {
 	sector_t		bd_start_sect;
 	sector_t		bd_nr_sectors;
@@ -68,6 +85,10 @@  struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
+#ifdef CONFIG_BLK_FILTER
+	struct bdev_filter	*bd_filters[bdev_filter_alt_end];
+	spinlock_t		bd_filters_lock;
+#endif
 } __randomize_layout;
 
 #define bdev_whole(_bdev) \
@@ -332,6 +353,7 @@  enum {
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
+	BIO_FILTERED,		/* bio has already been filtered */
 	BIO_FLAG_LAST
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 608d577734c2..24cb5293897f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1573,4 +1573,85 @@  struct io_comp_batch {
 
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
+#ifdef CONFIG_BLK_FILTER
+/**
+ * enum bdev_filter_result - The result of bio processing by
+ *	the block device filter.
+ *
+ * @bdev_filter_skip:
+ *	Original bio does not need to be submitted.
+ * @bdev_filter_pass:
+ *	It is necessary to submit the original request.
+ * @bdev_filter_repeat:
+ *	Bio processing has not been completed, a second call is required.
+ * @bdev_filter_redirect:
+ *	Original bio was redirected to another block device. The set
+ *	of filters on it is different, so processing must be repeated.
+ */
+enum bdev_filter_result {
+	bdev_filter_skip = 0,
+	bdev_filter_pass,
+	bdev_filter_repeat,
+	bdev_filter_redirect
+};
+struct bdev_filter;
+/**
+ * bdev_filter_operations - List of callback functions for the filter.
+ *
+ * @submit_bio_cb:
+ *	A callback function for bio processing.
+ * @detach_cb:
+ *	A callback function to disable the filter when removing a block
+ *	device from the system.
+ */
+struct bdev_filter_operations {
+	enum bdev_filter_result (*submit_bio_cb)(struct bio *bio,
+						 struct bdev_filter *flt);
+	void (*detach_cb)(struct kref *kref);
+};
+/**
+ * struct bdev_filter - Block device filter.
+ *
+ * @kref:
+ *	Kernel reference counter.
+ * @fops:
+ *	The pointer to &struct bdev_filter_operations with callback
+ *	functions for the filter.
+ */
+struct bdev_filter {
+	struct kref kref;
+	const struct bdev_filter_operations *fops;
+};
+/**
+ * bdev_filter_init - Initialization of the filter structure.
+ * @flt:
+ *	Pointer to the &struct bdev_filter to be initialized.
+ * @fops:
+ *	The callback functions for the filter.
+ */
+static inline void bdev_filter_init(struct bdev_filter *flt,
+		const struct bdev_filter_operations *fops)
+{
+	kref_init(&flt->kref);
+	flt->fops = fops;
+};
+int bdev_filter_attach(struct block_device *bdev, const char *name,
+		       const enum bdev_filter_altitudes altitude,
+		       struct bdev_filter *flt);
+int bdev_filter_detach(struct block_device *bdev, const char *name,
+		       const enum bdev_filter_altitudes altitude);
+struct bdev_filter *bdev_filter_get_by_altitude(struct block_device *bdev,
+		       const enum bdev_filter_altitudes altitude);
+static inline void bdev_filter_get(struct bdev_filter *flt)
+{
+	kref_get(&flt->kref);
+}
+static inline void bdev_filter_put(struct bdev_filter *flt)
+{
+	if (flt)
+		kref_put(&flt->kref, flt->fops->detach_cb);
+};
+
+#endif
+
 #endif /* _LINUX_BLKDEV_H */