diff mbox series

[1/2] block: blk_interposer

Message ID 1611853955-32167-2-git-send-email-sergei.shtepa@veeam.com (mailing list archive)
State Superseded, archived
Delegated to: Mike Snitzer
Headers show
Series block: blk_interposer v3 | expand

Commit Message

Sergei Shtepa Jan. 28, 2021, 5:12 p.m. UTC
The block layer interposer allows to intercept bio requests.
This allows to connect device mapper and other kernel modules
to the block device stack on the fly.

changes:
  * new BIO_INTERPOSED bio flag.
  * new function __submit_bio_interposed() implements the interposers
    logic.
  * new function blk_mq_is_queue_frozen() allow to assert that
    the queue is frozen.
  * functions blk_interposer_attach() and blk_interposer_detach()
    allow to attach and detach interposers.
Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
---
 block/bio.c               |  2 +
 block/blk-core.c          | 29 ++++++++++++++
 block/blk-mq.c            | 13 +++++++
 block/genhd.c             | 82 +++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h    |  1 +
 include/linux/blk_types.h |  6 ++-
 include/linux/genhd.h     | 19 +++++++++
 7 files changed, 150 insertions(+), 2 deletions(-)

Comments

Damien Le Moal Jan. 29, 2021, 1:21 a.m. UTC | #1
On 2021/01/29 2:23, Sergei Shtepa wrote:
> The block layer interposer allows to intercept bio requests.
> This allows to connect device mapper and other kernel modules
> to the block device stack on the fly.
> 
> changes:
>   * new BIO_INTERPOSED bio flag.
>   * new function __submit_bio_interposed() implements the interposers
>     logic.
>   * new function blk_mq_is_queue_frozen() allow to assert that
>     the queue is frozen.
>   * functions blk_interposer_attach() and blk_interposer_detach()
>     allow to attach and detach interposers.

The changelog should not be part of the commit message. If you need a changelog
for a single patch, add it between the commit message end "---" and the patch
stats. git will ignore that part.

> Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
> ---
>  block/bio.c               |  2 +
>  block/blk-core.c          | 29 ++++++++++++++
>  block/blk-mq.c            | 13 +++++++
>  block/genhd.c             | 82 +++++++++++++++++++++++++++++++++++++++
>  include/linux/blk-mq.h    |  1 +
>  include/linux/blk_types.h |  6 ++-
>  include/linux/genhd.h     | 19 +++++++++
>  7 files changed, 150 insertions(+), 2 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 1f2cc1fbe283..f6f135eb84b5 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -684,6 +684,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
>  	bio_set_flag(bio, BIO_CLONED);
>  	if (bio_flagged(bio_src, BIO_THROTTLED))
>  		bio_set_flag(bio, BIO_THROTTLED);
> +	if (bio_flagged(bio_src, BIO_INTERPOSED))
> +		bio_set_flag(bio, BIO_INTERPOSED);
>  	bio->bi_opf = bio_src->bi_opf;
>  	bio->bi_ioprio = bio_src->bi_ioprio;
>  	bio->bi_write_hint = bio_src->bi_write_hint;
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 7663a9b94b80..07ec82d8fe57 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1032,6 +1032,32 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>  	return ret;
>  }
>  
> +static blk_qc_t __submit_bio_interposed(struct bio *bio)
> +{
> +	struct bio_list bio_list[2] = { };
> +	blk_qc_t ret = BLK_QC_T_NONE;
> +
> +	current->bio_list = bio_list;
> +	if (likely(bio_queue_enter(bio) == 0)) {
> +		struct gendisk *disk = bio->bi_disk;
> +
> +		bio_set_flag(bio, BIO_INTERPOSED);
> +		if (likely(blk_has_interposer(disk)))
> +			disk->interposer->ip_submit_bio(bio);

Why do you check again blk_has_interposer() here ? That is checked already in
submit_bio_noacct() and the interposer attach/detach cannot run without the
queue frozen. So I do not think you need to check again. If you do, then you
definitely have a race condition here.

> +		else /* interposer was removed */
> +			bio_list_add(&current->bio_list[0], bio);
> +
> +		blk_queue_exit(disk->queue);
> +	}
> +	current->bio_list = NULL;
> +
> +	/* Resubmit remaining bios */
> +	while ((bio = bio_list_pop(&bio_list[0])))
> +		ret = submit_bio_noacct(bio);
> +
> +	return ret;
> +}
> +
>  /**
>   * submit_bio_noacct - re-submit a bio to the block device layer for I/O
>   * @bio:  The bio describing the location in memory and on the device.
> @@ -1057,6 +1083,9 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
>  		return BLK_QC_T_NONE;
>  	}
>  
> +	if (blk_has_interposer(bio->bi_disk) &&
> +	    !bio_flagged(bio, BIO_INTERPOSED))
> +		return __submit_bio_interposed(bio);

OK. I *think* that this is to handle stacked devices, right ? Otherwise, this
condition does not make much sense. Why not just:

	if (blk_has_interposer(bio->bi_disk))
		return __submit_bio_interposed(bio);

So at least adding some comments here may be good.

>  	if (!bio->bi_disk->fops->submit_bio)
>  		return __submit_bio_noacct_mq(bio);
>  	return __submit_bio_noacct(bio);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index f285a9123a8b..924ec26fae5f 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -161,6 +161,19 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
>  
> +
> +bool blk_mq_is_queue_frozen(struct request_queue *q)
> +{
> +	bool ret;
> +
> +	mutex_lock(&q->mq_freeze_lock);
> +	ret = percpu_ref_is_dying(&q->q_usage_counter) && percpu_ref_is_zero(&q->q_usage_counter);
> +	mutex_unlock(&q->mq_freeze_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_is_queue_frozen);

Maybe move this change to its own patch preceding this one ?

> +
>  /*
>   * Guarantee no request is in use, so we can change any data structure of
>   * the queue afterward.
> diff --git a/block/genhd.c b/block/genhd.c
> index 419548e92d82..d3459582f56c 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -30,6 +30,7 @@
>  static struct kobject *block_depr;
>  
>  DECLARE_RWSEM(bdev_lookup_sem);
> +DEFINE_MUTEX(bdev_interposer_mutex);
>  
>  /* for extended dynamic devt allocation, currently only one major is used */
>  #define NR_EXT_DEVT		(1 << MINORBITS)
> @@ -2148,3 +2149,84 @@ static void disk_release_events(struct gendisk *disk)
>  	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
>  	kfree(disk->ev);
>  }
> +
> +/**
> + * blk_interposer_attach - Attach interposer to disk
> + * @disk: target disk
> + * @interposer: block device interposer
> + * @ip_submit_bio: hook for submit_bio()
> + *
> + * Returns:
> + *     -EINVAL if @interposer is NULL.
> + *     -EPERM if queue is not frozen.
> + *     -EBUSY if the block device already has @interposer.
> + *     -EALREADY if the block device already has @interposer with same callback.
> + *
> + * Disk must be frozen by blk_mq_freeze_queue().
> + */
> +int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer,
> +			  const ip_submit_bio_t ip_submit_bio)
> +{
> +	int ret = 0;
> +
> +	if (!interposer)
> +		return -EINVAL;

Is this really necessary ? If some user of this function has interposer == NULL,
that caller needs debugging...

> +
> +	if (!blk_mq_is_queue_frozen(disk->queue))
> +		return -EPERM;

Why not do the queue freeze here ?

> +
> +	mutex_lock(&bdev_interposer_mutex);
> +	if (blk_has_interposer(disk)) {
> +		if (disk->interposer->ip_submit_bio == ip_submit_bio)
> +			ret = -EALREADY;
> +		else
> +			ret = -EBUSY;
> +		goto out;
> +	}
> +
> +	interposer->ip_submit_bio = ip_submit_bio;
> +	interposer->disk = disk;
> +
> +	disk->interposer = interposer;
> +out:
> +	mutex_unlock(&bdev_interposer_mutex);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(blk_interposer_attach);
> +
> +/**
> + * blk_interposer_detach - Detach interposer from disk
> + * @interposer: block device interposer
> + * @ip_submit_bio: hook for submit_bio()
> + *
> + * Disk must be frozen by blk_mq_freeze_queue().
> + */
> +void blk_interposer_detach(struct blk_interposer *interposer,
> +			  const ip_submit_bio_t ip_submit_bio)
> +{

The interface is weird. Why not passing the gendisk ?

> +	struct gendisk *disk;
> +
> +	if (WARN_ON(!interposer))
> +		return;

Same comment as above. This should not be necessary.

> +
> +	mutex_lock(&bdev_interposer_mutex);
> +
> +	/* Check if the interposer is still active. */
> +	disk = interposer->disk;
> +	if (WARN_ON(!disk))
> +		goto out;
> +
> +	if (WARN_ON(!blk_mq_is_queue_frozen(disk->queue)))
> +		goto out;
> +
> +	/* Check if it is really our interposer. */
> +	if (WARN_ON(disk->interposer->ip_submit_bio != ip_submit_bio))
> +		goto out;
> +
> +	disk->interposer = NULL;
> +	interposer->disk = NULL;
> +out:
> +	mutex_unlock(&bdev_interposer_mutex);
> +}
> +EXPORT_SYMBOL_GPL(blk_interposer_detach);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index d705b174d346..9d1e8c4e922e 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -525,6 +525,7 @@ void blk_freeze_queue_start(struct request_queue *q);
>  void blk_mq_freeze_queue_wait(struct request_queue *q);
>  int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
>  				     unsigned long timeout);
> +bool blk_mq_is_queue_frozen(struct request_queue *q);
>  
>  int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
>  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 866f74261b3b..6c1351d7b73f 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -227,7 +227,7 @@ struct bio {
>  						 * top bits REQ_OP. Use
>  						 * accessors.
>  						 */
> -	unsigned short		bi_flags;	/* status, etc and bvec pool number */
> +	unsigned int		bi_flags;	/* status, etc and bvec pool number */
>  	unsigned short		bi_ioprio;
>  	unsigned short		bi_write_hint;
>  	blk_status_t		bi_status;
> @@ -304,6 +304,8 @@ enum {
>  				 * of this bio. */
>  	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
>  	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
> +	BIO_INTERPOSED,		/* bio has been interposed and can be moved to
> +				 * a different disk */
>  	BIO_FLAG_LAST
>  };
>  
> @@ -322,7 +324,7 @@ enum {
>   * freed.
>   */
>  #define BVEC_POOL_BITS		(3)
> -#define BVEC_POOL_OFFSET	(16 - BVEC_POOL_BITS)
> +#define BVEC_POOL_OFFSET	(32 - BVEC_POOL_BITS)
>  #define BVEC_POOL_IDX(bio)	((bio)->bi_flags >> BVEC_POOL_OFFSET)
>  #if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
>  # error "BVEC_POOL_BITS is too small"
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 809aaa32d53c..8094af3a3db9 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -134,6 +134,14 @@ struct blk_integrity {
>  	unsigned char				tag_size;
>  };
>  
> +struct blk_interposer;

This is not needed.

> +typedef void (*ip_submit_bio_t) (struct bio *bio);

This hides the definition of the submit callback for no good reasons that I can
see. Since the callback has a simple interface, I would prefer this to be dropped.

> +
> +struct blk_interposer {
> +	ip_submit_bio_t ip_submit_bio;
> +	struct gendisk *disk;

If you fix the interface of the detach function, you should not need this field.

> +};
> +
>  struct gendisk {
>  	/* major, first_minor and minors are input parameters only,
>  	 * don't use directly.  Use disk_devt() and disk_max_parts().
> @@ -158,6 +166,7 @@ struct gendisk {
>  
>  	const struct block_device_operations *fops;
>  	struct request_queue *queue;
> +	struct blk_interposer *interposer;
>  	void *private_data;
>  
>  	int flags;
> @@ -346,4 +355,14 @@ static inline void printk_all_partitions(void)
>  }
>  #endif /* CONFIG_BLOCK */
>  
> +/*
> + * block layer interposer
> + */
> +#define blk_has_interposer(d) ((d)->interposer != NULL)

Please make this an inline function.

> +
> +int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer,
> +			  const ip_submit_bio_t ip_submit_bio);
> +void blk_interposer_detach(struct blk_interposer *interposer,
> +			   const ip_submit_bio_t ip_submit_bio);
> +
>  #endif /* _LINUX_GENHD_H */
>
Sergei Shtepa Jan. 29, 2021, 3:58 p.m. UTC | #2
The 01/29/2021 04:21, Damien Le Moal wrote:
> On 2021/01/29 2:23, Sergei Shtepa wrote:
> > The block layer interposer allows to intercept bio requests.
> > This allows to connect device mapper and other kernel modules
> > to the block device stack on the fly.
> > 
> > changes:
> >   * new BIO_INTERPOSED bio flag.
> >   * new function __submit_bio_interposed() implements the interposers
> >     logic.
> >   * new function blk_mq_is_queue_frozen() allow to assert that
> >     the queue is frozen.
> >   * functions blk_interposer_attach() and blk_interposer_detach()
> >     allow to attach and detach interposers.
> 
> The changelog should not be part of the commit message. If you need a changelog
> for a single patch, add it between the commit message end "---" and the patch
> stats. git will ignore that part.
> 
> > Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
> > ---
> >  block/bio.c               |  2 +
> >  block/blk-core.c          | 29 ++++++++++++++
> >  block/blk-mq.c            | 13 +++++++
> >  block/genhd.c             | 82 +++++++++++++++++++++++++++++++++++++++
> >  include/linux/blk-mq.h    |  1 +
> >  include/linux/blk_types.h |  6 ++-
> >  include/linux/genhd.h     | 19 +++++++++
> >  7 files changed, 150 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/bio.c b/block/bio.c
> > index 1f2cc1fbe283..f6f135eb84b5 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -684,6 +684,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
> >  	bio_set_flag(bio, BIO_CLONED);
> >  	if (bio_flagged(bio_src, BIO_THROTTLED))
> >  		bio_set_flag(bio, BIO_THROTTLED);
> > +	if (bio_flagged(bio_src, BIO_INTERPOSED))
> > +		bio_set_flag(bio, BIO_INTERPOSED);
> >  	bio->bi_opf = bio_src->bi_opf;
> >  	bio->bi_ioprio = bio_src->bi_ioprio;
> >  	bio->bi_write_hint = bio_src->bi_write_hint;
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 7663a9b94b80..07ec82d8fe57 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -1032,6 +1032,32 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> >  	return ret;
> >  }
> >  
> > +static blk_qc_t __submit_bio_interposed(struct bio *bio)
> > +{
> > +	struct bio_list bio_list[2] = { };
> > +	blk_qc_t ret = BLK_QC_T_NONE;
> > +
> > +	current->bio_list = bio_list;
> > +	if (likely(bio_queue_enter(bio) == 0)) {
> > +		struct gendisk *disk = bio->bi_disk;
> > +
> > +		bio_set_flag(bio, BIO_INTERPOSED);
> > +		if (likely(blk_has_interposer(disk)))
> > +			disk->interposer->ip_submit_bio(bio);
> 
> Why do you check again blk_has_interposer() here ? That is checked already in
> submit_bio_noacct() and the interposer attach/detach cannot run without the
> queue frozen. So I do not think you need to check again. If you do, then you
> definitely have a race condition here.

There is no problem here.
The first call to blk_has_interposer() allows not to call the __submit_bio_interposed()
function if the interposer is not specified.
Changing the disk->interpreter field is blocked by the bio_queue_enter() and
blk_queue_exit() functions. Therefore, a second check of blk_has_interposer()
is needed after calling bio_queue_enter().
But I think setting the BIO_INTERPOSED flag should be moved below, just before
calling ip_submit_bio().

> 
> > +		else /* interposer was removed */
> > +			bio_list_add(&current->bio_list[0], bio);
> > +
> > +		blk_queue_exit(disk->queue);
> > +	}
> > +	current->bio_list = NULL;
> > +
> > +	/* Resubmit remaining bios */
> > +	while ((bio = bio_list_pop(&bio_list[0])))
> > +		ret = submit_bio_noacct(bio);
> > +
> > +	return ret;
> > +}
> > +
> >  /**
> >   * submit_bio_noacct - re-submit a bio to the block device layer for I/O
> >   * @bio:  The bio describing the location in memory and on the device.
> > @@ -1057,6 +1083,9 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
> >  		return BLK_QC_T_NONE;
> >  	}
> >  
> > +	if (blk_has_interposer(bio->bi_disk) &&
> > +	    !bio_flagged(bio, BIO_INTERPOSED))
> > +		return __submit_bio_interposed(bio);
> 
> OK. I *think* that this is to handle stacked devices, right ? Otherwise, this
> condition does not make much sense. Why not just:
> 
> 	if (blk_has_interposer(bio->bi_disk))
> 		return __submit_bio_interposed(bio);
> 
> So at least adding some comments here may be good.

Checking the BIO_INTERPOSED flag is necessary here.
The device mapper can re-send an intercepted request, or generate new ones
that should not be intercepted.
I'll think about a comment for this line.

> 
> >  	if (!bio->bi_disk->fops->submit_bio)
> >  		return __submit_bio_noacct_mq(bio);
> >  	return __submit_bio_noacct(bio);
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index f285a9123a8b..924ec26fae5f 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -161,6 +161,19 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
> >  }
> >  EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
> >  
> > +
> > +bool blk_mq_is_queue_frozen(struct request_queue *q)
> > +{
> > +	bool ret;
> > +
> > +	mutex_lock(&q->mq_freeze_lock);
> > +	ret = percpu_ref_is_dying(&q->q_usage_counter) && percpu_ref_is_zero(&q->q_usage_counter);
> > +	mutex_unlock(&q->mq_freeze_lock);
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(blk_mq_is_queue_frozen);
> 
> Maybe move this change to its own patch preceding this one ?

Yes, if it makes the review easier.

> 
> > +
> >  /*
> >   * Guarantee no request is in use, so we can change any data structure of
> >   * the queue afterward.
> > diff --git a/block/genhd.c b/block/genhd.c
> > index 419548e92d82..d3459582f56c 100644
> > --- a/block/genhd.c
> > +++ b/block/genhd.c
> > @@ -30,6 +30,7 @@
> >  static struct kobject *block_depr;
> >  
> >  DECLARE_RWSEM(bdev_lookup_sem);
> > +DEFINE_MUTEX(bdev_interposer_mutex);
> >  
> >  /* for extended dynamic devt allocation, currently only one major is used */
> >  #define NR_EXT_DEVT		(1 << MINORBITS)
> > @@ -2148,3 +2149,84 @@ static void disk_release_events(struct gendisk *disk)
> >  	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
> >  	kfree(disk->ev);
> >  }
> > +
> > +/**
> > + * blk_interposer_attach - Attach interposer to disk
> > + * @disk: target disk
> > + * @interposer: block device interposer
> > + * @ip_submit_bio: hook for submit_bio()
> > + *
> > + * Returns:
> > + *     -EINVAL if @interposer is NULL.
> > + *     -EPERM if queue is not frozen.
> > + *     -EBUSY if the block device already has @interposer.
> > + *     -EALREADY if the block device already has @interposer with same callback.
> > + *
> > + * Disk must be frozen by blk_mq_freeze_queue().
> > + */
> > +int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer,
> > +			  const ip_submit_bio_t ip_submit_bio)
> > +{
> > +	int ret = 0;
> > +
> > +	if (!interposer)
> > +		return -EINVAL;
> 
> Is this really necessary ? If some user of this function has interposer == NULL,
> that caller needs debugging...

Right. I`ll add WARN_ON() at this place for debugging, but avoid the kernel drop.

> 
> > +
> > +	if (!blk_mq_is_queue_frozen(disk->queue))
> > +		return -EPERM;
> 
> Why not do the queue freeze here ?

In the very first version, the freeze was here. But the state when the queue is frozen
is also very convenient for executing other code. In order not to create new locks,
the freeze was placed in dm_interposer_attach_dev().

> 
> > +
> > +	mutex_lock(&bdev_interposer_mutex);
> > +	if (blk_has_interposer(disk)) {
> > +		if (disk->interposer->ip_submit_bio == ip_submit_bio)
> > +			ret = -EALREADY;
> > +		else
> > +			ret = -EBUSY;
> > +		goto out;
> > +	}
> > +
> > +	interposer->ip_submit_bio = ip_submit_bio;
> > +	interposer->disk = disk;
> > +
> > +	disk->interposer = interposer;
> > +out:
> > +	mutex_unlock(&bdev_interposer_mutex);
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(blk_interposer_attach);
> > +
> > +/**
> > + * blk_interposer_detach - Detach interposer from disk
> > + * @interposer: block device interposer
> > + * @ip_submit_bio: hook for submit_bio()
> > + *
> > + * Disk must be frozen by blk_mq_freeze_queue().
> > + */
> > +void blk_interposer_detach(struct blk_interposer *interposer,
> > +			  const ip_submit_bio_t ip_submit_bio)
> > +{
> 
> The interface is weird. Why not passing the gendisk ?

Passing interposer will be more safety.
The kernel module (device mapper) owns the "interposer" object and passes it
to the function.
In addition, need to reset the interposer->disk. This is a sign that interposer
has been detached.

> 
> > +	struct gendisk *disk;
> > +
> > +	if (WARN_ON(!interposer))
> > +		return;
> 
> Same comment as above. This should not be necessary.

I think it`s necessary in case detach will be called twice.
It allow to avoid the kernel drop.

> 
> > +
> > +	mutex_lock(&bdev_interposer_mutex);
> > +
> > +	/* Check if the interposer is still active. */
> > +	disk = interposer->disk;
> > +	if (WARN_ON(!disk))
> > +		goto out;
> > +
> > +	if (WARN_ON(!blk_mq_is_queue_frozen(disk->queue)))
> > +		goto out;
> > +
> > +	/* Check if it is really our interposer. */
> > +	if (WARN_ON(disk->interposer->ip_submit_bio != ip_submit_bio))
> > +		goto out;
> > +
> > +	disk->interposer = NULL;
> > +	interposer->disk = NULL;
> > +out:
> > +	mutex_unlock(&bdev_interposer_mutex);
> > +}
> > +EXPORT_SYMBOL_GPL(blk_interposer_detach);
> > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> > index d705b174d346..9d1e8c4e922e 100644
> > --- a/include/linux/blk-mq.h
> > +++ b/include/linux/blk-mq.h
> > @@ -525,6 +525,7 @@ void blk_freeze_queue_start(struct request_queue *q);
> >  void blk_mq_freeze_queue_wait(struct request_queue *q);
> >  int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
> >  				     unsigned long timeout);
> > +bool blk_mq_is_queue_frozen(struct request_queue *q);
> >  
> >  int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
> >  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
> > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > index 866f74261b3b..6c1351d7b73f 100644
> > --- a/include/linux/blk_types.h
> > +++ b/include/linux/blk_types.h
> > @@ -227,7 +227,7 @@ struct bio {
> >  						 * top bits REQ_OP. Use
> >  						 * accessors.
> >  						 */
> > -	unsigned short		bi_flags;	/* status, etc and bvec pool number */
> > +	unsigned int		bi_flags;	/* status, etc and bvec pool number */
> >  	unsigned short		bi_ioprio;
> >  	unsigned short		bi_write_hint;
> >  	blk_status_t		bi_status;
> > @@ -304,6 +304,8 @@ enum {
> >  				 * of this bio. */
> >  	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
> >  	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
> > +	BIO_INTERPOSED,		/* bio has been interposed and can be moved to
> > +				 * a different disk */
> >  	BIO_FLAG_LAST
> >  };
> >  
> > @@ -322,7 +324,7 @@ enum {
> >   * freed.
> >   */
> >  #define BVEC_POOL_BITS		(3)
> > -#define BVEC_POOL_OFFSET	(16 - BVEC_POOL_BITS)
> > +#define BVEC_POOL_OFFSET	(32 - BVEC_POOL_BITS)
> >  #define BVEC_POOL_IDX(bio)	((bio)->bi_flags >> BVEC_POOL_OFFSET)
> >  #if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
> >  # error "BVEC_POOL_BITS is too small"
> > diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> > index 809aaa32d53c..8094af3a3db9 100644
> > --- a/include/linux/genhd.h
> > +++ b/include/linux/genhd.h
> > @@ -134,6 +134,14 @@ struct blk_integrity {
> >  	unsigned char				tag_size;
> >  };
> >  
> > +struct blk_interposer;
> 
> This is not needed.

Rigth.

> 
> > +typedef void (*ip_submit_bio_t) (struct bio *bio);
> 
> This hides the definition of the submit callback for no good reasons that I can
> see. Since the callback has a simple interface, I would prefer this to be dropped.

May be. But the ip_submit_bio_t type is used to describe the parameter of
the blk_interposer_attach() function. I would prefer that the types of
these objects are guaranteed to match.

> 
> > +
> > +struct blk_interposer {
> > +	ip_submit_bio_t ip_submit_bio;
> > +	struct gendisk *disk;
> 
> If you fix the interface of the detach function, you should not need this field.

No, I probably won't delete this field.
This field is a good indicator of whether the detach has occurred or not.
It allows not to detach someone else's "interposer" in case of an erroneous
re-call of detach.

> 
> > +};
> > +
> >  struct gendisk {
> >  	/* major, first_minor and minors are input parameters only,
> >  	 * don't use directly.  Use disk_devt() and disk_max_parts().
> > @@ -158,6 +166,7 @@ struct gendisk {
> >  
> >  	const struct block_device_operations *fops;
> >  	struct request_queue *queue;
> > +	struct blk_interposer *interposer;
> >  	void *private_data;
> >  
> >  	int flags;
> > @@ -346,4 +355,14 @@ static inline void printk_all_partitions(void)
> >  }
> >  #endif /* CONFIG_BLOCK */
> >  
> > +/*
> > + * block layer interposer
> > + */
> > +#define blk_has_interposer(d) ((d)->interposer != NULL)
> 
> Please make this an inline function.

I prefer inline functions, but I would prefer to stay it as it defined by Hannes.
In addition, such a define is useful for determining whether the kernel supports
blk_interposer or not.

> 
> > +
> > +int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer,
> > +			  const ip_submit_bio_t ip_submit_bio);
> > +void blk_interposer_detach(struct blk_interposer *interposer,
> > +			   const ip_submit_bio_t ip_submit_bio);
> > +
> >  #endif /* _LINUX_GENHD_H */
> > 
> 
> 
> -- 
> Damien Le Moal
> Western Digital Research
>
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index 1f2cc1fbe283..f6f135eb84b5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -684,6 +684,8 @@  void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio_set_flag(bio, BIO_CLONED);
 	if (bio_flagged(bio_src, BIO_THROTTLED))
 		bio_set_flag(bio, BIO_THROTTLED);
+	if (bio_flagged(bio_src, BIO_INTERPOSED))
+		bio_set_flag(bio, BIO_INTERPOSED);
 	bio->bi_opf = bio_src->bi_opf;
 	bio->bi_ioprio = bio_src->bi_ioprio;
 	bio->bi_write_hint = bio_src->bi_write_hint;
diff --git a/block/blk-core.c b/block/blk-core.c
index 7663a9b94b80..07ec82d8fe57 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1032,6 +1032,32 @@  static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
 	return ret;
 }
 
+static blk_qc_t __submit_bio_interposed(struct bio *bio)
+{
+	struct bio_list bio_list[2] = { };
+	blk_qc_t ret = BLK_QC_T_NONE;
+
+	current->bio_list = bio_list;
+	if (likely(bio_queue_enter(bio) == 0)) {
+		struct gendisk *disk = bio->bi_disk;
+
+		bio_set_flag(bio, BIO_INTERPOSED);
+		if (likely(blk_has_interposer(disk)))
+			disk->interposer->ip_submit_bio(bio);
+		else /* interposer was removed */
+			bio_list_add(&current->bio_list[0], bio);
+
+		blk_queue_exit(disk->queue);
+	}
+	current->bio_list = NULL;
+
+	/* Resubmit remaining bios */
+	while ((bio = bio_list_pop(&bio_list[0])))
+		ret = submit_bio_noacct(bio);
+
+	return ret;
+}
+
 /**
  * submit_bio_noacct - re-submit a bio to the block device layer for I/O
  * @bio:  The bio describing the location in memory and on the device.
@@ -1057,6 +1083,9 @@  blk_qc_t submit_bio_noacct(struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
+	if (blk_has_interposer(bio->bi_disk) &&
+	    !bio_flagged(bio, BIO_INTERPOSED))
+		return __submit_bio_interposed(bio);
 	if (!bio->bi_disk->fops->submit_bio)
 		return __submit_bio_noacct_mq(bio);
 	return __submit_bio_noacct(bio);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f285a9123a8b..924ec26fae5f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -161,6 +161,19 @@  int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 
+
+bool blk_mq_is_queue_frozen(struct request_queue *q)
+{
+	bool ret;
+
+	mutex_lock(&q->mq_freeze_lock);
+	ret = percpu_ref_is_dying(&q->q_usage_counter) && percpu_ref_is_zero(&q->q_usage_counter);
+	mutex_unlock(&q->mq_freeze_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_mq_is_queue_frozen);
+
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
diff --git a/block/genhd.c b/block/genhd.c
index 419548e92d82..d3459582f56c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -30,6 +30,7 @@ 
 static struct kobject *block_depr;
 
 DECLARE_RWSEM(bdev_lookup_sem);
+DEFINE_MUTEX(bdev_interposer_mutex);
 
 /* for extended dynamic devt allocation, currently only one major is used */
 #define NR_EXT_DEVT		(1 << MINORBITS)
@@ -2148,3 +2149,84 @@  static void disk_release_events(struct gendisk *disk)
 	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
 	kfree(disk->ev);
 }
+
+/**
+ * blk_interposer_attach - Attach interposer to disk
+ * @disk: target disk
+ * @interposer: block device interposer
+ * @ip_submit_bio: hook for submit_bio()
+ *
+ * Returns:
+ *     -EINVAL if @interposer is NULL.
+ *     -EPERM if queue is not frozen.
+ *     -EBUSY if the block device already has @interposer.
+ *     -EALREADY if the block device already has @interposer with same callback.
+ *
+ * Disk must be frozen by blk_mq_freeze_queue().
+ */
+int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer,
+			  const ip_submit_bio_t ip_submit_bio)
+{
+	int ret = 0;
+
+	if (!interposer)
+		return -EINVAL;
+
+	if (!blk_mq_is_queue_frozen(disk->queue))
+		return -EPERM;
+
+	mutex_lock(&bdev_interposer_mutex);
+	if (blk_has_interposer(disk)) {
+		if (disk->interposer->ip_submit_bio == ip_submit_bio)
+			ret = -EALREADY;
+		else
+			ret = -EBUSY;
+		goto out;
+	}
+
+	interposer->ip_submit_bio = ip_submit_bio;
+	interposer->disk = disk;
+
+	disk->interposer = interposer;
+out:
+	mutex_unlock(&bdev_interposer_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_interposer_attach);
+
+/**
+ * blk_interposer_detach - Detach interposer from disk
+ * @interposer: block device interposer
+ * @ip_submit_bio: hook for submit_bio()
+ *
+ * Disk must be frozen by blk_mq_freeze_queue().
+ */
+void blk_interposer_detach(struct blk_interposer *interposer,
+			  const ip_submit_bio_t ip_submit_bio)
+{
+	struct gendisk *disk;
+
+	if (WARN_ON(!interposer))
+		return;
+
+	mutex_lock(&bdev_interposer_mutex);
+
+	/* Check if the interposer is still active. */
+	disk = interposer->disk;
+	if (WARN_ON(!disk))
+		goto out;
+
+	if (WARN_ON(!blk_mq_is_queue_frozen(disk->queue)))
+		goto out;
+
+	/* Check if it is really our interposer. */
+	if (WARN_ON(disk->interposer->ip_submit_bio != ip_submit_bio))
+		goto out;
+
+	disk->interposer = NULL;
+	interposer->disk = NULL;
+out:
+	mutex_unlock(&bdev_interposer_mutex);
+}
+EXPORT_SYMBOL_GPL(blk_interposer_detach);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d705b174d346..9d1e8c4e922e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -525,6 +525,7 @@  void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
+bool blk_mq_is_queue_frozen(struct request_queue *q);
 
 int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 866f74261b3b..6c1351d7b73f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -227,7 +227,7 @@  struct bio {
 						 * top bits REQ_OP. Use
 						 * accessors.
 						 */
-	unsigned short		bi_flags;	/* status, etc and bvec pool number */
+	unsigned int		bi_flags;	/* status, etc and bvec pool number */
 	unsigned short		bi_ioprio;
 	unsigned short		bi_write_hint;
 	blk_status_t		bi_status;
@@ -304,6 +304,8 @@  enum {
 				 * of this bio. */
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
 	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
+	BIO_INTERPOSED,		/* bio has been interposed and can be moved to
+				 * a different disk */
 	BIO_FLAG_LAST
 };
 
@@ -322,7 +324,7 @@  enum {
  * freed.
  */
 #define BVEC_POOL_BITS		(3)
-#define BVEC_POOL_OFFSET	(16 - BVEC_POOL_BITS)
+#define BVEC_POOL_OFFSET	(32 - BVEC_POOL_BITS)
 #define BVEC_POOL_IDX(bio)	((bio)->bi_flags >> BVEC_POOL_OFFSET)
 #if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
 # error "BVEC_POOL_BITS is too small"
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 809aaa32d53c..8094af3a3db9 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -134,6 +134,14 @@  struct blk_integrity {
 	unsigned char				tag_size;
 };
 
+struct blk_interposer;
+typedef void (*ip_submit_bio_t) (struct bio *bio);
+
+struct blk_interposer {
+	ip_submit_bio_t ip_submit_bio;
+	struct gendisk *disk;
+};
+
 struct gendisk {
 	/* major, first_minor and minors are input parameters only,
 	 * don't use directly.  Use disk_devt() and disk_max_parts().
@@ -158,6 +166,7 @@  struct gendisk {
 
 	const struct block_device_operations *fops;
 	struct request_queue *queue;
+	struct blk_interposer *interposer;
 	void *private_data;
 
 	int flags;
@@ -346,4 +355,14 @@  static inline void printk_all_partitions(void)
 }
 #endif /* CONFIG_BLOCK */
 
+/*
+ * block layer interposer
+ */
+#define blk_has_interposer(d) ((d)->interposer != NULL)
+
+int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer,
+			  const ip_submit_bio_t ip_submit_bio);
+void blk_interposer_detach(struct blk_interposer *interposer,
+			   const ip_submit_bio_t ip_submit_bio);
+
 #endif /* _LINUX_GENHD_H */