diff mbox series

[v7,3/3] dm: add DM_INTERPOSED_FLAG

Message ID 1615563895-28565-4-git-send-email-sergei.shtepa@veeam.com (mailing list archive)
State Changes Requested, archived
Delegated to: Mike Snitzer
Headers show
Series block device interposer | expand

Commit Message

Sergei Shtepa March 12, 2021, 3:44 p.m. UTC
DM_INTERPOSED_FLAG allow to create DM targets on "the fly".
Underlying block device opens without a flag FMODE_EXCL.
DM target receives bio from the original device via
bdev_interposer.

Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
---
 drivers/md/dm-core.h          |  3 ++
 drivers/md/dm-ioctl.c         | 13 ++++++++
 drivers/md/dm-table.c         | 61 +++++++++++++++++++++++++++++------
 drivers/md/dm.c               | 38 +++++++++++++++-------
 include/linux/device-mapper.h |  1 +
 include/uapi/linux/dm-ioctl.h |  6 ++++
 6 files changed, 101 insertions(+), 21 deletions(-)

Comments

Mike Snitzer March 12, 2021, 7 p.m. UTC | #1
On Fri, Mar 12 2021 at 10:44am -0500,
Sergei Shtepa <sergei.shtepa@veeam.com> wrote:

> DM_INTERPOSED_FLAG allow to create DM targets on "the fly".
> Underlying block device opens without a flag FMODE_EXCL.
> DM target receives bio from the original device via
> bdev_interposer.
> 
> Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
> ---
>  drivers/md/dm-core.h          |  3 ++
>  drivers/md/dm-ioctl.c         | 13 ++++++++
>  drivers/md/dm-table.c         | 61 +++++++++++++++++++++++++++++------
>  drivers/md/dm.c               | 38 +++++++++++++++-------
>  include/linux/device-mapper.h |  1 +
>  include/uapi/linux/dm-ioctl.h |  6 ++++
>  6 files changed, 101 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> index 5953ff2bd260..9eae419c7b18 100644
> --- a/drivers/md/dm-core.h
> +++ b/drivers/md/dm-core.h
> @@ -114,6 +114,9 @@ struct mapped_device {
>  	bool init_tio_pdu:1;
>  
>  	struct srcu_struct io_barrier;
> +
> +	/* attach target via block-layer interposer */
> +	bool is_interposed;
>  };

This flag is a mix of uses.  First it is used to store that
DM_INTERPOSED_FLAG was provided as input param during load.

And the same 'is_interposed' name is used in 'struct dm_dev'.

To me this state should be elevated to block core -- awkward for every
driver that might want to use blk-interposer to be sprinkling state
around its core structures.

So I'd prefer you:
1) rename 'struct mapped_device' to 'interpose' _and_ add it just after
   "bool init_tio_pdu:1;" with "bool interpose:1;" -- this reflects
   interpose was requested during load.
2) bdev_interposer_attach() should be made to set some block core state
   that is able to be tested to check if a device is_interposed.
3) Don't add an 'is_interposed' flag to 'struct dm_dev'

>  
>  void disable_discard(struct mapped_device *md);
> diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
> index 5e306bba4375..2b4c9557c283 100644
> --- a/drivers/md/dm-ioctl.c
> +++ b/drivers/md/dm-ioctl.c
> @@ -1267,6 +1267,11 @@ static inline fmode_t get_mode(struct dm_ioctl *param)
>  	return mode;
>  }
>  
> +static inline bool get_interposer_flag(struct dm_ioctl *param)
> +{
> +	return (param->flags & DM_INTERPOSED_FLAG);
> +}
> +

As I mention at the end: rename to DM_INTERPOSE_FLAG

>  static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
>  		       struct dm_target_spec **spec, char **target_params)
>  {
> @@ -1293,6 +1298,10 @@ static int populate_table(struct dm_table *table,
>  		DMWARN("populate_table: no targets specified");
>  		return -EINVAL;
>  	}
> +	if (table->md->is_interposed && (param->target_count != 1)) {
> +		DMWARN("%s: with interposer should be specified only one target", __func__);

This error/warning reads very awkwardly. Maybe?:
"%s: interposer requires only a single target be specified"

> +		return -EINVAL;
> +	}
>  
>  	for (i = 0; i < param->target_count; i++) {
>  
> @@ -1338,6 +1347,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
>  	if (!md)
>  		return -ENXIO;
>  
> +	md->is_interposed = get_interposer_flag(param);
> +
>  	r = dm_table_create(&t, get_mode(param), param->target_count, md);
>  	if (r)
>  		goto err;
> @@ -2098,6 +2109,8 @@ int __init dm_early_create(struct dm_ioctl *dmi,
>  	if (r)
>  		goto err_hash_remove;
>  
> +	md->is_interposed = get_interposer_flag(dmi);
> +
>  	/* add targets */
>  	for (i = 0; i < dmi->target_count; i++) {
>  		r = dm_table_add_target(t, spec_array[i]->target_type,
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 95391f78b8d5..f6e2eb3f8949 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -225,12 +225,13 @@ void dm_table_destroy(struct dm_table *t)
>  /*
>   * See if we've already got a device in the list.
>   */
> -static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
> +static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev, bool is_interposed)

Think in make more sense to internalize the need to consider
md->interpose here.

So:

static struct dm_dev_internal *find_device(struct dm_table *t, dev_t dev)
{
        struct list_head *l = &t->devices;
        bool is_interposed = t->md->interpose;
        ...

>  {
>  	struct dm_dev_internal *dd;
>  
>  	list_for_each_entry (dd, l, list)
> -		if (dd->dm_dev->bdev->bd_dev == dev)
> +		if ((dd->dm_dev->bdev->bd_dev == dev) &&
> +		    (dd->dm_dev->is_interposed == is_interposed))
>  			return dd;

But why must this extra state be used/tested?  Seems like quite a deep
embedding of interposer into dm_dev_internal.. feels unnecessary.

>  
>  	return NULL;
> @@ -358,6 +359,18 @@ dev_t dm_get_dev_t(const char *path)
>  }
>  EXPORT_SYMBOL_GPL(dm_get_dev_t);
>  
> +static inline void dm_disk_freeze(struct gendisk *disk)
> +{
> +	blk_mq_freeze_queue(disk->queue);
> +	blk_mq_quiesce_queue(disk->queue);
> +}
> +
> +static inline void dm_disk_unfreeze(struct gendisk *disk)
> +{
> +	blk_mq_unquiesce_queue(disk->queue);
> +	blk_mq_unfreeze_queue(disk->queue);
> +}
> +

These interfaces don't account for bio-based at all (pretty sure we've
been over this and you pointed out that they'll just return early), but
they also don't take steps to properly flush outstanding DM io.
Shouldn't you require DM devices do an internal suspend/resume?  And if
original device isn't DM then fallback to blk_mq calls?

>  /*
>   * Add a device to the list, or just increment the usage count if
>   * it's already present.
> @@ -385,7 +398,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
>  			return -ENODEV;
>  	}
>  
> -	dd = find_device(&t->devices, dev);
> +	dd = find_device(&t->devices, dev, t->md->is_interposed);
>  	if (!dd) {
>  		dd = kmalloc(sizeof(*dd), GFP_KERNEL);
>  		if (!dd)
> @@ -398,15 +411,38 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
>  
>  		refcount_set(&dd->count, 1);
>  		list_add(&dd->list, &t->devices);
> -		goto out;
> -
>  	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
>  		r = upgrade_mode(dd, mode, t->md);
>  		if (r)
>  			return r;
> +		refcount_inc(&dd->count);
>  	}
> -	refcount_inc(&dd->count);

This looks bogus... you cannot only increment refcount with the mode
check/upgrade branch (IIRC: I've made this same mistake in the past)

> -out:
> +
> +	if (t->md->is_interposed) {
> +		struct block_device *original = dd->dm_dev->bdev;
> +		struct block_device *interposer = t->md->disk->part0;
> +
> +		if ((ti->begin != 0) || (ti->len < bdev_nr_sectors(original))) {
> +			dm_put_device(ti, dd->dm_dev);
> +			DMERR("The interposer device should not be less than the original.");
> +			return -EINVAL;

Can you explain why allowing the device to be larger is meaningful?  Not
saying it isn't I'd just like to understand use-cases you forsee.

> +		}
> +
> +		/*
> +		 * Attach mapped interposer device to original.
> +		 * It is quite convenient that device mapper creates
> +		 * one disk for one block device.
> +		 */
> +		dm_disk_freeze(original->bd_disk);
> +		r = bdev_interposer_attach(original, interposer);
> +		dm_disk_unfreeze(original->bd_disk);
> +		if (r) {
> +			dm_put_device(ti, dd->dm_dev);
> +			DMERR("Failed to attach dm interposer.");
> +			return r;
> +		}
> +	}
> +
>  	*result = dd->dm_dev;
>  	return 0;
>  }
> @@ -446,6 +482,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
>  {
>  	int found = 0;
>  	struct list_head *devices = &ti->table->devices;
> +	struct mapped_device *md = ti->table->md;
>  	struct dm_dev_internal *dd;
>  
>  	list_for_each_entry(dd, devices, list) {
> @@ -456,11 +493,17 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
>  	}
>  	if (!found) {
>  		DMWARN("%s: device %s not in table devices list",
> -		       dm_device_name(ti->table->md), d->name);
> +		       dm_device_name(md), d->name);
>  		return;
>  	}
> +	if (md->is_interposed) {
> +		dm_disk_freeze(d->bdev->bd_disk);
> +		bdev_interposer_detach(d->bdev);
> +		dm_disk_unfreeze(d->bdev->bd_disk);
> +	}
> +
>  	if (refcount_dec_and_test(&dd->count)) {
> -		dm_put_table_device(ti->table->md, d);
> +		dm_put_table_device(md, d);
>  		list_del(&dd->list);
>  		kfree(dd);
>  	}
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 50b693d776d6..466bf70a66b0 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -762,16 +762,24 @@ static int open_table_device(struct table_device *td, dev_t dev,
>  
>  	BUG_ON(td->dm_dev.bdev);
>  
> -	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
> -	if (IS_ERR(bdev))
> -		return PTR_ERR(bdev);
> +	if (md->is_interposed) {
>  
> -	r = bd_link_disk_holder(bdev, dm_disk(md));
> -	if (r) {
> -		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
> -		return r;
> +		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL);
> +		if (IS_ERR(bdev))
> +			return PTR_ERR(bdev);
> +	} else {
> +		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
> +		if (IS_ERR(bdev))
> +			return PTR_ERR(bdev);
> +
> +		r = bd_link_disk_holder(bdev, dm_disk(md));
> +		if (r) {
> +			blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
> +			return r;
> +		}
>  	}
>  
> +	td->dm_dev.is_interposed = md->is_interposed;

This _should_ hopefully get cleaned up by pushing such state into block
core's interposer interfaces.

But again, not seeing what utility/safety this extra flag is providing
to begin with.  Is this state _actually_ needed at all?


>  	td->dm_dev.bdev = bdev;
>  	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
>  	return 0;
> @@ -785,20 +793,26 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
>  	if (!td->dm_dev.bdev)
>  		return;
>  
> -	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
> -	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
> +	if (td->dm_dev.is_interposed)
> +		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode);
> +	else {
> +		bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
> +		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
> +	}
>  	put_dax(td->dm_dev.dax_dev);
>  	td->dm_dev.bdev = NULL;
>  	td->dm_dev.dax_dev = NULL;
>  }
>  
>  static struct table_device *find_table_device(struct list_head *l, dev_t dev,
> -					      fmode_t mode)
> +					      fmode_t mode, bool is_interposed)
>  {
>  	struct table_device *td;
>  
>  	list_for_each_entry(td, l, list)
> -		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
> +		if (td->dm_dev.bdev->bd_dev == dev &&
> +		    td->dm_dev.mode == mode &&
> +		    td->dm_dev.is_interposed == is_interposed)
>  			return td;
>  
>  	return NULL;
> @@ -811,7 +825,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
>  	struct table_device *td;
>  
>  	mutex_lock(&md->table_devices_lock);
> -	td = find_table_device(&md->table_devices, dev, mode);
> +	td = find_table_device(&md->table_devices, dev, mode, md->is_interposed);
>  	if (!td) {
>  		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
>  		if (!td) {
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 7f4ac87c0b32..76a6dfb1cb29 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -159,6 +159,7 @@ struct dm_dev {
>  	struct block_device *bdev;
>  	struct dax_device *dax_dev;
>  	fmode_t mode;
> +	bool is_interposed;

Again, I'd like this state to be part of 'struct block_device'

>  	char name[16];
>  };
>  
> diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
> index fcff6669137b..fc4d06bb3dbb 100644
> --- a/include/uapi/linux/dm-ioctl.h
> +++ b/include/uapi/linux/dm-ioctl.h
> @@ -362,4 +362,10 @@ enum {
>   */
>  #define DM_INTERNAL_SUSPEND_FLAG	(1 << 18) /* Out */
>  
> +/*
> + * If set, the underlying device should open without FMODE_EXCL
> + * and attach mapped device via bdev_interposer.
> + */
> +#define DM_INTERPOSED_FLAG		(1 << 19) /* In */

Please rename to DM_INTERPOSE_FLAG

> +
>  #endif				/* _LINUX_DM_IOCTL_H */
> -- 
> 2.20.1
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
Christoph Hellwig March 14, 2021, 9:30 a.m. UTC | #2
On Fri, Mar 12, 2021 at 06:44:55PM +0300, Sergei Shtepa wrote:
> DM_INTERPOSED_FLAG allow to create DM targets on "the fly".
> Underlying block device opens without a flag FMODE_EXCL.
> DM target receives bio from the original device via
> bdev_interposer.

This is more of a philopical comment, but the idea of just letting the
interposed reopen the device by itself seems like a bad idea.  I think
that is probably better hidden in the block layer interposer attachment
function, which could do the extra blkdev_get_by_dev for the caller.

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
Sergei Shtepa March 15, 2021, 12:29 p.m. UTC | #3
The 03/12/2021 22:00, Mike Snitzer wrote:
> On Fri, Mar 12 2021 at 10:44am -0500,
> Sergei Shtepa <sergei.shtepa@veeam.com> wrote:
> 
> > DM_INTERPOSED_FLAG allow to create DM targets on "the fly".
> > Underlying block device opens without a flag FMODE_EXCL.
> > DM target receives bio from the original device via
> > bdev_interposer.
> > 
> > Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
> > ---
> >  drivers/md/dm-core.h          |  3 ++
> >  drivers/md/dm-ioctl.c         | 13 ++++++++
> >  drivers/md/dm-table.c         | 61 +++++++++++++++++++++++++++++------
> >  drivers/md/dm.c               | 38 +++++++++++++++-------
> >  include/linux/device-mapper.h |  1 +
> >  include/uapi/linux/dm-ioctl.h |  6 ++++
> >  6 files changed, 101 insertions(+), 21 deletions(-)
> > 
> > diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> > index 5953ff2bd260..9eae419c7b18 100644
> > --- a/drivers/md/dm-core.h
> > +++ b/drivers/md/dm-core.h
> > @@ -114,6 +114,9 @@ struct mapped_device {
> >  	bool init_tio_pdu:1;
> >  
> >  	struct srcu_struct io_barrier;
> > +
> > +	/* attach target via block-layer interposer */
> > +	bool is_interposed;
> >  };
> 
> This flag is a mix of uses.  First it is used to store that
> DM_INTERPOSED_FLAG was provided as input param during load.
> 
> And the same 'is_interposed' name is used in 'struct dm_dev'.
> 
> To me this state should be elevated to block core -- awkward for every
> driver that might want to use blk-interposer to be sprinkling state
> around its core structures.
> 
> So I'd prefer you:
> 1) rename 'struct mapped_device' to 'interpose' _and_ add it just after
>    "bool init_tio_pdu:1;" with "bool interpose:1;" -- this reflects
>    interpose was requested during load.
> 2) bdev_interposer_attach() should be made to set some block core state
>    that is able to be tested to check if a device is_interposed.
> 3) Don't add an 'is_interposed' flag to 'struct dm_dev'

Ok, but little fix in "rename 'struct mapped_device' to 'interpose'".
Maybe "rename 'bool is_interposed' to 'bool interpose:1'"?
I think I understand your idea, I'll try to implement it.

> 
> >  
> >  void disable_discard(struct mapped_device *md);
> > diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
> > index 5e306bba4375..2b4c9557c283 100644
> > --- a/drivers/md/dm-ioctl.c
> > +++ b/drivers/md/dm-ioctl.c
> > @@ -1267,6 +1267,11 @@ static inline fmode_t get_mode(struct dm_ioctl *param)
> >  	return mode;
> >  }
> >  
> > +static inline bool get_interposer_flag(struct dm_ioctl *param)
> > +{
> > +	return (param->flags & DM_INTERPOSED_FLAG);
> > +}
> > +
> 
> As I mention at the end: rename to DM_INTERPOSE_FLAG

Ok.

> 
> >  static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
> >  		       struct dm_target_spec **spec, char **target_params)
> >  {
> > @@ -1293,6 +1298,10 @@ static int populate_table(struct dm_table *table,
> >  		DMWARN("populate_table: no targets specified");
> >  		return -EINVAL;
> >  	}
> > +	if (table->md->is_interposed && (param->target_count != 1)) {
> > +		DMWARN("%s: with interposer should be specified only one target", __func__);
> 
> This error/warning reads very awkwardly. Maybe?:
> "%s: interposer requires only a single target be specified"

Ok.

> 
> > +		return -EINVAL;
> > +	}
> >  
> >  	for (i = 0; i < param->target_count; i++) {
> >  
> > @@ -1338,6 +1347,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
> >  	if (!md)
> >  		return -ENXIO;
> >  
> > +	md->is_interposed = get_interposer_flag(param);
> > +
> >  	r = dm_table_create(&t, get_mode(param), param->target_count, md);
> >  	if (r)
> >  		goto err;
> > @@ -2098,6 +2109,8 @@ int __init dm_early_create(struct dm_ioctl *dmi,
> >  	if (r)
> >  		goto err_hash_remove;
> >  
> > +	md->is_interposed = get_interposer_flag(dmi);
> > +
> >  	/* add targets */
> >  	for (i = 0; i < dmi->target_count; i++) {
> >  		r = dm_table_add_target(t, spec_array[i]->target_type,
> > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > index 95391f78b8d5..f6e2eb3f8949 100644
> > --- a/drivers/md/dm-table.c
> > +++ b/drivers/md/dm-table.c
> > @@ -225,12 +225,13 @@ void dm_table_destroy(struct dm_table *t)
> >  /*
> >   * See if we've already got a device in the list.
> >   */
> > -static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
> > +static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev, bool is_interposed)
> 
> Think in make more sense to internalize the need to consider
> md->interpose here.
> 
> So:
> 
> static struct dm_dev_internal *find_device(struct dm_table *t, dev_t dev)
> {
>         struct list_head *l = &t->devices;
>         bool is_interposed = t->md->interpose;
>         ...

Yes.

> 
> >  {
> >  	struct dm_dev_internal *dd;
> >  
> >  	list_for_each_entry (dd, l, list)
> > -		if (dd->dm_dev->bdev->bd_dev == dev)
> > +		if ((dd->dm_dev->bdev->bd_dev == dev) &&
> > +		    (dd->dm_dev->is_interposed == is_interposed))
> >  			return dd;
> 
> But why must this extra state be used/tested?  Seems like quite a deep
> embedding of interposer into dm_dev_internal.. feels unnecessary.

Yeah, I guess so.
Since dm_table is unique for each dm_target and in our case will generally
contain only one device.

> 
> >  
> >  	return NULL;
> > @@ -358,6 +359,18 @@ dev_t dm_get_dev_t(const char *path)
> >  }
> >  EXPORT_SYMBOL_GPL(dm_get_dev_t);
> >  
> > +static inline void dm_disk_freeze(struct gendisk *disk)
> > +{
> > +	blk_mq_freeze_queue(disk->queue);
> > +	blk_mq_quiesce_queue(disk->queue);
> > +}
> > +
> > +static inline void dm_disk_unfreeze(struct gendisk *disk)
> > +{
> > +	blk_mq_unquiesce_queue(disk->queue);
> > +	blk_mq_unfreeze_queue(disk->queue);
> > +}
> > +
> 
> These interfaces don't account for bio-based at all (pretty sure we've
> been over this and you pointed out that they'll just return early), but
> they also don't take steps to properly flush outstanding DM io.
> Shouldn't you require DM devices do an internal suspend/resume?  And if
> original device isn't DM then fallback to blk_mq calls?

I thought that was enough.
The function dm_disk_freeze() guarantees that no bio will be processed,
which means that we can safely attach or detach the interposer.
All bios will continue their processing after dm_disk_unfreeze().
Sorry, but I don't understand why it is required for DM devices to do
internal suspend/resume.
If this is not too difficult, can you explain what problem can cause
a simple queue freeze?

> 
> >  /*
> >   * Add a device to the list, or just increment the usage count if
> >   * it's already present.
> > @@ -385,7 +398,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> >  			return -ENODEV;
> >  	}
> >  
> > -	dd = find_device(&t->devices, dev);
> > +	dd = find_device(&t->devices, dev, t->md->is_interposed);
> >  	if (!dd) {
> >  		dd = kmalloc(sizeof(*dd), GFP_KERNEL);
> >  		if (!dd)
> > @@ -398,15 +411,38 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> >  
> >  		refcount_set(&dd->count, 1);
> >  		list_add(&dd->list, &t->devices);
> > -		goto out;
> > -
> >  	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
> >  		r = upgrade_mode(dd, mode, t->md);
> >  		if (r)
> >  			return r;
> > +		refcount_inc(&dd->count);
> >  	}
> > -	refcount_inc(&dd->count);
> 
> This looks bogus... you cannot only increment refcount with the mode
> check/upgrade branch (IIRC: I've made this same mistake in the past)
> 

I realized my mistake. I wanted to remove the unnecessary goto,
but I change the logic.

> > -out:
> > +
> > +	if (t->md->is_interposed) {
> > +		struct block_device *original = dd->dm_dev->bdev;
> > +		struct block_device *interposer = t->md->disk->part0;
> > +
> > +		if ((ti->begin != 0) || (ti->len < bdev_nr_sectors(original))) {
> > +			dm_put_device(ti, dd->dm_dev);
> > +			DMERR("The interposer device should not be less than the original.");
> > +			return -EINVAL;
> 
> Can you explain why allowing the device to be larger is meaningful?  Not
> saying it isn't I'd just like to understand use-cases you forsee.

Hmm... Maybe strict equality would be better.

> 
> > +		}
> > +
> > +		/*
> > +		 * Attach mapped interposer device to original.
> > +		 * It is quite convenient that device mapper creates
> > +		 * one disk for one block device.
> > +		 */
> > +		dm_disk_freeze(original->bd_disk);
> > +		r = bdev_interposer_attach(original, interposer);
> > +		dm_disk_unfreeze(original->bd_disk);
> > +		if (r) {
> > +			dm_put_device(ti, dd->dm_dev);
> > +			DMERR("Failed to attach dm interposer.");
> > +			return r;
> > +		}
> > +	}
> > +
> >  	*result = dd->dm_dev;
> >  	return 0;
> >  }
> > @@ -446,6 +482,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
> >  {
> >  	int found = 0;
> >  	struct list_head *devices = &ti->table->devices;
> > +	struct mapped_device *md = ti->table->md;
> >  	struct dm_dev_internal *dd;
> >  
> >  	list_for_each_entry(dd, devices, list) {
> > @@ -456,11 +493,17 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
> >  	}
> >  	if (!found) {
> >  		DMWARN("%s: device %s not in table devices list",
> > -		       dm_device_name(ti->table->md), d->name);
> > +		       dm_device_name(md), d->name);
> >  		return;
> >  	}
> > +	if (md->is_interposed) {
> > +		dm_disk_freeze(d->bdev->bd_disk);
> > +		bdev_interposer_detach(d->bdev);
> > +		dm_disk_unfreeze(d->bdev->bd_disk);
> > +	}
> > +
> >  	if (refcount_dec_and_test(&dd->count)) {
> > -		dm_put_table_device(ti->table->md, d);
> > +		dm_put_table_device(md, d);
> >  		list_del(&dd->list);
> >  		kfree(dd);
> >  	}
> > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > index 50b693d776d6..466bf70a66b0 100644
> > --- a/drivers/md/dm.c
> > +++ b/drivers/md/dm.c
> > @@ -762,16 +762,24 @@ static int open_table_device(struct table_device *td, dev_t dev,
> >  
> >  	BUG_ON(td->dm_dev.bdev);
> >  
> > -	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
> > -	if (IS_ERR(bdev))
> > -		return PTR_ERR(bdev);
> > +	if (md->is_interposed) {
> >  
> > -	r = bd_link_disk_holder(bdev, dm_disk(md));
> > -	if (r) {
> > -		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
> > -		return r;
> > +		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL);
> > +		if (IS_ERR(bdev))
> > +			return PTR_ERR(bdev);
> > +	} else {
> > +		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
> > +		if (IS_ERR(bdev))
> > +			return PTR_ERR(bdev);
> > +
> > +		r = bd_link_disk_holder(bdev, dm_disk(md));
> > +		if (r) {
> > +			blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
> > +			return r;
> > +		}
> >  	}
> >  
> > +	td->dm_dev.is_interposed = md->is_interposed;
> 
> This _should_ hopefully get cleaned up by pushing such state into block
> core's interposer interfaces.
> 
> But again, not seeing what utility/safety this extra flag is providing
> to begin with.  Is this state _actually_ needed at all?
> 

Yes, it is not necessary.

> 
> >  	td->dm_dev.bdev = bdev;
> >  	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
> >  	return 0;
> > @@ -785,20 +793,26 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> >  	if (!td->dm_dev.bdev)
> >  		return;
> >  
> > -	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
> > -	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
> > +	if (td->dm_dev.is_interposed)
> > +		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode);
> > +	else {
> > +		bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
> > +		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
> > +	}
> >  	put_dax(td->dm_dev.dax_dev);
> >  	td->dm_dev.bdev = NULL;
> >  	td->dm_dev.dax_dev = NULL;
> >  }
> >  
> >  static struct table_device *find_table_device(struct list_head *l, dev_t dev,
> > -					      fmode_t mode)
> > +					      fmode_t mode, bool is_interposed)
> >  {
> >  	struct table_device *td;
> >  
> >  	list_for_each_entry(td, l, list)
> > -		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
> > +		if (td->dm_dev.bdev->bd_dev == dev &&
> > +		    td->dm_dev.mode == mode &&
> > +		    td->dm_dev.is_interposed == is_interposed)
> >  			return td;
> >  
> >  	return NULL;
> > @@ -811,7 +825,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
> >  	struct table_device *td;
> >  
> >  	mutex_lock(&md->table_devices_lock);
> > -	td = find_table_device(&md->table_devices, dev, mode);
> > +	td = find_table_device(&md->table_devices, dev, mode, md->is_interposed);
> >  	if (!td) {
> >  		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
> >  		if (!td) {
> > diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> > index 7f4ac87c0b32..76a6dfb1cb29 100644
> > --- a/include/linux/device-mapper.h
> > +++ b/include/linux/device-mapper.h
> > @@ -159,6 +159,7 @@ struct dm_dev {
> >  	struct block_device *bdev;
> >  	struct dax_device *dax_dev;
> >  	fmode_t mode;
> > +	bool is_interposed;
> 
> Again, I'd like this state to be part of 'struct block_device'

Ok.

> 
> >  	char name[16];
> >  };
> >  
> > diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
> > index fcff6669137b..fc4d06bb3dbb 100644
> > --- a/include/uapi/linux/dm-ioctl.h
> > +++ b/include/uapi/linux/dm-ioctl.h
> > @@ -362,4 +362,10 @@ enum {
> >   */
> >  #define DM_INTERNAL_SUSPEND_FLAG	(1 << 18) /* Out */
> >  
> > +/*
> > + * If set, the underlying device should open without FMODE_EXCL
> > + * and attach mapped device via bdev_interposer.
> > + */
> > +#define DM_INTERPOSED_FLAG		(1 << 19) /* In */
> 
> Please rename to DM_INTERPOSE_FLAG

Ok.

> 
> > +
> >  #endif				/* _LINUX_DM_IOCTL_H */
> > -- 
> > 2.20.1
> > 
>
Sergei Shtepa March 15, 2021, 1:25 p.m. UTC | #4
The 03/14/2021 12:30, Christoph Hellwig wrote:
> On Fri, Mar 12, 2021 at 06:44:55PM +0300, Sergei Shtepa wrote:
> > DM_INTERPOSED_FLAG allow to create DM targets on "the fly".
> > Underlying block device opens without a flag FMODE_EXCL.
> > DM target receives bio from the original device via
> > bdev_interposer.
> 
> This is more of a philopical comment, but the idea of just letting the
> interposed reopen the device by itself seems like a bad idea.  I think
> that is probably better hidden in the block layer interposer attachment
> function, which could do the extra blkdev_get_by_dev for the caller.

I suppose this cannot be implemented, since we need to change the behavior
for block devices that already have been opened.
Christoph Hellwig March 16, 2021, 3:23 p.m. UTC | #5
On Mon, Mar 15, 2021 at 04:25:09PM +0300, Sergei Shtepa wrote:
> The 03/14/2021 12:30, Christoph Hellwig wrote:
> > On Fri, Mar 12, 2021 at 06:44:55PM +0300, Sergei Shtepa wrote:
> > > DM_INTERPOSED_FLAG allow to create DM targets on "the fly".
> > > Underlying block device opens without a flag FMODE_EXCL.
> > > DM target receives bio from the original device via
> > > bdev_interposer.
> > 
> > This is more of a philopical comment, but the idea of just letting the
> > interposed reopen the device by itself seems like a bad idea.  I think
> > that is probably better hidden in the block layer interposer attachment
> > function, which could do the extra blkdev_get_by_dev for the caller.
> 
> I suppose this cannot be implemented, since we need to change the behavior
> for block devices that already have been opened.

That's not what I mean.  Take a look at the patch relative to your
series to let me know what you think.  The new blkdev_interposer_attach
now takes a dev_t + mode for the original device and opens it on
behalf of the interposer.  It also moves the queue freezing into the
API, which should address the concerns about the helper and adds a few
more sanity checks.

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
Christoph Hellwig March 16, 2021, 3:25 p.m. UTC | #6
On Tue, Mar 16, 2021 at 03:23:14PM +0000, Christoph Hellwig wrote:
> On Mon, Mar 15, 2021 at 04:25:09PM +0300, Sergei Shtepa wrote:
> > The 03/14/2021 12:30, Christoph Hellwig wrote:
> > > On Fri, Mar 12, 2021 at 06:44:55PM +0300, Sergei Shtepa wrote:
> > > > DM_INTERPOSED_FLAG allow to create DM targets on "the fly".
> > > > Underlying block device opens without a flag FMODE_EXCL.
> > > > DM target receives bio from the original device via
> > > > bdev_interposer.
> > > 
> > > This is more of a philopical comment, but the idea of just letting the
> > > interposed reopen the device by itself seems like a bad idea.  I think
> > > that is probably better hidden in the block layer interposer attachment
> > > function, which could do the extra blkdev_get_by_dev for the caller.
> > 
> > I suppose this cannot be implemented, since we need to change the behavior
> > for block devices that already have been opened.
> 
> That's not what I mean.  Take a look at the patch relative to your
> series to let me know what you think.  The new blkdev_interposer_attach
> now takes a dev_t + mode for the original device and opens it on
> behalf of the interposer.  It also moves the queue freezing into the
> API, which should address the concerns about the helper and adds a few
> more sanity checks.

And now actually with the diff:


diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2f188a865024ac..d4d7c1caa43966 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -161,19 +161,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 
-bool blk_mq_is_queue_frozen(struct request_queue *q)
-{
-	bool frozen;
-
-	mutex_lock(&q->mq_freeze_lock);
-	frozen = percpu_ref_is_dying(&q->q_usage_counter) &&
-		 percpu_ref_is_zero(&q->q_usage_counter);
-	mutex_unlock(&q->mq_freeze_lock);
-
-	return frozen;
-}
-EXPORT_SYMBOL_GPL(blk_mq_is_queue_frozen);
-
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
diff --git a/block/genhd.c b/block/genhd.c
index fa406b972371ae..64d6338b08cc87 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1944,51 +1944,70 @@ static void disk_release_events(struct gendisk *disk)
 	kfree(disk->ev);
 }
 
-int bdev_interposer_attach(struct block_device *original,
+struct block_device *blkdev_interposer_attach(dev_t dev, fmode_t mode,
 			   struct block_device *interposer)
 {
+	struct block_device *bdev;
 	int ret = 0;
 
-	if (WARN_ON(((!original) || (!interposer))))
-		return -EINVAL;
-	/*
-	 * interposer should be simple, no a multi-queue device
-	 */
-	if (!interposer->bd_disk->fops->submit_bio)
-		return -EINVAL;
+	if (WARN_ON_ONCE(!bdev_is_partition(interposer)))
+		return ERR_PTR(-EINVAL);
+	if (WARN_ON_ONCE(!queue_is_mq(interposer->bd_disk->queue)))
+		return ERR_PTR(-EINVAL);
 
-	if (WARN_ON(!blk_mq_is_queue_frozen(original->bd_disk->queue)))
-		return -EPERM;
+	bdev = blkdev_get_by_dev(dev, mode, NULL);
+	if (IS_ERR(bdev))
+		return bdev;
 
-	mutex_lock(&bdev_interposer_attach_lock);
+	ret = -EINVAL;
+	if (WARN_ON_ONCE(bdev_nr_sectors(bdev) != bdev_nr_sectors(interposer)))
+		goto out;
 
-	if (bdev_has_interposer(original))
-		ret = -EBUSY;
-	else {
-		original->bd_interposer = bdgrab(interposer);
-		if (!original->bd_interposer)
-			ret = -ENODEV;
-	}
+	blk_mq_freeze_queue(bdev->bd_disk->queue);
+	blk_mq_quiesce_queue(bdev->bd_disk->queue);
 
+	mutex_lock(&bdev_interposer_attach_lock);
+	ret = -EBUSY;
+	if (bdev_has_interposer(bdev))
+		goto out_unlock;
+	ret = -ENODEV;
+	bdev->bd_interposer = bdgrab(interposer);
+	if (!bdev->bd_interposer)
+		goto out_unlock;
+	ret = 0;
+out_unlock:
 	mutex_unlock(&bdev_interposer_attach_lock);
 
-	return ret;
+	blk_mq_unquiesce_queue(bdev->bd_disk->queue);
+	blk_mq_unfreeze_queue(bdev->bd_disk->queue);
+out:
+	if (ret) {
+		blkdev_put(bdev, mode);
+		bdev = ERR_PTR(ret);
+	}
+
+	return bdev;
 }
-EXPORT_SYMBOL_GPL(bdev_interposer_attach);
+EXPORT_SYMBOL_GPL(blkdev_interposer_attach);
 
-void bdev_interposer_detach(struct block_device *original)
+void blkdev_interposer_detach(struct block_device *bdev, fmode_t mode)
 {
-	if (WARN_ON(!original))
-		return;
+	struct block_device *interposer;
 
-	if (WARN_ON(!blk_mq_is_queue_frozen(original->bd_disk->queue)))
+	if (WARN_ON_ONCE(!bdev_has_interposer(bdev)))
 		return;
 
+	blk_mq_freeze_queue(bdev->bd_disk->queue);
+	blk_mq_quiesce_queue(bdev->bd_disk->queue);
+
 	mutex_lock(&bdev_interposer_attach_lock);
-	if (bdev_has_interposer(original)) {
-		bdput(original->bd_interposer);
-		original->bd_interposer = NULL;
-	}
+	interposer = bdev->bd_interposer;
+	bdev->bd_interposer = NULL;
 	mutex_unlock(&bdev_interposer_attach_lock);
+
+	blk_mq_unquiesce_queue(bdev->bd_disk->queue);
+	blk_mq_unfreeze_queue(bdev->bd_disk->queue);
+
+	blkdev_put(interposer, mode);
 }
-EXPORT_SYMBOL_GPL(bdev_interposer_detach);
+EXPORT_SYMBOL_GPL(blkdev_interposer_detach);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f6e2eb3f894940..fde57bb5105025 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -359,18 +359,6 @@ dev_t dm_get_dev_t(const char *path)
 }
 EXPORT_SYMBOL_GPL(dm_get_dev_t);
 
-static inline void dm_disk_freeze(struct gendisk *disk)
-{
-	blk_mq_freeze_queue(disk->queue);
-	blk_mq_quiesce_queue(disk->queue);
-}
-
-static inline void dm_disk_unfreeze(struct gendisk *disk)
-{
-	blk_mq_unquiesce_queue(disk->queue);
-	blk_mq_unfreeze_queue(disk->queue);
-}
-
 /*
  * Add a device to the list, or just increment the usage count if
  * it's already present.
@@ -418,29 +406,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 		refcount_inc(&dd->count);
 	}
 
-	if (t->md->is_interposed) {
-		struct block_device *original = dd->dm_dev->bdev;
-		struct block_device *interposer = t->md->disk->part0;
-
-		if ((ti->begin != 0) || (ti->len < bdev_nr_sectors(original))) {
-			dm_put_device(ti, dd->dm_dev);
-			DMERR("The interposer device should not be less than the original.");
-			return -EINVAL;
-		}
-
-		/*
-		 * Attach mapped interposer device to original.
-		 * It is quite convenient that device mapper creates
-		 * one disk for one block device.
-		 */
-		dm_disk_freeze(original->bd_disk);
-		r = bdev_interposer_attach(original, interposer);
-		dm_disk_unfreeze(original->bd_disk);
-		if (r) {
-			dm_put_device(ti, dd->dm_dev);
-			DMERR("Failed to attach dm interposer.");
-			return r;
-		}
+	if (t->md->is_interposed &&
+	    (ti->begin != 0 || ti->len < bdev_nr_sectors(dd->dm_dev->bdev))) {
+		dm_put_device(ti, dd->dm_dev);
+		DMERR("The interposer device should not be less than the original.");
+		return -EINVAL;
 	}
 
 	*result = dd->dm_dev;
@@ -496,11 +466,6 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 		       dm_device_name(md), d->name);
 		return;
 	}
-	if (md->is_interposed) {
-		dm_disk_freeze(d->bdev->bd_disk);
-		bdev_interposer_detach(d->bdev);
-		dm_disk_unfreeze(d->bdev->bd_disk);
-	}
 
 	if (refcount_dec_and_test(&dd->count)) {
 		dm_put_table_device(md, d);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c488e9554aa000..532ce17064b1c1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -763,10 +763,12 @@ static int open_table_device(struct table_device *td, dev_t dev,
 	BUG_ON(td->dm_dev.bdev);
 
 	if (md->is_interposed) {
-
-		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL);
-		if (IS_ERR(bdev))
+		bdev = blkdev_interposer_attach(dev, td->dm_dev.mode,
+						md->disk->part0);
+		if (IS_ERR(bdev)) {
+			DMERR("Failed to attach dm interposer.");
 			return PTR_ERR(bdev);
+		}
 	} else {
 		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 		if (IS_ERR(bdev))
@@ -793,9 +795,9 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
 	if (!td->dm_dev.bdev)
 		return;
 
-	if (td->dm_dev.is_interposed)
-		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode);
-	else {
+	if (td->dm_dev.is_interposed) {
+		blkdev_interposer_detach(td->dm_dev.bdev, td->dm_dev.mode);
+	} else {
 		bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6f01971abf7b9b..2c473c9b899089 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -533,7 +533,6 @@ void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
-bool blk_mq_is_queue_frozen(struct request_queue *q);
 
 int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 90f62b4197da91..fbc510162c3827 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -2036,8 +2036,8 @@ static inline bool bdev_has_interposer(struct block_device *bdev)
 	return (bdev->bd_interposer != NULL);
 };
 
-int bdev_interposer_attach(struct block_device *original,
+struct block_device *blkdev_interposer_attach(dev_t dev, fmode_t mode,
 			   struct block_device *interposer);
-void bdev_interposer_detach(struct block_device *original);
+void blkdev_interposer_detach(struct block_device *bdev, fmode_t mode);
 
 #endif /* _LINUX_BLKDEV_H */

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
Sergei Shtepa March 16, 2021, 4:20 p.m. UTC | #7
Thanks!
I've already started doing something like that.
I'm glad we're thinking in the same direction.


--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
diff mbox series

Patch

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 5953ff2bd260..9eae419c7b18 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -114,6 +114,9 @@  struct mapped_device {
 	bool init_tio_pdu:1;
 
 	struct srcu_struct io_barrier;
+
+	/* attach target via block-layer interposer */
+	bool is_interposed;
 };
 
 void disable_discard(struct mapped_device *md);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 5e306bba4375..2b4c9557c283 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1267,6 +1267,11 @@  static inline fmode_t get_mode(struct dm_ioctl *param)
 	return mode;
 }
 
+static inline bool get_interposer_flag(struct dm_ioctl *param)
+{
+	return (param->flags & DM_INTERPOSED_FLAG);
+}
+
 static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
 		       struct dm_target_spec **spec, char **target_params)
 {
@@ -1293,6 +1298,10 @@  static int populate_table(struct dm_table *table,
 		DMWARN("populate_table: no targets specified");
 		return -EINVAL;
 	}
+	if (table->md->is_interposed && (param->target_count != 1)) {
+		DMWARN("%s: with interposer should be specified only one target", __func__);
+		return -EINVAL;
+	}
 
 	for (i = 0; i < param->target_count; i++) {
 
@@ -1338,6 +1347,8 @@  static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
 	if (!md)
 		return -ENXIO;
 
+	md->is_interposed = get_interposer_flag(param);
+
 	r = dm_table_create(&t, get_mode(param), param->target_count, md);
 	if (r)
 		goto err;
@@ -2098,6 +2109,8 @@  int __init dm_early_create(struct dm_ioctl *dmi,
 	if (r)
 		goto err_hash_remove;
 
+	md->is_interposed = get_interposer_flag(dmi);
+
 	/* add targets */
 	for (i = 0; i < dmi->target_count; i++) {
 		r = dm_table_add_target(t, spec_array[i]->target_type,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 95391f78b8d5..f6e2eb3f8949 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -225,12 +225,13 @@  void dm_table_destroy(struct dm_table *t)
 /*
  * See if we've already got a device in the list.
  */
-static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
+static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev, bool is_interposed)
 {
 	struct dm_dev_internal *dd;
 
 	list_for_each_entry (dd, l, list)
-		if (dd->dm_dev->bdev->bd_dev == dev)
+		if ((dd->dm_dev->bdev->bd_dev == dev) &&
+		    (dd->dm_dev->is_interposed == is_interposed))
 			return dd;
 
 	return NULL;
@@ -358,6 +359,18 @@  dev_t dm_get_dev_t(const char *path)
 }
 EXPORT_SYMBOL_GPL(dm_get_dev_t);
 
+static inline void dm_disk_freeze(struct gendisk *disk)
+{
+	blk_mq_freeze_queue(disk->queue);
+	blk_mq_quiesce_queue(disk->queue);
+}
+
+static inline void dm_disk_unfreeze(struct gendisk *disk)
+{
+	blk_mq_unquiesce_queue(disk->queue);
+	blk_mq_unfreeze_queue(disk->queue);
+}
+
 /*
  * Add a device to the list, or just increment the usage count if
  * it's already present.
@@ -385,7 +398,7 @@  int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 			return -ENODEV;
 	}
 
-	dd = find_device(&t->devices, dev);
+	dd = find_device(&t->devices, dev, t->md->is_interposed);
 	if (!dd) {
 		dd = kmalloc(sizeof(*dd), GFP_KERNEL);
 		if (!dd)
@@ -398,15 +411,38 @@  int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 
 		refcount_set(&dd->count, 1);
 		list_add(&dd->list, &t->devices);
-		goto out;
-
 	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
 		r = upgrade_mode(dd, mode, t->md);
 		if (r)
 			return r;
+		refcount_inc(&dd->count);
 	}
-	refcount_inc(&dd->count);
-out:
+
+	if (t->md->is_interposed) {
+		struct block_device *original = dd->dm_dev->bdev;
+		struct block_device *interposer = t->md->disk->part0;
+
+		if ((ti->begin != 0) || (ti->len < bdev_nr_sectors(original))) {
+			dm_put_device(ti, dd->dm_dev);
+			DMERR("The interposer device should not be less than the original.");
+			return -EINVAL;
+		}
+
+		/*
+		 * Attach mapped interposer device to original.
+		 * It is quite convenient that device mapper creates
+		 * one disk for one block device.
+		 */
+		dm_disk_freeze(original->bd_disk);
+		r = bdev_interposer_attach(original, interposer);
+		dm_disk_unfreeze(original->bd_disk);
+		if (r) {
+			dm_put_device(ti, dd->dm_dev);
+			DMERR("Failed to attach dm interposer.");
+			return r;
+		}
+	}
+
 	*result = dd->dm_dev;
 	return 0;
 }
@@ -446,6 +482,7 @@  void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 {
 	int found = 0;
 	struct list_head *devices = &ti->table->devices;
+	struct mapped_device *md = ti->table->md;
 	struct dm_dev_internal *dd;
 
 	list_for_each_entry(dd, devices, list) {
@@ -456,11 +493,17 @@  void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 	}
 	if (!found) {
 		DMWARN("%s: device %s not in table devices list",
-		       dm_device_name(ti->table->md), d->name);
+		       dm_device_name(md), d->name);
 		return;
 	}
+	if (md->is_interposed) {
+		dm_disk_freeze(d->bdev->bd_disk);
+		bdev_interposer_detach(d->bdev);
+		dm_disk_unfreeze(d->bdev->bd_disk);
+	}
+
 	if (refcount_dec_and_test(&dd->count)) {
-		dm_put_table_device(ti->table->md, d);
+		dm_put_table_device(md, d);
 		list_del(&dd->list);
 		kfree(dd);
 	}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 50b693d776d6..466bf70a66b0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -762,16 +762,24 @@  static int open_table_device(struct table_device *td, dev_t dev,
 
 	BUG_ON(td->dm_dev.bdev);
 
-	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+	if (md->is_interposed) {
 
-	r = bd_link_disk_holder(bdev, dm_disk(md));
-	if (r) {
-		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
-		return r;
+		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL);
+		if (IS_ERR(bdev))
+			return PTR_ERR(bdev);
+	} else {
+		bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
+		if (IS_ERR(bdev))
+			return PTR_ERR(bdev);
+
+		r = bd_link_disk_holder(bdev, dm_disk(md));
+		if (r) {
+			blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
+			return r;
+		}
 	}
 
+	td->dm_dev.is_interposed = md->is_interposed;
 	td->dm_dev.bdev = bdev;
 	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	return 0;
@@ -785,20 +793,26 @@  static void close_table_device(struct table_device *td, struct mapped_device *md
 	if (!td->dm_dev.bdev)
 		return;
 
-	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
-	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+	if (td->dm_dev.is_interposed)
+		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode);
+	else {
+		bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
+		blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+	}
 	put_dax(td->dm_dev.dax_dev);
 	td->dm_dev.bdev = NULL;
 	td->dm_dev.dax_dev = NULL;
 }
 
 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
-					      fmode_t mode)
+					      fmode_t mode, bool is_interposed)
 {
 	struct table_device *td;
 
 	list_for_each_entry(td, l, list)
-		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
+		if (td->dm_dev.bdev->bd_dev == dev &&
+		    td->dm_dev.mode == mode &&
+		    td->dm_dev.is_interposed == is_interposed)
 			return td;
 
 	return NULL;
@@ -811,7 +825,7 @@  int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 	struct table_device *td;
 
 	mutex_lock(&md->table_devices_lock);
-	td = find_table_device(&md->table_devices, dev, mode);
+	td = find_table_device(&md->table_devices, dev, mode, md->is_interposed);
 	if (!td) {
 		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 		if (!td) {
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 7f4ac87c0b32..76a6dfb1cb29 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -159,6 +159,7 @@  struct dm_dev {
 	struct block_device *bdev;
 	struct dax_device *dax_dev;
 	fmode_t mode;
+	bool is_interposed;
 	char name[16];
 };
 
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index fcff6669137b..fc4d06bb3dbb 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -362,4 +362,10 @@  enum {
  */
 #define DM_INTERNAL_SUSPEND_FLAG	(1 << 18) /* Out */
 
+/*
+ * If set, the underlying device should open without FMODE_EXCL
+ * and attach mapped device via bdev_interposer.
+ */
+#define DM_INTERPOSED_FLAG		(1 << 19) /* In */
+
 #endif				/* _LINUX_DM_IOCTL_H */