diff mbox

[v2,5/7] dm: remove DM_TYPE_DAX_BIO_BASED dm_queue_mode

Message ID 20180529195106.14268-6-ross.zwisler@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ross Zwisler May 29, 2018, 7:51 p.m. UTC
The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
devices that could possibly support DAX from transitioning into DM devices
that cannot support DAX.

For example, the following transition will currently fail:

 dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
	      DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED

but these will both succeed:

 dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
 		DM_TYPE_BIO_BASED        DM_TYPE_BIO_BASED

 dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
 		DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED

This seems arbitrary, as really the choice on whether to use DAX happens at
filesystem mount time.  There's no guarantee that in the first case
(double fsdax pmem) we were using the dax mount option with our file
system.

Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing around
it, and instead make the request queue's QUEUE_FLAG_DAX be our one source
of truth.  If this is set, we can use DAX, and if not, not.  We keep this
up to date in table_load() as the table changes.  As with regular block
devices the filesystem will then know at mount time whether DAX is a
supported mount option or not.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 drivers/md/dm-ioctl.c         | 16 ++++++----------
 drivers/md/dm-table.c         | 25 ++++++++++---------------
 drivers/md/dm.c               |  2 --
 include/linux/device-mapper.h |  8 ++++++--
 4 files changed, 22 insertions(+), 29 deletions(-)

Comments

Mike Snitzer June 1, 2018, 10:04 p.m. UTC | #1
On Tue, May 29 2018 at  3:51pm -0400,
Ross Zwisler <ross.zwisler@linux.intel.com> wrote:

> The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
> devices that could possibly support DAX from transitioning into DM devices
> that cannot support DAX.
> 
> For example, the following transition will currently fail:
> 
>  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> 	      DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> 
> but these will both succeed:
> 
>  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
>  	      DM_TYPE_DAX_BIO_BASED        DM_TYPE_BIO_BASED
> 

I fail to see how this succeeds given
drivers/md/dm-ioctl.c:is_valid_type() only allows transitions from:

DM_TYPE_BIO_BASED => DM_TYPE_DAX_BIO_BASED

>  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
>  		DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> 
> This seems arbitrary, as really the choice on whether to use DAX happens at
> filesystem mount time.  There's no guarantee that the in the first case
> (double fsdax pmem) we were using the dax mount option with our file
> system.
> 
> Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing around
> it, and instead make the request queue's QUEUE_FLAG_DAX be our one source
> of truth.  If this is set, we can use DAX, and if not, not.  We keep this
> up to date in table_load() as the table changes.  As with regular block
> devices the filesystem will then know at mount time whether DAX is a
> supported mount option or not.

If you don't think you need this specialization, that is fine.  But DM
devices support suspending (as part of table reloads), so is there any
risk that there will be inflight IO (say, if someone did 'dmsetup suspend
--noflush') and then, upon reload, the device type changes out from
under us?  Anyway, I don't have all the PMEM DAX stuff paged back into
my head yet.

But this just seems like we really shouldn't be allowing the
transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED

Mike

> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  drivers/md/dm-ioctl.c         | 16 ++++++----------
>  drivers/md/dm-table.c         | 25 ++++++++++---------------
>  drivers/md/dm.c               |  2 --
>  include/linux/device-mapper.h |  8 ++++++--
>  4 files changed, 22 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
> index 5acf77de5945..d1f86d0bb2d0 100644
> --- a/drivers/md/dm-ioctl.c
> +++ b/drivers/md/dm-ioctl.c
> @@ -1292,15 +1292,6 @@ static int populate_table(struct dm_table *table,
>  	return dm_table_complete(table);
>  }
>  
> -static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new)
> -{
> -	if (cur == new ||
> -	    (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED))
> -		return true;
> -
> -	return false;
> -}
> -
>  static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size)
>  {
>  	int r;
> @@ -1343,12 +1334,17 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
>  			DMWARN("unable to set up device queue for new table.");
>  			goto err_unlock_md_type;
>  		}
> -	} else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
> +	} else if (dm_get_md_type(md) != dm_table_get_type(t)) {
>  		DMWARN("can't change device type after initial table load.");
>  		r = -EINVAL;
>  		goto err_unlock_md_type;
>  	}
>  
> +	if (dm_table_supports_dax(t))
> +		blk_queue_flag_set(QUEUE_FLAG_DAX, md->queue);
> +	else
> +		blk_queue_flag_clear(QUEUE_FLAG_DAX, md->queue);
> +
>  	dm_unlock_md_type(md);
>  
>  	/* stage inactive table */
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 5bb994b012ca..ea5c4a1e6f5b 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -866,7 +866,6 @@ EXPORT_SYMBOL(dm_consume_args);
>  static bool __table_type_bio_based(enum dm_queue_mode table_type)
>  {
>  	return (table_type == DM_TYPE_BIO_BASED ||
> -		table_type == DM_TYPE_DAX_BIO_BASED ||
>  		table_type == DM_TYPE_NVME_BIO_BASED);
>  }
>  
> @@ -888,7 +887,7 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
>  	return bdev_dax_supported(dev->bdev, PAGE_SIZE);
>  }
>  
> -static bool dm_table_supports_dax(struct dm_table *t)
> +bool dm_table_supports_dax(struct dm_table *t)
>  {
>  	struct dm_target *ti;
>  	unsigned i;
> @@ -907,6 +906,7 @@ static bool dm_table_supports_dax(struct dm_table *t)
>  
>  	return true;
>  }
> +EXPORT_SYMBOL_GPL(dm_table_supports_dax);
>  
>  static bool dm_table_does_not_support_partial_completion(struct dm_table *t);
>  
> @@ -944,7 +944,6 @@ static int dm_table_determine_type(struct dm_table *t)
>  			/* possibly upgrade to a variant of bio-based */
>  			goto verify_bio_based;
>  		}
> -		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
>  		BUG_ON(t->type == DM_TYPE_NVME_BIO_BASED);
>  		goto verify_rq_based;
>  	}
> @@ -981,18 +980,14 @@ static int dm_table_determine_type(struct dm_table *t)
>  verify_bio_based:
>  		/* We must use this table as bio-based */
>  		t->type = DM_TYPE_BIO_BASED;
> -		if (dm_table_supports_dax(t) ||
> -		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
> -			t->type = DM_TYPE_DAX_BIO_BASED;
> -		} else {
> -			/* Check if upgrading to NVMe bio-based is valid or required */
> -			tgt = dm_table_get_immutable_target(t);
> -			if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) {
> -				t->type = DM_TYPE_NVME_BIO_BASED;
> -				goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */
> -			} else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) {
> -				t->type = DM_TYPE_NVME_BIO_BASED;
> -			}
> +
> +		/* Check if upgrading to NVMe bio-based is valid or required */
> +		tgt = dm_table_get_immutable_target(t);
> +		if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) {
> +			t->type = DM_TYPE_NVME_BIO_BASED;
> +			goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */
> +		} else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) {
> +			t->type = DM_TYPE_NVME_BIO_BASED;
>  		}
>  		return 0;
>  	}
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 9728433362d1..0ce06fa292fd 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -2192,7 +2192,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
>  		}
>  		break;
>  	case DM_TYPE_BIO_BASED:
> -	case DM_TYPE_DAX_BIO_BASED:
>  		dm_init_normal_md_queue(md);
>  		blk_queue_make_request(md->queue, dm_make_request);
>  		break;
> @@ -2910,7 +2909,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
>  
>  	switch (type) {
>  	case DM_TYPE_BIO_BASED:
> -	case DM_TYPE_DAX_BIO_BASED:
>  	case DM_TYPE_NVME_BIO_BASED:
>  		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
>  		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 31fef7c34185..cbf3d7e7ed33 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -27,8 +27,7 @@ enum dm_queue_mode {
>  	DM_TYPE_BIO_BASED	 = 1,
>  	DM_TYPE_REQUEST_BASED	 = 2,
>  	DM_TYPE_MQ_REQUEST_BASED = 3,
> -	DM_TYPE_DAX_BIO_BASED	 = 4,
> -	DM_TYPE_NVME_BIO_BASED	 = 5,
> +	DM_TYPE_NVME_BIO_BASED	 = 4,
>  };
>  
>  typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
> @@ -460,6 +459,11 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback
>   */
>  void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type);
>  
> +/*
> + * Check to see if this target type and all table devices support DAX.
> + */
> +bool dm_table_supports_dax(struct dm_table *t);
> +
>  /*
>   * Finally call this to make the table ready for use.
>   */
> -- 
> 2.14.3
>
Ross Zwisler June 4, 2018, 11:24 p.m. UTC | #2
On Fri, Jun 01, 2018 at 06:04:43PM -0400, Mike Snitzer wrote:
> On Tue, May 29 2018 at  3:51pm -0400,
> Ross Zwisler <ross.zwisler@linux.intel.com> wrote:
> 
> > The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
> > devices that could possibly support DAX from transitioning into DM devices
> > that cannot support DAX.
> > 
> > For example, the following transition will currently fail:
> > 
> >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > 	      DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> > 
> > but these will both succeed:
> > 
> >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> >  	      DM_TYPE_DAX_BIO_BASED        DM_TYPE_BIO_BASED
> > 
> 
> I fail to see how this succeeds given
> drivers/md/dm-ioctl.c:is_valid_type() only allows transitions from:
> 
> DM_TYPE_BIO_BASED => DM_TYPE_DAX_BIO_BASED

Right, sorry, that was a typo.  What I meant was:

> For example, the following transition will currently fail:
> 
>  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
>               DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> 
> but these will both succeed:
> 
>  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
>                 DM_TYPE_BIO_BASED        DM_TYPE_BIO_BASED
> 
>  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
>                 DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED

So we allow 2 of the 3 transitions, but the reason that we disallow the third
isn't fully clear to me.

> >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> >  		DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> > 
> > This seems arbitrary, as really the choice on whether to use DAX happens at
> > filesystem mount time.  There's no guarantee that the in the first case
> > (double fsdax pmem) we were using the dax mount option with our file
> > system.
> > 
> > Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing around
> > it, and instead make the request queue's QUEUE_FLAG_DAX be our one source
> > of truth.  If this is set, we can use DAX, and if not, not.  We keep this
> > up to date in table_load() as the table changes.  As with regular block
> > devices the filesystem will then know at mount time whether DAX is a
> > supported mount option or not.
> 
> If you don't think you need this specialization that is fine.. but DM
> devices supporting suspending (as part of table reloads) so is there any
> risk that there will be inflight IO (say if someone did 'dmsetup suspend
> --noflush').. and then upon reload the device type changed out from
> under us.. anyway, I don't have all the PMEM DAX stuff paged back into
> my head yet.
> 
> But this just seems like we really shouldn't be allowing the
> transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED

I admit I don't fully understand all the ways that DM supports suspending and
resuming devices.  Is there actually a case where we can change out the DM
devices while I/O is running, and somehow end up trying to issue a DAX I/O to
a device that doesn't support DAX?

Toshi, do you have a test case that shows this somehow?
Kani, Toshi June 4, 2018, 11:49 p.m. UTC | #3
On Mon, 2018-06-04 at 17:24 -0600, Ross Zwisler wrote:
> On Fri, Jun 01, 2018 at 06:04:43PM -0400, Mike Snitzer wrote:

> > On Tue, May 29 2018 at  3:51pm -0400,

> > Ross Zwisler <ross.zwisler@linux.intel.com> wrote:

 :
> > For example, the following transition will currently fail:

> > 

> >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]

> >               DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED

> > 

> > but these will both succeed:

> > 

> >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]

> >                 DM_TYPE_BIO_BASED        DM_TYPE_BIO_BASED

> > 

> >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]

> >                 DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED

> 

> So we allow 2 of the 3 transitions, but the reason that we disallow the third

> isn't fully clear to me.


I need to refresh my memory for the code, but here is the intent.
https://lkml.org/lkml/2016/6/22/1000
https://lkml.org/lkml/2016/6/22/999


> > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]

> > >  		DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED

> > > 

> > > This seems arbitrary, as really the choice on whether to use DAX happens at

> > > filesystem mount time.  There's no guarantee that the in the first case

> > > (double fsdax pmem) we were using the dax mount option with our file

> > > system.

> > > 

> > > Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing around

> > > it, and instead make the request queue's QUEUE_FLAG_DAX be our one source

> > > of truth.  If this is set, we can use DAX, and if not, not.  We keep this

> > > up to date in table_load() as the table changes.  As with regular block

> > > devices the filesystem will then know at mount time whether DAX is a

> > > supported mount option or not.

> > 

> > If you don't think you need this specialization that is fine.. but DM

> > devices supporting suspending (as part of table reloads) so is there any

> > risk that there will be inflight IO (say if someone did 'dmsetup suspend

> > --noflush').. and then upon reload the device type changed out from

> > under us.. anyway, I don't have all the PMEM DAX stuff paged back into

> > my head yet.

> > 

> > But this just seems like we really shouldn't be allowing the

> > transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED

> 

> I admit I don't fully understand all the ways that DM supports suspending and

> resuming devices.  Is there actually a case where we can change out the DM

> devices while I/O is running, and somehow end up trying to issue a DAX I/O to

> a device that doesn't support DAX?

> 

> Toshi, do you have a test case that shows this somehow?


No, I did not test suspend/resume since HPE servers do not support it.

Thanks,
-Toshi
Mike Snitzer June 5, 2018, 12:46 a.m. UTC | #4
On Mon, Jun 04 2018 at  7:24pm -0400,
Ross Zwisler <ross.zwisler@linux.intel.com> wrote:

> On Fri, Jun 01, 2018 at 06:04:43PM -0400, Mike Snitzer wrote:
> > On Tue, May 29 2018 at  3:51pm -0400,
> > Ross Zwisler <ross.zwisler@linux.intel.com> wrote:
> > 
> > > The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
> > > devices that could possibly support DAX from transitioning into DM devices
> > > that cannot support DAX.
> > > 
> > > For example, the following transition will currently fail:
> > > 
> > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > > 	      DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> > > 
> > > but these will both succeed:
> > > 
> > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > >  	      DM_TYPE_DAX_BIO_BASED        DM_TYPE_BIO_BASED
> > > 
> > 
> > I fail to see how this succeeds given
> > drivers/md/dm-ioctl.c:is_valid_type() only allows transitions from:
> > 
> > DM_TYPE_BIO_BASED => DM_TYPE_DAX_BIO_BASED
> 
> Right, sorry, that was a typo.  What I meant was:
> 
> > For example, the following transition will currently fail:
> > 
> >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> >               DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> > 
> > but these will both succeed:
> > 
> >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> >                 DM_TYPE_BIO_BASED        DM_TYPE_BIO_BASED
> > 
> >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> >                 DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> 
> So we allow 2 of the 3 transitions, but the reason that we disallow the third
> isn't fully clear to me.
> 
> > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > >  		DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> > > 
> > > This seems arbitrary, as really the choice on whether to use DAX happens at
> > > filesystem mount time.  There's no guarantee that the in the first case
> > > (double fsdax pmem) we were using the dax mount option with our file
> > > system.
> > > 
> > > Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing around
> > > it, and instead make the request queue's QUEUE_FLAG_DAX be our one source
> > > of truth.  If this is set, we can use DAX, and if not, not.  We keep this
> > > up to date in table_load() as the table changes.  As with regular block
> > > devices the filesystem will then know at mount time whether DAX is a
> > > supported mount option or not.
> > 
> > If you don't think you need this specialization that is fine.. but DM
> > devices supporting suspending (as part of table reloads) so is there any
> > risk that there will be inflight IO (say if someone did 'dmsetup suspend
> > --noflush').. and then upon reload the device type changed out from
> > under us.. anyway, I don't have all the PMEM DAX stuff paged back into
> > my head yet.
> > 
> > But this just seems like we really shouldn't be allowing the
> > transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED
> 
> I admit I don't fully understand all the ways that DM supports suspending and
> resuming devices.  Is there actually a case where we can change out the DM
> devices while I/O is running, and somehow end up trying to issue a DAX I/O to
> a device that doesn't support DAX?

Yes, provided root permissions, it's very easy to dmsetup suspend/load/resume
to replace any portion of the DM device's logical address space to map to an
entirely different DM target (with a different backing store).  It's
pretty intrusive to do such things, but easily done and powerful.

Mike
Ross Zwisler June 6, 2018, 5:24 p.m. UTC | #5
On Mon, Jun 04, 2018 at 08:46:28PM -0400, Mike Snitzer wrote:
> On Mon, Jun 04 2018 at  7:24pm -0400,
> Ross Zwisler <ross.zwisler@linux.intel.com> wrote:
> 
> > On Fri, Jun 01, 2018 at 06:04:43PM -0400, Mike Snitzer wrote:
> > > On Tue, May 29 2018 at  3:51pm -0400,
> > > Ross Zwisler <ross.zwisler@linux.intel.com> wrote:
> > > 
> > > > The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
> > > > devices that could possibly support DAX from transitioning into DM devices
> > > > that cannot support DAX.
> > > > 
> > > > For example, the following transition will currently fail:
> > > > 
> > > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > > > 	      DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> > > > 
> > > > but these will both succeed:
> > > > 
> > > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > > >  	      DM_TYPE_DAX_BIO_BASED        DM_TYPE_BIO_BASED
> > > > 
> > > 
> > > I fail to see how this succeeds given
> > > drivers/md/dm-ioctl.c:is_valid_type() only allows transitions from:
> > > 
> > > DM_TYPE_BIO_BASED => DM_TYPE_DAX_BIO_BASED
> > 
> > Right, sorry, that was a typo.  What I meant was:
> > 
> > > For example, the following transition will currently fail:
> > > 
> > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > >               DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> > > 
> > > but these will both succeed:
> > > 
> > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > >                 DM_TYPE_BIO_BASED        DM_TYPE_BIO_BASED
> > > 
> > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > >                 DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> > 
> > So we allow 2 of the 3 transitions, but the reason that we disallow the third
> > isn't fully clear to me.
> > 
> > > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > > >  		DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> > > > 
> > > > This seems arbitrary, as really the choice on whether to use DAX happens at
> > > > filesystem mount time.  There's no guarantee that the in the first case
> > > > (double fsdax pmem) we were using the dax mount option with our file
> > > > system.
> > > > 
> > > > Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing around
> > > > it, and instead make the request queue's QUEUE_FLAG_DAX be our one source
> > > > of truth.  If this is set, we can use DAX, and if not, not.  We keep this
> > > > up to date in table_load() as the table changes.  As with regular block
> > > > devices the filesystem will then know at mount time whether DAX is a
> > > > supported mount option or not.
> > > 
> > > If you don't think you need this specialization that is fine.. but DM
> > > devices supporting suspending (as part of table reloads) so is there any
> > > risk that there will be inflight IO (say if someone did 'dmsetup suspend
> > > --noflush').. and then upon reload the device type changed out from
> > > under us.. anyway, I don't have all the PMEM DAX stuff paged back into
> > > my head yet.
> > > 
> > > But this just seems like we really shouldn't be allowing the
> > > transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED
> > 
> > I admit I don't fully understand all the ways that DM supports suspending and
> > resuming devices.  Is there actually a case where we can change out the DM
> > devices while I/O is running, and somehow end up trying to issue a DAX I/O to
> > a device that doesn't support DAX?
> 
> Yes, provided root permissions, it's very easy to dmsetup suspend/load/resume
> to replace any portion of the DM device's logical address space to map to an
> entirely different DM target (with a different backing store).  It's
> pretty intrusive to do such things, but easily done and powerful.
> 
> Mike

Hmmm, I don't understand how you can do this if there is a filesystem built on
your DM device?  Say you have a DM device, either striped or linear, that is
made up of 2 devices, and then you use dmsetup to replace one of the DM member
devices with something else.  You've just swapped out half of your LBA space
with new data, right? 

I don't understand how you can expect a filesystem built on the old DM device
to still work?  You especially can't do this while the filesystem is mounted -
all the in-core filesystem metadata would be garbage because the on-media data
would have totally changed.

So, when dealing with a filesystem, the flow must be:

unmount your filesystem
redo your DM device, changing out devices
reformat your filesystem on the new DM device
remount your filesystem

Right?  If so, then I don't see how a transition of the DM device from
supporting DAX to not supporting DAX or vice versa could harm us, as we can't
be doing filesystem I/O at the time when we change the composition of the DM
device.
Mike Snitzer June 6, 2018, 10:29 p.m. UTC | #6
On Wed, Jun 06 2018 at  1:24P -0400,
Ross Zwisler <ross.zwisler@linux.intel.com> wrote:

> On Mon, Jun 04, 2018 at 08:46:28PM -0400, Mike Snitzer wrote:
> > On Mon, Jun 04 2018 at  7:24pm -0400,
> > Ross Zwisler <ross.zwisler@linux.intel.com> wrote:
> > 
> > > On Fri, Jun 01, 2018 at 06:04:43PM -0400, Mike Snitzer wrote:
> > > > On Tue, May 29 2018 at  3:51pm -0400,
> > > > Ross Zwisler <ross.zwisler@linux.intel.com> wrote:
> > > > 
> > > > > The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
> > > > > devices that could possibly support DAX from transitioning into DM devices
> > > > > that cannot support DAX.
> > > > > 
> > > > > For example, the following transition will currently fail:
> > > > > 
> > > > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > > > > 	      DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> > > > > 
> > > > > but these will both succeed:
> > > > > 
> > > > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > > > >  	      DM_TYPE_DAX_BIO_BASED        DM_TYPE_BIO_BASED
> > > > > 
> > > > 
> > > > I fail to see how this succeeds given
> > > > drivers/md/dm-ioctl.c:is_valid_type() only allows transitions from:
> > > > 
> > > > DM_TYPE_BIO_BASED => DM_TYPE_DAX_BIO_BASED
> > > 
> > > Right, sorry, that was a typo.  What I meant was:
> > > 
> > > > For example, the following transition will currently fail:
> > > > 
> > > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > > >               DM_TYPE_DAX_BIO_BASED       DM_TYPE_BIO_BASED
> > > > 
> > > > but these will both succeed:
> > > > 
> > > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > > >                 DM_TYPE_BIO_BASED        DM_TYPE_BIO_BASED
> > > > 
> > > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > > >                 DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> > > 
> > > So we allow 2 of the 3 transitions, but the reason that we disallow the third
> > > isn't fully clear to me.
> > > 
> > > > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > > > >  		DM_TYPE_BIO_BASED        DM_TYPE_DAX_BIO_BASED
> > > > > 
> > > > > This seems arbitrary, as really the choice on whether to use DAX happens at
> > > > > filesystem mount time.  There's no guarantee that the in the first case
> > > > > (double fsdax pmem) we were using the dax mount option with our file
> > > > > system.
> > > > > 
> > > > > Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing around
> > > > > it, and instead make the request queue's QUEUE_FLAG_DAX be our one source
> > > > > of truth.  If this is set, we can use DAX, and if not, not.  We keep this
> > > > > up to date in table_load() as the table changes.  As with regular block
> > > > > devices the filesystem will then know at mount time whether DAX is a
> > > > > supported mount option or not.
> > > > 
> > > > If you don't think you need this specialization that is fine.. but DM
> > > > devices supporting suspending (as part of table reloads) so is there any
> > > > risk that there will be inflight IO (say if someone did 'dmsetup suspend
> > > > --noflush').. and then upon reload the device type changed out from
> > > > under us.. anyway, I don't have all the PMEM DAX stuff paged back into
> > > > my head yet.
> > > > 
> > > > But this just seems like we really shouldn't be allowing the
> > > > transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED
> > > 
> > > I admit I don't fully understand all the ways that DM supports suspending and
> > > resuming devices.  Is there actually a case where we can change out the DM
> > > devices while I/O is running, and somehow end up trying to issue a DAX I/O to
> > > a device that doesn't support DAX?
> > 
> > Yes, provided root permissions, it's very easy to dmsetup suspend/load/resume
> > to replace any portion of the DM device's logical address space to map to an
> > entirely different DM target (with a different backing store).  It's
> > pretty intrusive to do such things, but easily done and powerful.
> > 
> > Mike
> 
> Hmmm, I don't understand how you can do this if there is a filesystem built on
> your DM device?  Say you have a DM device, either striped or linear, that is
> made up of 2 devices, and then you use dmsetup to replace one of the DM member
> devices with something else.  You've just swapped out half of your LBA space
> with new data, right? 
> 
> I don't understand how you can expect a filesystem built on the old DM device
> to still work?  You especially can't do this while the filesystem is mounted -
> all the in-core filesystem metadata would be garbage because the on-media data
> would have totally changed.

Sure, it can cause you to no longer have access to the original backing
store (e.g. swapping in an "error" target instead of linear).

But this ability to suspend a DM device with a table that is using
"linear", load a new table (that uses a different target) into the
inactive table slot, and then resume to make the device active is how
things like snapshot support are achieved.  The "linear" target gets
replaced with "snapshot-origin", etc.
 
> So, when dealing with a filesystem, the flow must be:
> 
> unmount your filesystem
> redo your DM device, changing out devices
> reformat your filesystem on the new DM device
> remount your filesystem
> 
> Right?

No.

> If so, then I don't see how a transition of the DM device from
> supporting DAX to not supporting DAX or vice versa could harm us, as we can't
> be doing filesystem I/O at the time when we change the composition of the DM
> device.

Yes you can.  That is the entire point.

BTW, I'm not saying you have to change the backing store.  I'm saying
you _can_ if you would like.  Obviously if you remove the backing store
with the filesystem then you'll not be able to access the filesystem.

The point of all this is to say, DM table swaps can be good and
powerful.  But they can also be harmful.  The goal of preventing
transitions from DM_TYPE_DAX_BIO_BASED to DM_TYPE_BIO_BASED was to
shield users from doing obviously wrong things... things they might not
realize are problematic because of the relatively opaque nature of "DAX"
support.

Mike
diff mbox

Patch

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 5acf77de5945..d1f86d0bb2d0 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1292,15 +1292,6 @@  static int populate_table(struct dm_table *table,
 	return dm_table_complete(table);
 }
 
-static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new)
-{
-	if (cur == new ||
-	    (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED))
-		return true;
-
-	return false;
-}
-
 static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	int r;
@@ -1343,12 +1334,17 @@  static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
 			DMWARN("unable to set up device queue for new table.");
 			goto err_unlock_md_type;
 		}
-	} else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
+	} else if (dm_get_md_type(md) != dm_table_get_type(t)) {
 		DMWARN("can't change device type after initial table load.");
 		r = -EINVAL;
 		goto err_unlock_md_type;
 	}
 
+	if (dm_table_supports_dax(t))
+		blk_queue_flag_set(QUEUE_FLAG_DAX, md->queue);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_DAX, md->queue);
+
 	dm_unlock_md_type(md);
 
 	/* stage inactive table */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5bb994b012ca..ea5c4a1e6f5b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -866,7 +866,6 @@  EXPORT_SYMBOL(dm_consume_args);
 static bool __table_type_bio_based(enum dm_queue_mode table_type)
 {
 	return (table_type == DM_TYPE_BIO_BASED ||
-		table_type == DM_TYPE_DAX_BIO_BASED ||
 		table_type == DM_TYPE_NVME_BIO_BASED);
 }
 
@@ -888,7 +887,7 @@  static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
 	return bdev_dax_supported(dev->bdev, PAGE_SIZE);
 }
 
-static bool dm_table_supports_dax(struct dm_table *t)
+bool dm_table_supports_dax(struct dm_table *t)
 {
 	struct dm_target *ti;
 	unsigned i;
@@ -907,6 +906,7 @@  static bool dm_table_supports_dax(struct dm_table *t)
 
 	return true;
 }
+EXPORT_SYMBOL_GPL(dm_table_supports_dax);
 
 static bool dm_table_does_not_support_partial_completion(struct dm_table *t);
 
@@ -944,7 +944,6 @@  static int dm_table_determine_type(struct dm_table *t)
 			/* possibly upgrade to a variant of bio-based */
 			goto verify_bio_based;
 		}
-		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
 		BUG_ON(t->type == DM_TYPE_NVME_BIO_BASED);
 		goto verify_rq_based;
 	}
@@ -981,18 +980,14 @@  static int dm_table_determine_type(struct dm_table *t)
 verify_bio_based:
 		/* We must use this table as bio-based */
 		t->type = DM_TYPE_BIO_BASED;
-		if (dm_table_supports_dax(t) ||
-		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
-			t->type = DM_TYPE_DAX_BIO_BASED;
-		} else {
-			/* Check if upgrading to NVMe bio-based is valid or required */
-			tgt = dm_table_get_immutable_target(t);
-			if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) {
-				t->type = DM_TYPE_NVME_BIO_BASED;
-				goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */
-			} else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) {
-				t->type = DM_TYPE_NVME_BIO_BASED;
-			}
+
+		/* Check if upgrading to NVMe bio-based is valid or required */
+		tgt = dm_table_get_immutable_target(t);
+		if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) {
+			t->type = DM_TYPE_NVME_BIO_BASED;
+			goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */
+		} else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) {
+			t->type = DM_TYPE_NVME_BIO_BASED;
 		}
 		return 0;
 	}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9728433362d1..0ce06fa292fd 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2192,7 +2192,6 @@  int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		}
 		break;
 	case DM_TYPE_BIO_BASED:
-	case DM_TYPE_DAX_BIO_BASED:
 		dm_init_normal_md_queue(md);
 		blk_queue_make_request(md->queue, dm_make_request);
 		break;
@@ -2910,7 +2909,6 @@  struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
 
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
-	case DM_TYPE_DAX_BIO_BASED:
 	case DM_TYPE_NVME_BIO_BASED:
 		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 31fef7c34185..cbf3d7e7ed33 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -27,8 +27,7 @@  enum dm_queue_mode {
 	DM_TYPE_BIO_BASED	 = 1,
 	DM_TYPE_REQUEST_BASED	 = 2,
 	DM_TYPE_MQ_REQUEST_BASED = 3,
-	DM_TYPE_DAX_BIO_BASED	 = 4,
-	DM_TYPE_NVME_BIO_BASED	 = 5,
+	DM_TYPE_NVME_BIO_BASED	 = 4,
 };
 
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
@@ -460,6 +459,11 @@  void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback
  */
 void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type);
 
+/*
+ * Check to see if this target type and all table devices support DAX.
+ */
+bool dm_table_supports_dax(struct dm_table *t);
+
 /*
  * Finally call this to make the table ready for use.
  */