
[v4,12/16] libnvdimm, nfit: enable support for volatile ranges

Message ID 149875884190.10031.6179599135820559644.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived

Commit Message

Dan Williams June 29, 2017, 5:54 p.m. UTC
Allow volatile nfit ranges to participate in all the same infrastructure
provided for persistent memory regions. A resulting namespace
device will still be called "pmem", but the parent region type will be
"nd_volatile". This is in preparation for disabling the dax ->flush()
operation in the pmem driver when it is hosted on a volatile range.
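
As a rough illustration only (a sketch, not code from this patch; "ndns"
stands in for the pmem driver's struct nd_namespace_common pointer), a
follow-on change can key off the region type that this patch exposes:

        /*
         * Sketch: a namespace device's parent is its region, so the
         * driver can check whether the backing range is volatile and
         * skip cache-flush work when there is no persistence guarantee.
         */
        if (is_nd_volatile(ndns->dev.parent))
                return; /* nothing to flush durably */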

Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c        |    9 ++++++++-
 drivers/nvdimm/bus.c            |    8 ++++----
 drivers/nvdimm/core.c           |    2 +-
 drivers/nvdimm/dax_devs.c       |    2 +-
 drivers/nvdimm/dimm_devs.c      |    2 +-
 drivers/nvdimm/namespace_devs.c |    8 ++++----
 drivers/nvdimm/nd-core.h        |    9 +++++++++
 drivers/nvdimm/pfn_devs.c       |    4 ++--
 drivers/nvdimm/region_devs.c    |   27 ++++++++++++++-------------
 9 files changed, 44 insertions(+), 27 deletions(-)

Comments

Linda Knippers June 29, 2017, 7:20 p.m. UTC | #1
On 06/29/2017 01:54 PM, Dan Williams wrote:
> Allow volatile nfit ranges to participate in all the same infrastructure
> provided for persistent memory regions. 

This seems to be a bit more than "other rework".

> A resulting namespace
> device will still be called "pmem", but the parent region type will be
> "nd_volatile". 

What does this look like to a user or admin?  How does someone know that
/dev/pmemX is persistent memory and /dev/pmemY isn't?  Someone shouldn't
have to weed through /sys or ndctl or some other interface to figure that out
in the future if they don't have to do that today.  We have different
names for BTT namespaces.  Is there a different name for volatile ranges?

-- ljk

Dan Williams June 29, 2017, 8:42 p.m. UTC | #2
On Thu, Jun 29, 2017 at 12:20 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> On 06/29/2017 01:54 PM, Dan Williams wrote:
>> Allow volatile nfit ranges to participate in all the same infrastructure
>> provided for persistent memory regions.
>
> This seems to be a bit more than "other rework".

It's part of the rationale for having a "write_cache" control
attribute. There's only so much I can squeeze into the subject line,
but it is mentioned in the cover letter.

>> A resulting namespace
>> device will still be called "pmem", but the parent region type will be
>> "nd_volatile".
>
> What does this look like to a user or admin?  How does someone know that
> /dev/pmemX is persistent memory and /dev/pmemY isn't?  Someone shouldn't
> have to weed through /sys or ndctl or some other interface to figure that out
> in the future if they don't have to do that today.  We have different
> names for BTT namespaces.  Is there a different name for volatile ranges?

No, the block device name is still /dev/pmem. It's already the case
that you need to check behind just the name of the device to figure
out if something is actually volatile or not (see memmap=ss!nn
configurations), so I would not be in favor of changing the device
name if we think the memory might not be persistent. Moreover, I think
it was a mistake that we changed the device name for btt, and
I'm glad Matthew talked me out of making the same mistake with
memory-mode vs raw-mode pmem namespaces. So, the block device name
just reflects the driver of the block device, not the properties of
the device, just like all other block device instances.
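
For reference, the memmap pmem-emulation case looks like this on the kernel
command line (the numbers are only an example); the resulting block device
carries the same /dev/pmemX name with no persistence guarantee behind it:

    memmap=4G!12G    # expose 4G of ordinary RAM starting at 12G as
                     # legacy pmem, surfaced as a /dev/pmemX block device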
Linda Knippers June 29, 2017, 9:16 p.m. UTC | #3
On 06/29/2017 04:42 PM, Dan Williams wrote:
> On Thu, Jun 29, 2017 at 12:20 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>> On 06/29/2017 01:54 PM, Dan Williams wrote:
>>> Allow volatile nfit ranges to participate in all the same infrastructure
>>> provided for persistent memory regions.
>>
>> This seems to be a bit more than "other rework".
> 
> It's part of the rationale for having a "write_cache" control
> attribute. There's only so much I can squeeze into the subject line,
> but it is mentioned in the cover letter.
> 
>>> A resulting namespace
>>> device will still be called "pmem", but the parent region type will be
>>> "nd_volatile".
>>
>> What does this look like to a user or admin?  How does someone know that
>> /dev/pmemX is persistent memory and /dev/pmemY isn't?  Someone shouldn't
>> have to weed through /sys or ndctl or some other interface to figure that out
>> in the future if they don't have to do that today.  We have different
>> names for BTT namespaces.  Is there a different name for volatile ranges?
> 
> No, the block device name is still /dev/pmem. It's already the case
> that you need to check behind just the name of the device to figure
> out if something is actually volatile or not (see memmap=ss!nn
> configurations),

I don't have any experience with using memmap but if it's primarily used
by developers without NVDIMMs, they'd know it's not persistent.  Or is it
primarily used by administrators using non-NFIT NVDIMMs, in which case it
is persistent?

In any case, how exactly does one determine whether the device is volatile
or not?  I'm dumb so tell me the command line or API.

> so I would not be in favor of changing the device
> name if we think the memory might not be persistent. Moreover, I think
> it was a mistake that we changed the device name for btt, and
> I'm glad Matthew talked me out of making the same mistake with
> memory-mode vs raw-mode pmem namespaces. So, the block device name
> just reflects the driver of the block device, not the properties of
> the device, just like all other block device instances.

I agree that creating a new device name for BTT was perhaps a mistake,
although it would be good to know how to query a device property for
sector atomicity.  The difference between BTT vs. non-BTT seems less
critical to me than knowing in an obvious way whether the device is
actually persistent.

-- ljk
Dan Williams June 29, 2017, 9:50 p.m. UTC | #4
On Thu, Jun 29, 2017 at 2:16 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> On 06/29/2017 04:42 PM, Dan Williams wrote:
>> On Thu, Jun 29, 2017 at 12:20 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>> On 06/29/2017 01:54 PM, Dan Williams wrote:
>>>> Allow volatile nfit ranges to participate in all the same infrastructure
>>>> provided for persistent memory regions.
>>>
>>> This seems to be a bit more than "other rework".
>>
>> It's part of the rationale for having a "write_cache" control
>> attribute. There's only so much I can squeeze into the subject line,
>> but it is mentioned in the cover letter.
>>
>>>> A resulting namespace
>>>> device will still be called "pmem", but the parent region type will be
>>>> "nd_volatile".
>>>
>>> What does this look like to a user or admin?  How does someone know that
>>> /dev/pmemX is persistent memory and /dev/pmemY isn't?  Someone shouldn't
>>> have to weed through /sys or ndctl or some other interface to figure that out
>>> in the future if they don't have to do that today.  We have different
>>> names for BTT namespaces.  Is there a different name for volatile ranges?
>>
>> No, the block device name is still /dev/pmem. It's already the case
>> that you need to check behind just the name of the device to figure
>> out if something is actually volatile or not (see memmap=ss!nn
>> configurations),
>
> I don't have any experience with using memmap but if it's primarily used
> by developers without NVDIMMs, they'd know it's not persistent.  Or is it
> primarily used by administrators using non-NFIT NVDIMMs, in which case it
> is persistent?
>
> In any case, how exactly does one determine whether the device is volatile
> or not?  I'm dumb so tell me the command line or API.

Especially with memmap= or e820-defined memory it's unknowable from
the kernel. We don't know if the user is using it to cover for a
platform where there is no BIOS support for advertising persistent
memory, or if they have a BIOS that does not produce an NFIT as is the
case here [1], or if it is some developer just testing with no
expectation of persistence.

[1]: https://github.com/pmem/ndctl/issues/21

>> so I would not be in favor of changing the device
>> name if we think the memory might not be persistent. Moreover, I think
>> it was a mistake that we changed the device name for btt, and
>> I'm glad Matthew talked me out of making the same mistake with
>> memory-mode vs raw-mode pmem namespaces. So, the block device name
>> just reflects the driver of the block device, not the properties of
>> the device, just like all other block device instances.
>
> I agree that creating a new device name for BTT was perhaps a mistake,
> although it would be good to know how to query a device property for
> sector atomicity.  The difference between BTT vs. non-BTT seems less
> critical to me than knowing in an obvious way whether the device is
> actually persistent.

We don't have a good way to answer "actually persistent" in the
general case. I'm thinking of cases where the energy source on the
DIMM has died, or we trigger one of the conditions that leads to the
""unable to guarantee persistence of writes" message. The /dev/pmem
device name just tells you that your block device is hosted by a
driver that knows how to handle persistent memory constraints, but any
other details about the nature of the address range need to come from
other sources of information, and potentially information sources that
the kernel does not know about.
Linda Knippers June 29, 2017, 10:12 p.m. UTC | #5
On 6/29/2017 5:50 PM, Dan Williams wrote:
> On Thu, Jun 29, 2017 at 2:16 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>> On 06/29/2017 04:42 PM, Dan Williams wrote:
>>> On Thu, Jun 29, 2017 at 12:20 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>>> On 06/29/2017 01:54 PM, Dan Williams wrote:
>>>>> Allow volatile nfit ranges to participate in all the same infrastructure
>>>>> provided for persistent memory regions.
>>>>
>>>> This seems to be a bit more than "other rework".
>>>
>>> It's part of the rationale for having a "write_cache" control
>>> attribute. There's only so much I can squeeze into the subject line,
>>> but it is mentioned in the cover letter.
>>>
>>>>> A resulting namespace
>>>>> device will still be called "pmem", but the parent region type will be
>>>>> "nd_volatile".
>>>>
>>>> What does this look like to a user or admin?  How does someone know that
>>>> /dev/pmemX is persistent memory and /dev/pmemY isn't?  Someone shouldn't
>>>> have to weed through /sys or ndctl or some other interface to figure that out
>>>> in the future if they don't have to do that today.  We have different
>>>> names for BTT namespaces.  Is there a different name for volatile ranges?
>>>
>>> No, the block device name is still /dev/pmem. It's already the case
>>> that you need to check behind just the name of the device to figure
>>> out if something is actually volatile or not (see memmap=ss!nn
>>> configurations),
>>
>> I don't have any experience with using memmap but if it's primarily used
>> by developers without NVDIMMs, they'd know it's not persistent.  Or is it
>> primarily used by administrators using non-NFIT NVDIMMs, in which case it
>> is persistent?
>>
>> In any case, how exactly does one determine whether the device is volatile
>> or not?  I'm dumb so tell me the command line or API.
>
> Especially with memmap= or e820-defined memory it's unknowable from
> the kernel. We don't know if the user is using it to cover for a
> platform where there is no BIOS support for advertising persistent
> memory, or if they have a BIOS that does not produce an NFIT as is the
> case here [1], or if it is some developer just testing with no
> expectation of persistence.
>
> [1]: https://github.com/pmem/ndctl/issues/21

Ok.  I'm not really concerned about those cases but was asking since
you mentioned memmap as an example.

In any case, how does someone, like a system administrator, confirm that
a /dev/pmem device is a device that claims to be persistent?  Is there
a specific ndctl command line that would make it obvious what the Linux
device is on a device that claims to be persistent?

>>> so I would not be in favor of changing the device
>>> name if we think the memory might not be persistent. Moreover, I think
>>> it was a mistake that we changed the device name for btt, and
>>> I'm glad Matthew talked me out of making the same mistake with
>>> memory-mode vs raw-mode pmem namespaces. So, the block device name
>>> just reflects the driver of the block device, not the properties of
>>> the device, just like all other block device instances.
>>
>> I agree that creating a new device name for BTT was perhaps a mistake,
>> although it would be good to know how to query a device property for
>> sector atomicity.  The difference between BTT vs. non-BTT seems less
>> critical to me than knowing in an obvious way whether the device is
>> actually persistent.
>
> We don't have a good way to answer "actually persistent" in the
> general case. I'm thinking of cases where the energy source on the
> DIMM has died, or we trigger one of the conditions that leads to the
> ""unable to guarantee persistence of writes" message.

There are certainly error conditions that can happen and we've talked
about that bit in our health status discussions.  I think the question
of whether the device is healthy enough to be persistent right now
is different from whether the device is never ever going to be persistent.

> The /dev/pmem
> device name just tells you that your block device is hosted by a
> driver that knows how to handle persistent memory constraints, but any
> other details about the nature of the address range need to come from
> other sources of information, and potentially information sources that
> the kernel does not know about.

I'm asking about the other source of information in this specific case
where we're exposing pmem devices that will never ever be persistent.
Before we add these devices, I think we should be able to tell the user
how they can know the properties of the underlying device.

-- ljk
Dan Williams June 29, 2017, 10:28 p.m. UTC | #6
On Thu, Jun 29, 2017 at 3:12 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
[..]
>> The /dev/pmem
>> device name just tells you that your block device is hosted by a
>> driver that knows how to handle persistent memory constraints, but any
>> other details about the nature of the address range need to come from
>> other sources of information, and potentially information sources that
>> the kernel does not know about.
>
>
> I'm asking about the other source of information in this specific case
> where we're exposing pmem devices that will never ever be persistent.
> Before we add these devices, I think we should be able to tell the user
> how they can know the properties of the underlying device.

The only way I can think to indicate this is with a platform + device
whitelist in a tool like ndctl. Where the tool says "yes, these
xyz-vendor DIMMs on this abc-vendor platform with this 123-version
BIOS" is a known good persistent configuration.
Linda Knippers June 29, 2017, 10:35 p.m. UTC | #7
On 06/29/2017 06:28 PM, Dan Williams wrote:
> On Thu, Jun 29, 2017 at 3:12 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> [..]
>>> The /dev/pmem
>>> device name just tells you that your block device is hosted by a
>>> driver that knows how to handle persistent memory constraints, but any
>>> other details about the nature of the address range need to come from
>>> other sources of information, and potentially information sources that
>>> the kernel does not know about.
>>
>>
>> I'm asking about the other source of information in this specific case
>> where we're exposing pmem devices that will never ever be persistent.
>> Before we add these devices, I think we should be able to tell the user
>> how they can know the properties of the underlying device.
> 
> The only way I can think to indicate this is with a platform + device
> whitelist in a tool like ndctl. Where the tool says "yes, these
> xyz-vendor DIMMs on this abc-vendor platform with this 123-version
> BIOS" is a known good persistent configuration.

Doesn't the kernel know that something will never ever be persistent
because the NFIT type says NFIT_SPA_VDISK, NFIT_SPA_VCD, or NFIT_SPA_VOLATILE?
That's the case I'm asking about here.   In this patch, you're adding support
for creating /dev/pmem devices for those address ranges.  My question is
how the admin/user knows that those devices will never ever be persistent.

I don't think we need ndctl to know which vendors' hardware/firmware
actually works as advertised.

-- ljk
Dan Williams June 29, 2017, 10:43 p.m. UTC | #8
On Thu, Jun 29, 2017 at 3:35 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> On 06/29/2017 06:28 PM, Dan Williams wrote:
>> On Thu, Jun 29, 2017 at 3:12 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>> [..]
>>>> The /dev/pmem
>>>> device name just tells you that your block device is hosted by a
>>>> driver that knows how to handle persistent memory constraints, but any
>>>> other details about the nature of the address range need to come from
>>>> other sources of information, and potentially information sources that
>>>> the kernel does not know about.
>>>
>>>
>>> I'm asking about the other source of information in this specific case
>>> where we're exposing pmem devices that will never ever be persistent.
>>> Before we add these devices, I think we should be able to tell the user
>>> how they can know the properties of the underlying device.
>>
>> The only way I can think to indicate this is with a platform + device
>> whitelist in a tool like ndctl. Where the tool says "yes, these
>> xyz-vendor DIMMs on this abc-vendor platform with this 123-version
>> BIOS" is a known good persistent configuration.
>
> Doesn't the kernel know that something will never ever be persistent
> because the NFIT type says NFIT_SPA_VDISK, NFIT_SPA_VCD, or NFIT_SPA_VOLATILE?
> That's the case I'm asking about here.   In this patch, you're adding support
> for creating /dev/pmem devices for those address ranges.  My question is
> how the admin/user knows that those devices will never ever be persistent.

The parent region of the namespace will have a 'volatile' type:

# cat /sys/bus/nd/devices/region0/devtype
nd_volatile

> I don't think we need ndctl to know which vendors' hardware/firmware
> actually works as advertised.

Certification stickers are what I was thinking of, but I was missing your
direct question.
Linda Knippers June 29, 2017, 10:49 p.m. UTC | #9
On 6/29/2017 6:43 PM, Dan Williams wrote:
> On Thu, Jun 29, 2017 at 3:35 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>> On 06/29/2017 06:28 PM, Dan Williams wrote:
>>> On Thu, Jun 29, 2017 at 3:12 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>> [..]
>>>>> The /dev/pmem
>>>>> device name just tells you that your block device is hosted by a
>>>>> driver that knows how to handle persistent memory constraints, but any
>>>>> other details about the nature of the address range need to come from
>>>>> other sources of information, and potentially information sources that
>>>>> the kernel does not know about.
>>>>
>>>>
>>>> I'm asking about the other source of information in this specific case
>>>> where we're exposing pmem devices that will never ever be persistent.
>>>> Before we add these devices, I think we should be able to tell the user
>>>> how they can know the properties of the underlying device.
>>>
>>> The only way I can think to indicate this is with a platform + device
>>> whitelist in a tool like ndctl. Where the tool says "yes, these
>>> xyz-vendor DIMMs on this abc-vendor platform with this 123-version
>>> BIOS" is a known good persistent configuration.
>>
>> Doesn't the kernel know that something will never ever be persistent
>> because the NFIT type says NFIT_SPA_VDISK, NFIT_SPA_VCD, or NFIT_SPA_VOLATILE?
>> That's the case I'm asking about here.   In this patch, you're adding support
>> for creating /dev/pmem devices for those address ranges.  My question is
>> how the admin/user knows that those devices will never ever be persistent.
>
> The parent region of the namespace will have a 'volatile' type:
>
> # cat /sys/bus/nd/devices/region0/devtype
> nd_volatile

If all I know is the /dev/pmem device name, how do I find that?

-- ljk
>
>> I don't think we need ndctl to know which vendors' hardware/firmware
>> actually works as advertised.
>
> Certification stickers is what I was thinking, but I was missing your
> direction question.
>
Dan Williams June 29, 2017, 10:58 p.m. UTC | #10
On Thu, Jun 29, 2017 at 3:49 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>> The parent region of the namespace will have a 'volatile' type:
>>
>> # cat /sys/bus/nd/devices/region0/devtype
>> nd_volatile
>
>
> If all I know is the /dev/pmem device name, how do I find that?
>

    cat $(readlink -f /sys/block/pmem0/device)/../devtype

...this is where 'ndctl list' will get the information.
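
For example, to check every pmem block device at once (a small sketch built
on the same sysfs layout; adjust the glob as needed):

    # print each pmem block device together with its parent region's type
    for dev in /sys/block/pmem*; do
        region="$(readlink -f "$dev/device")/.."
        printf '%s: %s\n' "/dev/${dev##*/}" "$(cat "$region/devtype")"
    done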
Linda Knippers June 29, 2017, 11:14 p.m. UTC | #11
On 06/29/2017 06:58 PM, Dan Williams wrote:
> On Thu, Jun 29, 2017 at 3:49 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>> The parent region of the namespace will have a 'volatile' type:
>>>
>>> # cat /sys/bus/nd/devices/region0/devtype
>>> nd_volatile
>>
>>
>> If all I know is the /dev/pmem device name, how do I find that?
>>
> 
>     cat $(readlink -f /sys/block/pmem0/device)/../devtype
> 
> ...this is where 'ndctl list' will get the information.
> 

Thanks.

I think we need a section 4 pmem manpage like exists for
mem, sd, fd, md, etc., where we can put stuff like this, as well
as providing some overview information that will point people to
other resources.  I'll give that some thought unless there is one
already that I'm not finding.

-- ljk
Dan Williams June 30, 2017, 1:28 a.m. UTC | #12
On Thu, Jun 29, 2017 at 4:14 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> On 06/29/2017 06:58 PM, Dan Williams wrote:
>> On Thu, Jun 29, 2017 at 3:49 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
>>>> The parent region of the namespace will have a 'volatile' type:
>>>>
>>>> # cat /sys/bus/nd/devices/region0/devtype
>>>> nd_volatile
>>>
>>>
>>> If all I know is the /dev/pmem device name, how do I find that?
>>>
>>
>>     cat $(readlink -f /sys/block/pmem0/device)/../devtype
>>
>> ...this is where 'ndctl list' will get the information.
>>
>
> Thanks.
>
> I think we need a section 4 pmem manpage like exists for
> mem, sd, fd, md, etc., where we can put stuff like this, as well
> as providing some overview information that will point people to
> other resources.  I'll give that some thought unless there is one
> already that I'm not finding.
>

A "pmem" man page sounds like a great idea, I wasn't aware we even had
an sd man page.
Kani, Toshi July 5, 2017, 11:46 p.m. UTC | #13
On Thu, 2017-06-29 at 18:28 -0700, Dan Williams wrote:
> On Thu, Jun 29, 2017 at 4:14 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> > On 06/29/2017 06:58 PM, Dan Williams wrote:
> > > On Thu, Jun 29, 2017 at 3:49 PM, Linda Knippers <linda.knippers@hpe.com> wrote:
> > > > > The parent region of the namespace will have a 'volatile'
> > > > > type:
> > > > >
> > > > > # cat /sys/bus/nd/devices/region0/devtype
> > > > > nd_volatile
> > > >
> > > > If all I know is the /dev/pmem device name, how do I find that?
> > >
> > >     cat $(readlink -f /sys/block/pmem0/device)/../devtype
> > >
> > > ...this is where 'ndctl list' will get the information.
> >
> > Thanks.
> >
> > I think we need a section 4 pmem manpage like exists for
> > mem, sd, fd, md, etc., where we can put stuff like this, as well
> > as providing some overview information that will point people to
> > other resources.  I'll give that some thought unless there is one
> > already that I'm not finding.
>
> A "pmem" man page sounds like a great idea, I wasn't aware we even
> had an sd man page.

Sorry for being late to respond, but I agree with Linda that this 
naming policy is likely to confuse users.  I also care less about the
current users who use memmap option.  This case is pmem-emulation and
they know what they are doing.

Assuming block device interface is needed (in addition to device-dax)
for volatile range for use-cases like swap device, I wonder if user can
actually specify a right pmem device for swap from OS-install GUI when
both volatile and persistent block devices are listed as /dev/pmemN. 
Sometimes we are restricted with GUI menu.  Some users use GUI all the
time like Windows as well.

Can we differentiate the naming by adding 'v' like 'pmemNv' (if you
can't go with 'vmemN')?  I don't think having 's' for BTT was that bad.
 It's been helpful to tell users that these pmem devices are not byte-
addressable.  I also think that BTT for volatile range makes no sense
(unless emulated as persistent memory by memmap option).

Thanks,
-Toshi
Dan Williams July 6, 2017, 12:07 a.m. UTC | #14
On Wed, Jul 5, 2017 at 4:46 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Thu, 2017-06-29 at 18:28 -0700, Dan Williams wrote:
>> On Thu, Jun 29, 2017 at 4:14 PM, Linda Knippers <linda.knippers@hpe.c
>> om> wrote:
>> > On 06/29/2017 06:58 PM, Dan Williams wrote:
>> > > On Thu, Jun 29, 2017 at 3:49 PM, Linda Knippers <linda.knippers@h
>> > > pe.com> wrote:
>> > > > > The parent region of the namespace will have a 'volatile'
>> > > > > type:
>> > > > >
>> > > > > # cat /sys/bus/nd/devices/region0/devtype
>> > > > > nd_volatile
>> > > >
>> > > >
>> > > > If all I know is the /dev/pmem device name, how do I find that?
>> > > >
>> > >
>> > >     cat $(readlink -f /sys/block/pmem0/device)/../devtype
>> > >
>> > > ...this is where 'ndctl list' will get the information.
>> > >
>> >
>> > Thanks.
>> >
>> > I think we need a section 4 pmem manpage like exists for
>> > mem, sd, fd, md, etc., where we can put stuff like this, as well
>> > as providing some overview information that will point people to
>> > other resources.  I'll give that some thought unless there is one
>> > already that I'm not finding.
>> >
>>
>> A "pmem" man page sounds like a great idea, I wasn't aware we even
>> had an sd man page.
>
> Sorry for being late to respond, but I agree with Linda that this
> naming policy is likely to confuse users.  I also care less about the
> current users who use memmap option.  This case is pmem-emulation and
> they know what they are doing.
>
> Assuming block device interface is needed (in addition to device-dax)
> for volatile range for use-cases like swap device, I wonder if user can
> actually specify a right pmem device for swap from OS-install GUI when
> both volatile and persistent block devices are listed as /dev/pmemN.
> Sometimes we are restricted with GUI menu.  Some users use GUI all the
> time like Windows as well.
>
> Can we differentiate the naming by adding 'v' like 'pmemNv' (if you
> can't go with 'vmemN')?  I don't think having 's' for BTT was that bad.
>  It's been helpful to tell users that these pmem devices are not byte-
> addressable.  I also think that BTT for volatile range makes no sense
> (unless emulated as persistent memory by memmap option).

I'm more worried about sending the wrong signal the other way. That
users believe that the 'p' means definitely "persistent" when we have
no way to guarantee that.

If it was only memmap= that we had to worry about that would be one
thing, but we apparently have vendors that are shipping "e820-type-12
memory" as their NVDIMM solution [1].

We've also been shipping the policy that 'pmem' may front a volatile
range ever since v4.8 (commit c2f32acdf848 "acpi, nfit: treat virtual
ramdisk SPA as pmem region"). At least now we have the "nd_volatile"
region type. Any change of the device name now is potentially a
regression for environments that are already expecting /dev/pmemX.

As far as I know there are no OS installers that understand pmem. When
they do add support I think it would be straightforward to avoid
confusion and filter "volatile" hosted pmem devices from the install
target list. I don't see this being much different from the confusion
when users can not differentiate their 'sd' device between USB and
SATA. We have symlinks in /dev/disk/by* to make it easier to identify
storage devices, I think it makes sense to add udev rules for
identifying volatile pmem and not try to differentiate this in the
default kernel device name.
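
A rule along these lines is one way to do that (an untested sketch using
standard udev(7) match keys; the symlink name is only a suggestion):

    # sketch: link pmem block devices whose ancestor nd region is volatile
    SUBSYSTEM=="block", KERNEL=="pmem*", ATTRS{devtype}=="nd_volatile", SYMLINK+="pmem/volatile-%k"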

[1]: https://github.com/pmem/ndctl/issues/21
Kani, Toshi July 6, 2017, 1:17 a.m. UTC | #15
On Wed, 2017-07-05 at 17:07 -0700, Dan Williams wrote:
> On Wed, Jul 5, 2017 at 4:46 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> > On Thu, 2017-06-29 at 18:28 -0700, Dan Williams wrote:
 :
> >
> > Sorry for being late to respond, but I agree with Linda that this
> > naming policy is likely to confuse users.  I also care less about
> > the current users who use memmap option.  This case is pmem-
> > emulation and they know what they are doing.
> >
> > Assuming block device interface is needed (in addition to device-
> > dax) for volatile range for use-cases like swap device, I wonder if
> > user can actually specify a right pmem device for swap from OS-
> > install GUI when both volatile and persistent block devices are
> > listed as /dev/pmemN.  Sometimes we are restricted with GUI
> > menu.  Some users use GUI all the time like Windows as well.
> >
> > Can we differentiate the naming by adding 'v' like 'pmemNv' (if you
> > can't go with 'vmemN')?  I don't think having 's' for BTT was that
> > bad.  It's been helpful to tell users that these pmem devices are
> > not byte-addressable.  I also think that BTT for volatile range
> > makes no sense (unless emulated as persistent memory by memmap
> > option).
>
> I'm more worried about sending the wrong signal the other way. That
> users believe that the 'p' means definitely "persistent" when we have
> no way to guarantee that.

That's a valid point.  But isn't it vendors' responsibility to
guarantee it when their devices are described as persistent in one way
or the other in FW?

> If it was only memmap= that we had to worry about that would be one
> thing, but we apparently have vendors that are shipping "e820-type-12
> memory" as their NVDIMM solution [1].

Type-12 is persistent memory in a non-standard FW interface.  So, it
makes sense to show it as pmem.

> We've also been shipping the policy that 'pmem' may front a volatile
> range ever since v4.8 (commit c2f32acdf848 "acpi, nfit: treat virtual
> ramdisk SPA as pmem region"). At least now we have the "nd_volatile"
> region type. Any change of the device name now is potentially a
> regression for environments that are already expecting /dev/pmemX.

Hmm... I thought this was for mapping ISO image for booting.  Does it
get listed as a regular pmem device and allow user to modify its
content?  I doubt this case being used today, though.  (It was
prototyped on an HPE box and I can check the status if needed.)

> As far as I know there are no OS installers that understand pmem.

It's actually the other way around.  It was changed not to list pmem
devices since OS cannot boot from a pmem yet...

> When they do add support I think it would be straightforward to avoid
> confusion and filter "volatile" hosted pmem devices from the install
> target list. I don't see this being much different from the confusion
> when users can not differentiate their 'sd' device between USB and
> SATA.

Right, such changes can be made.  It was just an example that typical
use-cases today do not require additional step to check persistence of
block devices.

> We have symlinks in /dev/disk/by* to make it easier to identify
> storage devices, I think it makes sense to add udev rules for
> identifying volatile pmem and not try to differentiate this in the
> default kernel device name.

I am not sure what might be a good way, but I am concerned because a
single block device naming do not represent both volatile and
persistent media today.

Thanks,
-Toshi
Dan Williams July 6, 2017, 2:08 a.m. UTC | #16
[ adding Jeff, and Johannes ]

On Wed, Jul 5, 2017 at 6:17 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> On Wed, 2017-07-05 at 17:07 -0700, Dan Williams wrote:
[..]
>> We have symlinks in /dev/disk/by* to make it easier to identify
>> storage devices, I think it makes sense to add udev rules for
>> identifying volatile pmem and not try to differentiate this in the
>> default kernel device name.
>
> I am not sure what might be a good way, but I am concerned because a
> single block device naming do not represent both volatile and
> persistent media today.

We do have time to change this if we find out this is critical. Maybe
it's best to ask Linux distro folks what would be easier for them?

Jeff, Johannes, any thoughts on whether we should produce a
"/dev/vmemX" device when we know the backing memory range is volatile?
In this patch everything shows up as /dev/pmemX and you need to look
elsewhere in sysfs to find that the memory range is defined as
volatile by the NFIT.
Christoph Hellwig July 6, 2017, 2:11 a.m. UTC | #17
On Wed, Jul 05, 2017 at 07:08:54PM -0700, Dan Williams wrote:
> [ adding Jeff, and Johannes ]
> 
> On Wed, Jul 5, 2017 at 6:17 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
> > On Wed, 2017-07-05 at 17:07 -0700, Dan Williams wrote:
> [..]
> >> We have symlinks in /dev/disk/by* to make it easier to identify
> >> storage devices, I think it makes sense to add udev rules for
> >> identifying volatile pmem and not try to differentiate this in the
> >> default kernel device name.
> >
> > I am not sure what might be a good way, but I am concerned because a
> > single block device naming do not represent both volatile and
> > persistent media today.
> 
> We do have time to change this if we find out this is critical. Maybe
> it's best to ask Linux distro folks what would be easier for them?

I'm not really concerned about it, because SCSI devices for example
might not be persistent as well with scsi_debug, target_core_rd or
volatile qemu devices.

That being said I really don't understand the purpose of these volatile
nfit ranges.  Are they seen in the wild?  If yes what's the use case?
If not why do we even need to support them?
Oliver O'Halloran July 6, 2017, 2:53 a.m. UTC | #18
On Thu, Jul 6, 2017 at 12:11 PM, hch@lst.de <hch@lst.de> wrote:
> On Wed, Jul 05, 2017 at 07:08:54PM -0700, Dan Williams wrote:
>> [ adding Jeff, and Johannes ]
>>
>> On Wed, Jul 5, 2017 at 6:17 PM, Kani, Toshimitsu <toshi.kani@hpe.com> wrote:
>> > On Wed, 2017-07-05 at 17:07 -0700, Dan Williams wrote:
>> [..]
>> >> We have symlinks in /dev/disk/by* to make it easier to identify
>> >> storage devices, I think it makes sense to add udev rules for
>> >> identifying volatile pmem and not try to differentiate this in the
>> >> default kernel device name.
>> >
>> > I am not sure what might be a good way, but I am concerned because a
>> > single block device naming do not represent both volatile and
>> > persistent media today.
>>
>> We do have time to change this if we find out this is critical. Maybe
>> it's best to ask Linux distro folks what would be easier for them?
>
> I'm not really concerned about it, because SCSI devices for example
> might not be persistent as well with scsi_debug, target_core_rd or
> volatile qemu devices.
>
> That being said I really don't understand the purpose of these volatile
> nfit ranges.  Are they seen in the wild?  If yes what's the use case?
> If not why do we even need to support them?

The main use case is provisioning install media for bare metal
servers. Traditionally that's been handled by having the BMC emulate a
USB CD drive. Unfortunately, most BMCs have limited CPU, limited
memory and a wet-string network connection so a host based alternative
is nice to have.
Christoph Hellwig July 6, 2017, 2:56 a.m. UTC | #19
On Thu, Jul 06, 2017 at 12:53:13PM +1000, Oliver wrote:
> The main use case is provisioning install media for bare metal
> servers. Traditionally that's been handled by having the BMC emulate a
> USB CD drive. Unfortunately, most BMCs have limited CPU, limited
> memory and a wet-string network connection so a host based alternative
> is nice to have.

If they are CD replacement they should be marked as read-only, which
would solve any concerns about them being volatile or not.

Patch

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ac2436538b7e..60d1ca149cc1 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2227,6 +2227,13 @@  static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa)
 		nfit_spa_type(spa) == NFIT_SPA_PCD);
 }
 
+static bool nfit_spa_is_volatile(struct acpi_nfit_system_address *spa)
+{
+	return (nfit_spa_type(spa) == NFIT_SPA_VDISK ||
+		nfit_spa_type(spa) == NFIT_SPA_VCD   ||
+		nfit_spa_type(spa) == NFIT_SPA_VOLATILE);
+}
+
 static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_spa *nfit_spa)
 {
@@ -2301,7 +2308,7 @@  static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 				ndr_desc);
 		if (!nfit_spa->nd_region)
 			rc = -ENOMEM;
-	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
+	} else if (nfit_spa_is_volatile(spa)) {
 		nfit_spa->nd_region = nvdimm_volatile_region_create(nvdimm_bus,
 				ndr_desc);
 		if (!nfit_spa->nd_region)
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index e9361bffe5ee..4cfba534814b 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -38,13 +38,13 @@  static int to_nd_device_type(struct device *dev)
 {
 	if (is_nvdimm(dev))
 		return ND_DEVICE_DIMM;
-	else if (is_nd_pmem(dev))
+	else if (is_memory(dev))
 		return ND_DEVICE_REGION_PMEM;
 	else if (is_nd_blk(dev))
 		return ND_DEVICE_REGION_BLK;
 	else if (is_nd_dax(dev))
 		return ND_DEVICE_DAX_PMEM;
-	else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent))
+	else if (is_nd_region(dev->parent))
 		return nd_region_to_nstype(to_nd_region(dev->parent));
 
 	return 0;
@@ -56,7 +56,7 @@  static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 	 * Ensure that region devices always have their numa node set as
 	 * early as possible.
 	 */
-	if (is_nd_pmem(dev) || is_nd_blk(dev))
+	if (is_nd_region(dev))
 		set_dev_node(dev, to_nd_region(dev)->numa_node);
 	return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT,
 			to_nd_device_type(dev));
@@ -65,7 +65,7 @@  static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 static struct module *to_bus_provider(struct device *dev)
 {
 	/* pin bus providers while regions are enabled */
-	if (is_nd_pmem(dev) || is_nd_blk(dev)) {
+	if (is_nd_region(dev)) {
 		struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
 
 		return nvdimm_bus->nd_desc->module;
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 2dee908e4bae..22e3ef463401 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -504,7 +504,7 @@  void nvdimm_badblocks_populate(struct nd_region *nd_region,
 	struct nvdimm_bus *nvdimm_bus;
 	struct list_head *poison_list;
 
-	if (!is_nd_pmem(&nd_region->dev)) {
+	if (!is_memory(&nd_region->dev)) {
 		dev_WARN_ONCE(&nd_region->dev, 1,
 				"%s only valid for pmem regions\n", __func__);
 		return;
diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
index c1b6556aea6e..a304983ac417 100644
--- a/drivers/nvdimm/dax_devs.c
+++ b/drivers/nvdimm/dax_devs.c
@@ -89,7 +89,7 @@  struct device *nd_dax_create(struct nd_region *nd_region)
 	struct device *dev = NULL;
 	struct nd_dax *nd_dax;
 
-	if (!is_nd_pmem(&nd_region->dev))
+	if (!is_memory(&nd_region->dev))
 		return NULL;
 
 	nd_dax = nd_dax_alloc(nd_region);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 6a1e7a3c0c17..f0d1b7e5de01 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -419,7 +419,7 @@  int alias_dpa_busy(struct device *dev, void *data)
 	struct resource *res;
 	int i;
 
-	if (!is_nd_pmem(dev))
+	if (!is_memory(dev))
 		return 0;
 
 	nd_region = to_nd_region(dev);
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 4e9261ef8a95..57724da484d0 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -112,7 +112,7 @@  static int is_uuid_busy(struct device *dev, void *data)
 
 static int is_namespace_uuid_busy(struct device *dev, void *data)
 {
-	if (is_nd_pmem(dev) || is_nd_blk(dev))
+	if (is_nd_region(dev))
 		return device_for_each_child(dev, data, is_uuid_busy);
 	return 0;
 }
@@ -783,7 +783,7 @@  static int __reserve_free_pmem(struct device *dev, void *data)
 	struct nd_label_id label_id;
 	int i;
 
-	if (!is_nd_pmem(dev))
+	if (!is_memory(dev))
 		return 0;
 
 	nd_region = to_nd_region(dev);
@@ -1872,7 +1872,7 @@  static struct device *nd_namespace_pmem_create(struct nd_region *nd_region)
 	struct resource *res;
 	struct device *dev;
 
-	if (!is_nd_pmem(&nd_region->dev))
+	if (!is_memory(&nd_region->dev))
 		return NULL;
 
 	nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
@@ -2152,7 +2152,7 @@  static struct device **scan_labels(struct nd_region *nd_region)
 		}
 		dev->parent = &nd_region->dev;
 		devs[count++] = dev;
-	} else if (is_nd_pmem(&nd_region->dev)) {
+	} else if (is_memory(&nd_region->dev)) {
 		/* clean unselected labels */
 		for (i = 0; i < nd_region->ndr_mappings; i++) {
 			struct list_head *l, *e;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 4c4bd209e725..86bc19ae30da 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -64,7 +64,16 @@  struct blk_alloc_info {
 
 bool is_nvdimm(struct device *dev);
 bool is_nd_pmem(struct device *dev);
+bool is_nd_volatile(struct device *dev);
 bool is_nd_blk(struct device *dev);
+static inline bool is_nd_region(struct device *dev)
+{
+	return is_nd_pmem(dev) || is_nd_blk(dev) || is_nd_volatile(dev);
+}
+static inline bool is_memory(struct device *dev)
+{
+	return is_nd_pmem(dev) || is_nd_volatile(dev);
+}
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index a6c403600d19..5929eb65cee3 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -331,7 +331,7 @@  struct device *nd_pfn_create(struct nd_region *nd_region)
 	struct nd_pfn *nd_pfn;
 	struct device *dev;
 
-	if (!is_nd_pmem(&nd_region->dev))
+	if (!is_memory(&nd_region->dev))
 		return NULL;
 
 	nd_pfn = nd_pfn_alloc(nd_region);
@@ -354,7 +354,7 @@  int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 	if (!pfn_sb || !ndns)
 		return -ENODEV;
 
-	if (!is_nd_pmem(nd_pfn->dev.parent))
+	if (!is_memory(nd_pfn->dev.parent))
 		return -ENODEV;
 
 	if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0))
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 41b4cdf5dea8..53a64a16aba4 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -168,6 +168,11 @@  bool is_nd_blk(struct device *dev)
 	return dev ? dev->type == &nd_blk_device_type : false;
 }
 
+bool is_nd_volatile(struct device *dev)
+{
+	return dev ? dev->type == &nd_volatile_device_type : false;
+}
+
 struct nd_region *to_nd_region(struct device *dev)
 {
 	struct nd_region *nd_region = container_of(dev, struct nd_region, dev);
@@ -214,7 +219,7 @@  EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
  */
 int nd_region_to_nstype(struct nd_region *nd_region)
 {
-	if (is_nd_pmem(&nd_region->dev)) {
+	if (is_memory(&nd_region->dev)) {
 		u16 i, alias;
 
 		for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
@@ -242,7 +247,7 @@  static ssize_t size_show(struct device *dev,
 	struct nd_region *nd_region = to_nd_region(dev);
 	unsigned long long size = 0;
 
-	if (is_nd_pmem(dev)) {
+	if (is_memory(dev)) {
 		size = nd_region->ndr_size;
 	} else if (nd_region->ndr_mappings == 1) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[0];
@@ -307,7 +312,7 @@  static ssize_t set_cookie_show(struct device *dev,
 	struct nd_region *nd_region = to_nd_region(dev);
 	struct nd_interleave_set *nd_set = nd_region->nd_set;
 
-	if (is_nd_pmem(dev) && nd_set)
+	if (is_memory(dev) && nd_set)
 		/* pass, should be precluded by region_visible */;
 	else
 		return -ENXIO;
@@ -334,7 +339,7 @@  resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
 		if (!ndd)
 			return 0;
 
-		if (is_nd_pmem(&nd_region->dev)) {
+		if (is_memory(&nd_region->dev)) {
 			available += nd_pmem_available_dpa(nd_region,
 					nd_mapping, &overlap);
 			if (overlap > blk_max_overlap) {
@@ -520,10 +525,10 @@  static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	struct nd_interleave_set *nd_set = nd_region->nd_set;
 	int type = nd_region_to_nstype(nd_region);
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr)
+	if (!is_memory(dev) && a == &dev_attr_pfn_seed.attr)
 		return 0;
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr)
+	if (!is_memory(dev) && a == &dev_attr_dax_seed.attr)
 		return 0;
 
 	if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
@@ -551,7 +556,7 @@  static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 				|| type == ND_DEVICE_NAMESPACE_BLK)
 			&& a == &dev_attr_available_size.attr)
 		return a->mode;
-	else if (is_nd_pmem(dev) && nd_set)
+	else if (is_memory(dev) && nd_set)
 		return a->mode;
 
 	return 0;
@@ -603,7 +608,7 @@  static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 {
 	struct nd_region *nd_region;
 
-	if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) {
+	if (!probe && is_nd_region(dev)) {
 		int i;
 
 		nd_region = to_nd_region(dev);
@@ -621,12 +626,8 @@  static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 			if (ndd)
 				atomic_dec(&nvdimm->busy);
 		}
-
-		if (is_nd_pmem(dev))
-			return;
 	}
-	if (dev->parent && (is_nd_blk(dev->parent) || is_nd_pmem(dev->parent))
-			&& probe) {
+	if (dev->parent && is_nd_region(dev->parent) && probe) {
 		nd_region = to_nd_region(dev->parent);
 		nvdimm_bus_lock(dev);
 		if (nd_region->ns_seed == dev)