[RFC,10/10] nvdimm/e820: add multiple namespaces support

Message ID 20200110190313.17144-11-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series [RFC,01/10] mm: Add pmd support for _PAGE_SPECIAL

Commit Message

Joao Martins Jan. 10, 2020, 7:03 p.m. UTC
Users can define regions with 'memmap=size!offset', which in turn
creates legacy PMEM devices. But because these are label-less
NVDIMM devices, we only get one namespace for the whole device.
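
For example (the size and offset here are only illustrative), booting
with:

    memmap=4G!16G

reserves 4G of RAM at offset 16G as an e820 "persistent" (type 12)
range, which this driver then surfaces as a legacy PMEM device
(typically /dev/pmem0).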

Add support for multiple namespaces by adding ndctl control
support and exposing a minimal set of commands
(ND_CMD_GET_CONFIG_SIZE, ND_CMD_GET_CONFIG_DATA,
ND_CMD_SET_CONFIG_DATA), alongside the NDD_ALIASING flag, since
we can now store labels.

Initialization is a little different: we allocate and register an
nvdimm bus with an @nvdimm_descriptor, which we use to locate
where we keep our label storage area. The config data
get/set/size operations then simply memcpy to/from this area.

An equivalent approach can also be found in the NFIT tests, which
emulate the same thing.
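
To illustrate the intended interface, here is a userspace sketch (not
part of this patch; the /dev/nmem0 name and the omitted error handling
are illustrative) that queries the new label area:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ndctl.h>

    int main(void)
    {
            struct nd_cmd_get_config_size cmd = { 0 };
            /* the emulated DIMM shows up as a /dev/nmemX device */
            int fd = open("/dev/nmem0", O_RDWR);

            if (fd < 0)
                    return 1;
            /* routed through e820_ndctl() -> e820_get_config_size() */
            if (ioctl(fd, ND_IOCTL_GET_CONFIG_SIZE, &cmd) == 0)
                    printf("config_size=%u max_xfer=%u\n",
                           cmd.config_size, cmd.max_xfer);
            close(fd);
            return 0;
    }

With this patch, config_size reports LABEL_SIZE (128K) and max_xfer
SZ_4K.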

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/nvdimm/e820.c | 212 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 191 insertions(+), 21 deletions(-)

Comments

Barret Rhoden Feb. 4, 2020, 3:28 p.m. UTC | #1
Hi -

On 1/10/20 2:03 PM, Joao Martins wrote:
> Users can define regions with 'memmap=size!offset', which in turn
> creates legacy PMEM devices. But because these are label-less
> NVDIMM devices, we only get one namespace for the whole device.
> 
> Add support for multiple namespaces by adding ndctl control
> support and exposing a minimal set of commands
> (ND_CMD_GET_CONFIG_SIZE, ND_CMD_GET_CONFIG_DATA,
> ND_CMD_SET_CONFIG_DATA), alongside the NDD_ALIASING flag, since
> we can now store labels.

FWIW, I like this a lot.  If we move away from using memmap in favor of 
efi_fake_mem, ideally we'd have the same support for full-fledged 
pmem/dax regions and namespaces that this patch brings.
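
(For reference, that would be along the lines of booting with
'efi_fake_mem=4G@16G:0x40000', i.e. tagging a range with the
EFI_MEMORY_SP attribute so it comes up as a dax region; the size,
address, and attribute value here are illustrative.)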

Thanks,
Barret


Dan Williams Feb. 4, 2020, 4:44 p.m. UTC | #2
On Tue, Feb 4, 2020 at 7:30 AM Barret Rhoden <brho@google.com> wrote:
>
> Hi -
>
> On 1/10/20 2:03 PM, Joao Martins wrote:
> > Users can define regions with 'memmap=size!offset', which in turn
> > creates legacy PMEM devices. But because these are label-less
> > NVDIMM devices, we only get one namespace for the whole device.
> >
> > Add support for multiple namespaces by adding ndctl control
> > support and exposing a minimal set of commands
> > (ND_CMD_GET_CONFIG_SIZE, ND_CMD_GET_CONFIG_DATA,
> > ND_CMD_SET_CONFIG_DATA), alongside the NDD_ALIASING flag, since
> > we can now store labels.
>
> FWIW, I like this a lot.  If we move away from using memmap in favor of
> efi_fake_mem, ideally we'd have the same support for full-fledged
> pmem/dax regions and namespaces that this patch brings.

No, efi_fake_mem only supports creating dax-regions. What's the use
case that can't be satisfied by just specifying multiple memmap=
ranges?
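
I.e., something like:

    memmap=2G!16G memmap=2G!18G

which, illustratively, yields two independent legacy PMEM devices,
/dev/pmem0 and /dev/pmem1.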
Barret Rhoden Feb. 4, 2020, 6:20 p.m. UTC | #3
Hi -

On 2/4/20 11:44 AM, Dan Williams wrote:
> On Tue, Feb 4, 2020 at 7:30 AM Barret Rhoden <brho@google.com> wrote:
>>
>> Hi -
>>
>> On 1/10/20 2:03 PM, Joao Martins wrote:
>>> Users can define regions with 'memmap=size!offset', which in turn
>>> creates legacy PMEM devices. But because these are label-less
>>> NVDIMM devices, we only get one namespace for the whole device.
>>>
>>> Add support for multiple namespaces by adding ndctl control
>>> support and exposing a minimal set of commands
>>> (ND_CMD_GET_CONFIG_SIZE, ND_CMD_GET_CONFIG_DATA,
>>> ND_CMD_SET_CONFIG_DATA), alongside the NDD_ALIASING flag, since
>>> we can now store labels.
>>
>> FWIW, I like this a lot.  If we move away from using memmap in favor of
>> efi_fake_mem, ideally we'd have the same support for full-fledged
>> pmem/dax regions and namespaces that this patch brings.
> 
> No, efi_fake_mem only supports creating dax-regions. What's the use
> case that can't be satisfied by just specifying multiple memmap=
> ranges?

I'd like to be able to create and destroy dax regions on the fly.  In 
particular, I want to run guest VMs using the dax files for guest 
memory, but I don't know at boot time how many VMs I'll have, or what 
their sizes are.  Ideally, I'd have separate files for each VM, instead 
of a single /dev/dax.

I currently do this with fs-dax with one big memmap region (ext4 on 
/dev/pmem0), and I use the file system to handle the 
creation/destruction/resizing and metadata management.  But since fs-dax 
won't work with device pass-through, I started looking at dev-dax, with 
the expectation that I'd need some software to manage the memory (i.e. 
allocation).  That led me to ndctl, which seems to need namespace labels 
to have the level of control I was looking for.
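
With labels (as in this patch), I'd expect the flow to look roughly
like this sketch, where the region/namespace names and sizes are
illustrative:

    # carve a per-VM devdax namespace out of the labeled region
    ndctl create-namespace --region=region0 --mode=devdax --size=16G

    # tear it down when the VM goes away
    ndctl destroy-namespace --force namespace0.0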

Thanks,

Barret
Joao Martins Feb. 4, 2020, 7:24 p.m. UTC | #4
On 2/4/20 6:20 PM, Barret Rhoden wrote:
> Hi -
> 
> On 2/4/20 11:44 AM, Dan Williams wrote:
>> On Tue, Feb 4, 2020 at 7:30 AM Barret Rhoden <brho@google.com> wrote:
>>>
>>> Hi -
>>>
>>> On 1/10/20 2:03 PM, Joao Martins wrote:
>>>> Users can define regions with 'memmap=size!offset', which in turn
>>>> creates legacy PMEM devices. But because these are label-less
>>>> NVDIMM devices, we only get one namespace for the whole device.
>>>>
>>>> Add support for multiple namespaces by adding ndctl control
>>>> support and exposing a minimal set of commands
>>>> (ND_CMD_GET_CONFIG_SIZE, ND_CMD_GET_CONFIG_DATA,
>>>> ND_CMD_SET_CONFIG_DATA), alongside the NDD_ALIASING flag, since
>>>> we can now store labels.
>>>
>>> FWIW, I like this a lot.  If we move away from using memmap in favor of
>>> efi_fake_mem, ideally we'd have the same support for full-fledged
>>> pmem/dax regions and namespaces that this patch brings.
>>
>> No, efi_fake_mem only supports creating dax-regions. What's the use
>> case that can't be satisfied by just specifying multiple memmap=
>> ranges?
> 
> I'd like to be able to create and destroy dax regions on the fly.  In 
> particular, I want to run guest VMs using the dax files for guest 
> memory, but I don't know at boot time how many VMs I'll have, or what 
> their sizes are.  Ideally, I'd have separate files for each VM, instead 
> of a single /dev/dax.
> 
> I currently do this with fs-dax with one big memmap region (ext4 on 
> /dev/pmem0), and I use the file system to handle the 
> creation/destruction/resizing and metadata management.  But since fs-dax 
> won't work with device pass-through, I started looking at dev-dax, with 
> the expectation that I'd need some software to manage the memory (i.e. 
> allocation).  That led me to ndctl, which seems to need namespace labels 
> to have the level of control I was looking for.
> 

Indeed this is the intent of the patch.

As Barret mentioned, memmap= is limited to one namespace covering the
entire region, and this patch would fix that (regardless of namespace
mode). Otherwise we would need to know in advance the number of guests
and their exact sizes, which would be somewhat inflexible.

But given that it's 'pmem emulation', I thought it was OK to twist the
label-less aspect of nd_e820 (unless there's hardware out there that
does this?).

If Dan agrees, I can continue with the patch.
Dan Williams Feb. 4, 2020, 9:43 p.m. UTC | #5
On Tue, Feb 4, 2020 at 10:20 AM Barret Rhoden <brho@google.com> wrote:
>
> Hi -
>
> On 2/4/20 11:44 AM, Dan Williams wrote:
> > On Tue, Feb 4, 2020 at 7:30 AM Barret Rhoden <brho@google.com> wrote:
> >>
> >> Hi -
> >>
> >> On 1/10/20 2:03 PM, Joao Martins wrote:
> >>> Users can define regions with 'memmap=size!offset', which in turn
> >>> creates legacy PMEM devices. But because these are label-less
> >>> NVDIMM devices, we only get one namespace for the whole device.
> >>>
> >>> Add support for multiple namespaces by adding ndctl control
> >>> support and exposing a minimal set of commands
> >>> (ND_CMD_GET_CONFIG_SIZE, ND_CMD_GET_CONFIG_DATA,
> >>> ND_CMD_SET_CONFIG_DATA), alongside the NDD_ALIASING flag, since
> >>> we can now store labels.
> >>
> >> FWIW, I like this a lot.  If we move away from using memmap in favor of
> >> efi_fake_mem, ideally we'd have the same support for full-fledged
> >> pmem/dax regions and namespaces that this patch brings.
> >
> > No, efi_fake_mem only supports creating dax-regions. What's the use
> > case that can't be satisfied by just specifying multiple memmap=
> > ranges?
>
> I'd like to be able to create and destroy dax regions on the fly.  In
> particular, I want to run guest VMs using the dax files for guest
> memory, but I don't know at boot time how many VMs I'll have, or what
> their sizes are.  Ideally, I'd have separate files for each VM, instead
> of a single /dev/dax.
>
> I currently do this with fs-dax with one big memmap region (ext4 on
> /dev/pmem0), and I use the file system to handle the
> creation/destruction/resizing and metadata management.  But since fs-dax
> won't work with device pass-through, I started looking at dev-dax, with
> the expectation that I'd need some software to manage the memory (i.e.
> allocation).  That led me to ndctl, which seems to need namespace labels
> to have the level of control I was looking for.

Ah, got it, you only ended up wanting namespace labels because
there was no other way to carve up device-dax. That's changing as part
of the efi_fake_mem= enabling, and I have a patch set in the works to
allow discontiguous sub-divisions of a device-dax range. Note that
this branch rebases frequently:

https://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git/log/?h=libnvdimm-pending


Barret Rhoden Feb. 4, 2020, 9:57 p.m. UTC | #6
On 2/4/20 4:43 PM, Dan Williams wrote:
> Ah, got it, you only ended up wanting namespace labels because
> there was no other way to carve up device-dax. That's changing as part
> of the efi_fake_mem= enabling, and I have a patch set in the works to
> allow discontiguous sub-divisions of a device-dax range. Note that
> this branch rebases frequently:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git/log/?h=libnvdimm-pending

Cool, thanks.  I'll check it out!

Barret

Patch

diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index e02f60ad6c99..36fbff3d7110 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -7,14 +7,21 @@ 
 #include <linux/memory_hotplug.h>
 #include <linux/libnvdimm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/ndctl.h>
+#include <linux/nd.h>
 
-static int e820_pmem_remove(struct platform_device *pdev)
-{
-	struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev);
+#define LABEL_SIZE SZ_128K
 
-	nvdimm_bus_unregister(nvdimm_bus);
-	return 0;
-}
+struct e820_descriptor {
+	struct nd_interleave_set nd_set;
+	struct nvdimm_bus_descriptor nd_desc;
+	void *label;
+	unsigned char cookie1[16];
+	unsigned char cookie2[16];
+	struct nvdimm_bus *nvdimm_bus;
+	struct nvdimm *nvdimm;
+};
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 static int e820_range_to_nid(resource_size_t addr)
@@ -28,43 +35,206 @@  static int e820_range_to_nid(resource_size_t addr)
 }
 #endif
 
+static int e820_get_config_size(struct nd_cmd_get_config_size *nd_cmd,
+				unsigned int buf_len)
+{
+	if (buf_len < sizeof(*nd_cmd))
+		return -EINVAL;
+
+	nd_cmd->status = 0;
+	nd_cmd->config_size = LABEL_SIZE;
+	nd_cmd->max_xfer = SZ_4K;
+
+	return 0;
+}
+
+static int e820_get_config_data(struct nd_cmd_get_config_data_hdr
+		*nd_cmd, unsigned int buf_len, void *label)
+{
+	unsigned int len, offset = nd_cmd->in_offset;
+	int rc;
+
+	if (buf_len < sizeof(*nd_cmd))
+		return -EINVAL;
+	if (offset >= LABEL_SIZE)
+		return -EINVAL;
+	if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len)
+		return -EINVAL;
+
+	nd_cmd->status = 0;
+	len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+	memcpy(nd_cmd->out_buf, label + offset, len);
+	rc = buf_len - sizeof(*nd_cmd) - len;
+
+	return rc;
+}
+
+static int e820_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
+		unsigned int buf_len, void *label)
+{
+	unsigned int len, offset = nd_cmd->in_offset;
+	u32 *status;
+	int rc;
+
+	if (buf_len < sizeof(*nd_cmd))
+		return -EINVAL;
+	if (offset >= LABEL_SIZE)
+		return -EINVAL;
+	if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len)
+		return -EINVAL;
+
+	status = (void *)nd_cmd + nd_cmd->in_length + sizeof(*nd_cmd);
+	*status = 0;
+	len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+	memcpy(label + offset, nd_cmd->in_buf, len);
+	rc = buf_len - sizeof(*nd_cmd) - (len + 4);
+
+	return rc;
+}
+
+static struct e820_descriptor *to_e820_desc(struct nvdimm_bus_descriptor *desc)
+{
+	return container_of(desc, struct e820_descriptor, nd_desc);
+}
+
+static int e820_ndctl(struct nvdimm_bus_descriptor *nd_desc,
+			 struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+			 unsigned int buf_len, int *cmd_rc)
+{
+	struct e820_descriptor *t = to_e820_desc(nd_desc);
+	int rc = -EINVAL;
+
+	switch (cmd) {
+	case ND_CMD_GET_CONFIG_SIZE:
+		rc = e820_get_config_size(buf, buf_len);
+		break;
+	case ND_CMD_GET_CONFIG_DATA:
+		rc = e820_get_config_data(buf, buf_len, t->label);
+		break;
+	case ND_CMD_SET_CONFIG_DATA:
+		rc = e820_set_config_data(buf, buf_len, t->label);
+		break;
+	default:
+		return rc;
+	}
+
+	return rc;
+}
+
+static void e820_desc_free(struct e820_descriptor *desc)
+{
+	if (!desc)
+		return;
+
+	nvdimm_bus_unregister(desc->nvdimm_bus);
+	kfree(desc->label);
+	kfree(desc);
+}
+
+static struct e820_descriptor *e820_desc_alloc(struct platform_device *pdev)
+{
+	struct nvdimm_bus_descriptor *nd_desc;
+	unsigned int cmd_mask, dimm_flags;
+	struct device *dev = &pdev->dev;
+	struct nvdimm_bus *nvdimm_bus;
+	struct e820_descriptor *desc;
+	struct nvdimm *nvdimm;
+
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc)
+		goto err;
+
+	desc->label = kzalloc(LABEL_SIZE, GFP_KERNEL);
+	if (!desc->label)
+		goto err;
+
+	nd_desc = &desc->nd_desc;
+	nd_desc->provider_name = "e820";
+	nd_desc->module = THIS_MODULE;
+	nd_desc->ndctl = e820_ndctl;
+	nvdimm_bus = nvdimm_bus_register(&pdev->dev, nd_desc);
+	if (!nvdimm_bus) {
+		dev_err(dev, "nvdimm bus registration failure\n");
+		goto err;
+	}
+	desc->nvdimm_bus = nvdimm_bus;
+
+	cmd_mask = (1UL << ND_CMD_GET_CONFIG_SIZE |
+			1UL << ND_CMD_GET_CONFIG_DATA |
+			1UL << ND_CMD_SET_CONFIG_DATA);
+	dimm_flags = (1UL << NDD_ALIASING);
+	nvdimm = nvdimm_create(nvdimm_bus, pdev, NULL,
+				dimm_flags, cmd_mask, 0, NULL);
+	if (!nvdimm) {
+		dev_err(dev, "nvdimm creation failure\n");
+		goto err;
+	}
+	desc->nvdimm = nvdimm;
+	return desc;
+
+err:
+	e820_desc_free(desc);
+	return NULL;
+}
+
 static int e820_register_one(struct resource *res, void *data)
 {
+	struct platform_device *pdev = data;
 	struct nd_region_desc ndr_desc;
-	struct nvdimm_bus *nvdimm_bus = data;
+	struct nd_mapping_desc mapping;
+	struct e820_descriptor *desc;
+
+	desc = e820_desc_alloc(pdev);
+	if (!desc)
+		return -ENOMEM;
+
+	mapping.nvdimm = desc->nvdimm;
+	mapping.start = res->start;
+	mapping.size = resource_size(res);
+	mapping.position = 0;
+
+	generate_random_uuid(desc->cookie1);
+	desc->nd_set.cookie1 = (u64) desc->cookie1;
+	generate_random_uuid(desc->cookie2);
+	desc->nd_set.cookie2 = (u64) desc->cookie2;
 
 	memset(&ndr_desc, 0, sizeof(ndr_desc));
 	ndr_desc.res = res;
 	ndr_desc.numa_node = e820_range_to_nid(res->start);
 	ndr_desc.target_node = ndr_desc.numa_node;
+	ndr_desc.mapping = &mapping;
+	ndr_desc.num_mappings = 1;
+	ndr_desc.nd_set = &desc->nd_set;
 	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
-	if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+	if (!nvdimm_pmem_region_create(desc->nvdimm_bus, &ndr_desc)) {
+		e820_desc_free(desc);
+		dev_err(&pdev->dev, "nvdimm region creation failure\n");
 		return -ENXIO;
+	}
+
+	platform_set_drvdata(pdev, desc);
+	return 0;
+}
+
+static int e820_pmem_remove(struct platform_device *pdev)
+{
+	struct e820_descriptor *desc = platform_get_drvdata(pdev);
+
+	e820_desc_free(desc);
 	return 0;
 }
 
 static int e820_pmem_probe(struct platform_device *pdev)
 {
-	static struct nvdimm_bus_descriptor nd_desc;
-	struct device *dev = &pdev->dev;
-	struct nvdimm_bus *nvdimm_bus;
 	int rc = -ENXIO;
 
-	nd_desc.provider_name = "e820";
-	nd_desc.module = THIS_MODULE;
-	nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
-	if (!nvdimm_bus)
-		goto err;
-	platform_set_drvdata(pdev, nvdimm_bus);
-
 	rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
-			IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
+			IORESOURCE_MEM, 0, -1, pdev, e820_register_one);
 	if (rc)
 		goto err;
 	return 0;
 err:
-	nvdimm_bus_unregister(nvdimm_bus);
-	dev_err(dev, "failed to register legacy persistent memory ranges\n");
+	dev_err(&pdev->dev, "failed to register legacy persistent memory ranges\n");
 	return rc;
 }