diff mbox

[v2,10/20] pmem: use ida

Message ID 20150428182506.35812.4007.stgit@dwillia2-desk3.amr.corp.intel.com
State Superseded
Delegated to: Dan Williams
Headers show

Commit Message

Dan Williams April 28, 2015, 6:25 p.m. UTC
In preparation for the pmem driver attaching to pmem-namespaces emitted
by libnd, convert it to use an ida instead of an always increasing
atomic index.  This provides a bit of stability to pmem device names in
the presence of driver re-bind events.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/block/pmem.c |   22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

Comments

Toshi Kani April 29, 2015, 6:25 p.m. UTC | #1
Hi Dan,

Thanks for the update.  This version of the patchset enumerates our NFIT
table properly. :-)

On Tue, 2015-04-28 at 14:25 -0400, Dan Williams wrote:
> In preparation for the pmem driver attaching to pmem-namespaces emitted
> by libnd, convert it to use an ida instead of an always increasing
> atomic index.  This provides a bit of stability to pmem device names in
> the presence of driver re-bind events.
  :
> @@ -122,20 +123,26 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>  {
>  	struct pmem_device *pmem;
>  	struct gendisk *disk;
> -	int idx, err;
> +	int err;
>  
>  	err = -ENOMEM;
>  	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
>  	if (!pmem)
>  		goto out;
>  
> +	pmem->id = ida_simple_get(&pmem_ida, 0, 0, GFP_KERNEL);

nd_pmem_probe() is called asynchronously via async_schedule_domain
().  We have seen a case that the region#->pmem# binding becomes
inconsistent across a reboot when there are 8 NVDIMM cards (reported by
Robert Elliott).  This leads user to access a wrong device.

I think pmem id needs to be assigned before async_schedule_domain(), and
cascaded to nd_pmem_probe().

-Toshi
Toshi Kani April 29, 2015, 6:53 p.m. UTC | #2
On Wed, 2015-04-29 at 11:59 -0700, Dan Williams wrote:
> On Wed, Apr 29, 2015 at 11:25 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> > Hi Dan,
> >
> > Thanks for the update.  This version of the patchset enumerates our NFIT
> > table properly. :-)
> >
> > On Tue, 2015-04-28 at 14:25 -0400, Dan Williams wrote:
> >> In preparation for the pmem driver attaching to pmem-namespaces emitted
> >> by libnd, convert it to use an ida instead of an always increasing
> >> atomic index.  This provides a bit of stability to pmem device names in
> >> the presence of driver re-bind events.
> >   :
> >> @@ -122,20 +123,26 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
> >>  {
> >>       struct pmem_device *pmem;
> >>       struct gendisk *disk;
> >> -     int idx, err;
> >> +     int err;
> >>
> >>       err = -ENOMEM;
> >>       pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
> >>       if (!pmem)
> >>               goto out;
> >>
> >> +     pmem->id = ida_simple_get(&pmem_ida, 0, 0, GFP_KERNEL);
> >
> > nd_pmem_probe() is called asynchronously via async_schedule_domain
> > ().  We have seen a case that the region#->pmem# binding becomes
> > inconsistent across a reboot when there are 8 NVDIMM cards (reported by
> > Robert Elliott).  This leads user to access a wrong device.
> >
> > I think pmem id needs to be assigned before async_schedule_domain(), and
> > cascaded to nd_pmem_probe().
> >
> 
> I'll take a look at making this better, but it will never be
> bulletproof.  For the same reason that root=UUID=<uuid> is preferred
> over root=/dev/sda userspace should never rely on consistent pmem
> device names from boot to boot.

I agree that constant unique IDs, such as UUIDs, are necessary to
guarantee their consistent numbering regardless of configuration
changes.  For now, /dev/pmem%d should have consistent numbering while
NFIT table entries are consistent.

Thanks,
-Toshi
Dan Williams April 29, 2015, 6:59 p.m. UTC | #3
On Wed, Apr 29, 2015 at 11:25 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> Hi Dan,
>
> Thanks for the update.  This version of the patchset enumerates our NFIT
> table properly. :-)
>
> On Tue, 2015-04-28 at 14:25 -0400, Dan Williams wrote:
>> In preparation for the pmem driver attaching to pmem-namespaces emitted
>> by libnd, convert it to use an ida instead of an always increasing
>> atomic index.  This provides a bit of stability to pmem device names in
>> the presence of driver re-bind events.
>   :
>> @@ -122,20 +123,26 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>>  {
>>       struct pmem_device *pmem;
>>       struct gendisk *disk;
>> -     int idx, err;
>> +     int err;
>>
>>       err = -ENOMEM;
>>       pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
>>       if (!pmem)
>>               goto out;
>>
>> +     pmem->id = ida_simple_get(&pmem_ida, 0, 0, GFP_KERNEL);
>
> nd_pmem_probe() is called asynchronously via async_schedule_domain
> ().  We have seen a case that the region#->pmem# binding becomes
> inconsistent across a reboot when there are 8 NVDIMM cards (reported by
> Robert Elliott).  This leads user to access a wrong device.
>
> I think pmem id needs to be assigned before async_schedule_domain(), and
> cascaded to nd_pmem_probe().
>

I'll take a look at making this better, but it will never be
bulletproof.  For the same reason that root=UUID=<uuid> is preferred
over root=/dev/sda userspace should never rely on consistent pmem
device names from boot to boot.
Linda Knippers April 29, 2015, 8:49 p.m. UTC | #4
On 4/29/2015 2:53 PM, Toshi Kani wrote:
> On Wed, 2015-04-29 at 11:59 -0700, Dan Williams wrote:
>> On Wed, Apr 29, 2015 at 11:25 AM, Toshi Kani <toshi.kani@hp.com> wrote:
>>> Hi Dan,
>>>
>>> Thanks for the update.  This version of the patchset enumerates our NFIT
>>> table properly. :-)
>>>
>>> On Tue, 2015-04-28 at 14:25 -0400, Dan Williams wrote:
>>>> In preparation for the pmem driver attaching to pmem-namespaces emitted
>>>> by libnd, convert it to use an ida instead of an always increasing
>>>> atomic index.  This provides a bit of stability to pmem device names in
>>>> the presence of driver re-bind events.
>>>   :
>>>> @@ -122,20 +123,26 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>>>>  {
>>>>       struct pmem_device *pmem;
>>>>       struct gendisk *disk;
>>>> -     int idx, err;
>>>> +     int err;
>>>>
>>>>       err = -ENOMEM;
>>>>       pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
>>>>       if (!pmem)
>>>>               goto out;
>>>>
>>>> +     pmem->id = ida_simple_get(&pmem_ida, 0, 0, GFP_KERNEL);
>>>
>>> nd_pmem_probe() is called asynchronously via async_schedule_domain
>>> ().  We have seen a case that the region#->pmem# binding becomes
>>> inconsistent across a reboot when there are 8 NVDIMM cards (reported by
>>> Robert Elliott).  This leads user to access a wrong device.
>>>
>>> I think pmem id needs to be assigned before async_schedule_domain(), and
>>> cascaded to nd_pmem_probe().
>>>
>>
>> I'll take a look at making this better, but it will never be
>> bulletproof.  For the same reason that root=UUID=<uuid> is preferred
>> over root=/dev/sda userspace should never rely on consistent pmem
>> device names from boot to boot.
> 
> I agree that constant unique IDs, such as UUIDs, are necessary to
> guarantee their consistent numbering regardless of configuration
> changes.  For now, /dev/pmem%d should have consistent numbering while
> NFIT table entries are consistent.

What's the right answer for this in the long run?

The NFIT has information like "Memory Device Physical ID" that is an SMBIOS
type 17 handle.  SMBIOS could have some naming information, like it does for
NICs.  There is also a "Memory Device Region ID".  Is that combination
enough to give us a unique identifier?

Of course, I'm assuming that information is consistent across reboots
and ideally across configuration changes, like adding more NVDIMMs.
For NICs, that's where the SMBIOS consistent naming information is helpful.
Is that what we need for devices that don't have namespaces and labels?

-- ljk

> 
> Thanks,
> -Toshi
> 
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
>
Dan Williams April 29, 2015, 9:36 p.m. UTC | #5
On Wed, Apr 29, 2015 at 1:49 PM, Linda Knippers <linda.knippers@hp.com> wrote:
> On 4/29/2015 2:53 PM, Toshi Kani wrote:
> What's the right answer for this in the long run?

Short term, /dev/disk/by-uuid to take a stable identifier from the
contents of the device.

Longer term teach udev to populate /dev/disk/by-id with stable names
for libnd devices.  The trick is identifiers for interleaved PMEM
ranges comprised of multiple physical devices.  I'm thinking something
like /dev/disk/by-id/nd-<set_cookie>
diff mbox

Patch

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index eabf4a8d0085..e3cf9142b172 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -34,10 +34,11 @@  struct pmem_device {
 	phys_addr_t		phys_addr;
 	void			*virt_addr;
 	size_t			size;
+	int			id;
 };
 
 static int pmem_major;
-static atomic_t pmem_index;
+static DEFINE_IDA(pmem_ida);
 
 static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 			unsigned int len, unsigned int off, int rw,
@@ -122,20 +123,26 @@  static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
 {
 	struct pmem_device *pmem;
 	struct gendisk *disk;
-	int idx, err;
+	int err;
 
 	err = -ENOMEM;
 	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
 		goto out;
 
+	pmem->id = ida_simple_get(&pmem_ida, 0, 0, GFP_KERNEL);
+	if (pmem->id < 0) {
+		err = pmem->id;
+		goto out_free_dev;
+	}
+
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
 
 	err = -EINVAL;
 	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
 		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size);
-		goto out_free_dev;
+		goto out_free_ida;
 	}
 
 	/*
@@ -159,15 +166,13 @@  static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
 	if (!disk)
 		goto out_free_queue;
 
-	idx = atomic_inc_return(&pmem_index) - 1;
-
 	disk->major		= pmem_major;
-	disk->first_minor	= PMEM_MINORS * idx;
+	disk->first_minor	= PMEM_MINORS * pmem->id;
 	disk->fops		= &pmem_fops;
 	disk->private_data	= pmem;
 	disk->queue		= pmem->pmem_queue;
 	disk->flags		= GENHD_FL_EXT_DEVT;
-	sprintf(disk->disk_name, "pmem%d", idx);
+	sprintf(disk->disk_name, "pmem%d", pmem->id);
 	disk->driverfs_dev = dev;
 	set_capacity(disk, pmem->size >> 9);
 	pmem->pmem_disk = disk;
@@ -182,6 +187,8 @@  out_unmap:
 	iounmap(pmem->virt_addr);
 out_release_region:
 	release_mem_region(pmem->phys_addr, pmem->size);
+out_free_ida:
+	ida_simple_remove(&pmem_ida, pmem->id);
 out_free_dev:
 	kfree(pmem);
 out:
@@ -195,6 +202,7 @@  static void pmem_free(struct pmem_device *pmem)
 	blk_cleanup_queue(pmem->pmem_queue);
 	iounmap(pmem->virt_addr);
 	release_mem_region(pmem->phys_addr, pmem->size);
+	ida_simple_remove(&pmem_ida, pmem->id);
 	kfree(pmem);
 }