
libnvdimm/dimm: Add a no-BLK quirk based on NVDIMM family

Message ID 154915640488.3541628.4947340559510114602.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Superseded
Series libnvdimm/dimm: Add a no-BLK quirk based on NVDIMM family

Commit Message

Dan Williams Feb. 3, 2019, 1:13 a.m. UTC
As Dexuan reports, the NVDIMM_FAMILY_HYPERV platform is incompatible with
the existing Linux namespace implementation because it uses
NSLABEL_FLAG_LOCAL for x1-width PMEM interleave sets. Quirk it as a
platform / DIMM that does not provide BLK-aperture access. Allow the
libnvdimm core to assume no potential for aliasing. In case other
implementations make the same mistake, provide a "noblk" module
parameter to force-enable the quirk.

Link: https://lkml.kernel.org/r/PU1P153MB0169977604493B82B662A01CBF920@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM
Reported-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c        |    4 ++++
 drivers/nvdimm/dimm_devs.c      |    7 ++++++-
 drivers/nvdimm/label.c          |    3 +++
 drivers/nvdimm/namespace_devs.c |    6 ++++++
 drivers/nvdimm/region_devs.c    |    7 +++++++
 include/linux/libnvdimm.h       |    2 ++
 6 files changed, 28 insertions(+), 1 deletion(-)
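
For reference, the label flag being quirked away is one of the namespace-label
flag bits; a sketch of their definitions, with names and values assumed from
drivers/nvdimm/label.h rather than taken from this patch:

    /* Sketch of the namespace-label flag bits; names and values assumed to
     * match drivers/nvdimm/label.h (UEFI namespace-label flags), shown here
     * only for context. */
    #define NSLABEL_FLAG_ROLABEL	0x1	/* label is read-only */
    #define NSLABEL_FLAG_LOCAL		0x2	/* DIMM-local (BLK-aperture) namespace */
    #define NSLABEL_FLAG_BTT		0x4	/* namespace claimed by a BTT */
    #define NSLABEL_FLAG_UPDATING	0x8	/* label update in progress */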

Comments

kernel test robot Feb. 3, 2019, 2:31 p.m. UTC | #1
Hi Dan,

I love your patch! Yet something to improve:

[auto build test ERROR on linux-nvdimm/libnvdimm-for-next]
[also build test ERROR on v5.0-rc4 next-20190201]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Dan-Williams/libnvdimm-dimm-Add-a-no-BLK-quirk-based-on-NVDIMM-family/20190203-213444
base:   https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git libnvdimm-for-next
config: x86_64-randconfig-x018-201905 (attached as .config)
compiler: gcc-8 (Debian 8.2.0-14) 8.2.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/acpi//nfit/core.c: In function 'acpi_nfit_register_dimms':
>> drivers/acpi//nfit/core.c:2003:27: error: 'NVDIMM_FAMILY_HYPERV' undeclared (first use in this function); did you mean 'NVDIMM_FAMILY_HPE2'?
      if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
                              ^~~~~~~~~~~~~~~~~~~~
                              NVDIMM_FAMILY_HPE2
   drivers/acpi//nfit/core.c:2003:27: note: each undeclared identifier is reported only once for each function it appears in

vim +2003 drivers/acpi//nfit/core.c

  1944	
  1945	static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
  1946	{
  1947		struct nfit_mem *nfit_mem;
  1948		int dimm_count = 0, rc;
  1949		struct nvdimm *nvdimm;
  1950	
  1951		list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
  1952			struct acpi_nfit_flush_address *flush;
  1953			unsigned long flags = 0, cmd_mask;
  1954			struct nfit_memdev *nfit_memdev;
  1955			u32 device_handle;
  1956			u16 mem_flags;
  1957	
  1958			device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
  1959			nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle);
  1960			if (nvdimm) {
  1961				dimm_count++;
  1962				continue;
  1963			}
  1964	
  1965			if (nfit_mem->bdw && nfit_mem->memdev_pmem)
  1966				set_bit(NDD_ALIASING, &flags);
  1967	
  1968			/* collate flags across all memdevs for this dimm */
  1969			list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
  1970				struct acpi_nfit_memory_map *dimm_memdev;
  1971	
  1972				dimm_memdev = __to_nfit_memdev(nfit_mem);
  1973				if (dimm_memdev->device_handle
  1974						!= nfit_memdev->memdev->device_handle)
  1975					continue;
  1976				dimm_memdev->flags |= nfit_memdev->memdev->flags;
  1977			}
  1978	
  1979			mem_flags = __to_nfit_memdev(nfit_mem)->flags;
  1980			if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED)
  1981				set_bit(NDD_UNARMED, &flags);
  1982	
  1983			rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
  1984			if (rc)
  1985				continue;
  1986	
  1987			/*
  1988			 * TODO: provide translation for non-NVDIMM_FAMILY_INTEL
  1989			 * devices (i.e. from nd_cmd to acpi_dsm) to standardize the
  1990			 * userspace interface.
  1991			 */
  1992			cmd_mask = 1UL << ND_CMD_CALL;
  1993			if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
  1994				/*
  1995				 * These commands have a 1:1 correspondence
  1996				 * between DSM payload and libnvdimm ioctl
  1997				 * payload format.
  1998				 */
  1999				cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
  2000			}
  2001	
  2002			/* Quirk to ignore LOCAL for labels on HYPERV DIMMs */
> 2003			if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
  2004				set_bit(NDD_NOBLK, &flags);
  2005	
  2006			if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) {
  2007				set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
  2008				set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
  2009			}
  2010			if (test_bit(NFIT_MEM_LSW, &nfit_mem->flags))
  2011				set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
  2012	
  2013			flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
  2014				: NULL;
  2015			nvdimm = __nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
  2016					acpi_nfit_dimm_attribute_groups,
  2017					flags, cmd_mask, flush ? flush->hint_count : 0,
  2018					nfit_mem->flush_wpq, &nfit_mem->id[0],
  2019					acpi_nfit_get_security_ops(nfit_mem->family));
  2020			if (!nvdimm)
  2021				return -ENOMEM;
  2022	
  2023			nfit_mem->nvdimm = nvdimm;
  2024			dimm_count++;
  2025	
  2026			if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
  2027				continue;
  2028	
  2029			dev_info(acpi_desc->dev, "%s flags:%s%s%s%s%s\n",
  2030					nvdimm_name(nvdimm),
  2031			  mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "",
  2032			  mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"",
  2033			  mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "",
  2034			  mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "",
  2035			  mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? " map_fail" : "");
  2036	
  2037		}
  2038	
  2039		rc = nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count);
  2040		if (rc)
  2041			return rc;
  2042	
  2043		/*
  2044		 * Now that dimms are successfully registered, and async registration
  2045		 * is flushed, attempt to enable event notification.
  2046		 */
  2047		list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
  2048			struct kernfs_node *nfit_kernfs;
  2049	
  2050			nvdimm = nfit_mem->nvdimm;
  2051			if (!nvdimm)
  2052				continue;
  2053	
  2054			rc = nvdimm_security_setup_events(nvdimm);
  2055			if (rc < 0)
  2056				dev_warn(acpi_desc->dev,
  2057					"security event setup failed: %d\n", rc);
  2058	
  2059			nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit");
  2060			if (nfit_kernfs)
  2061				nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs,
  2062						"flags");
  2063			sysfs_put(nfit_kernfs);
  2064			if (!nfit_mem->flags_attr)
  2065				dev_warn(acpi_desc->dev, "%s: notifications disabled\n",
  2066						nvdimm_name(nvdimm));
  2067		}
  2068	
  2069		return devm_add_action_or_reset(acpi_desc->dev, shutdown_dimm_notify,
  2070				acpi_desc);
  2071	}
  2072	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Feb. 3, 2019, 2:38 p.m. UTC | #2
Hi Dan,

I love your patch! Yet something to improve:

[auto build test ERROR on linux-nvdimm/libnvdimm-for-next]
[also build test ERROR on v5.0-rc4 next-20190201]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Dan-Williams/libnvdimm-dimm-Add-a-no-BLK-quirk-based-on-NVDIMM-family/20190203-213444
base:   https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git libnvdimm-for-next
config: x86_64-allmodconfig (attached as .config)
compiler: gcc-8 (Debian 8.2.0-14) 8.2.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/acpi/nfit/core.c: In function 'acpi_nfit_register_dimms':
>> drivers/acpi/nfit/core.c:2003:27: error: 'NVDIMM_FAMILY_HYPERV' undeclared (first use in this function); did you mean 'NVDIMM_FAMILY_HPE1'?
      if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
                              ^~~~~~~~~~~~~~~~~~~~
                              NVDIMM_FAMILY_HPE1
   drivers/acpi/nfit/core.c:2003:27: note: each undeclared identifier is reported only once for each function it appears in

vim +2003 drivers/acpi/nfit/core.c

  1944	
  1945	static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
  1946	{
  1947		struct nfit_mem *nfit_mem;
  1948		int dimm_count = 0, rc;
  1949		struct nvdimm *nvdimm;
  1950	
  1951		list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
  1952			struct acpi_nfit_flush_address *flush;
  1953			unsigned long flags = 0, cmd_mask;
  1954			struct nfit_memdev *nfit_memdev;
  1955			u32 device_handle;
  1956			u16 mem_flags;
  1957	
  1958			device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
  1959			nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle);
  1960			if (nvdimm) {
  1961				dimm_count++;
  1962				continue;
  1963			}
  1964	
  1965			if (nfit_mem->bdw && nfit_mem->memdev_pmem)
  1966				set_bit(NDD_ALIASING, &flags);
  1967	
  1968			/* collate flags across all memdevs for this dimm */
  1969			list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
  1970				struct acpi_nfit_memory_map *dimm_memdev;
  1971	
  1972				dimm_memdev = __to_nfit_memdev(nfit_mem);
  1973				if (dimm_memdev->device_handle
  1974						!= nfit_memdev->memdev->device_handle)
  1975					continue;
  1976				dimm_memdev->flags |= nfit_memdev->memdev->flags;
  1977			}
  1978	
  1979			mem_flags = __to_nfit_memdev(nfit_mem)->flags;
  1980			if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED)
  1981				set_bit(NDD_UNARMED, &flags);
  1982	
  1983			rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
  1984			if (rc)
  1985				continue;
  1986	
  1987			/*
  1988			 * TODO: provide translation for non-NVDIMM_FAMILY_INTEL
  1989			 * devices (i.e. from nd_cmd to acpi_dsm) to standardize the
  1990			 * userspace interface.
  1991			 */
  1992			cmd_mask = 1UL << ND_CMD_CALL;
  1993			if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
  1994				/*
  1995				 * These commands have a 1:1 correspondence
  1996				 * between DSM payload and libnvdimm ioctl
  1997				 * payload format.
  1998				 */
  1999				cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
  2000			}
  2001	
  2002			/* Quirk to ignore LOCAL for labels on HYPERV DIMMs */
> 2003			if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
  2004				set_bit(NDD_NOBLK, &flags);
  2005	
  2006			if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) {
  2007				set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
  2008				set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
  2009			}
  2010			if (test_bit(NFIT_MEM_LSW, &nfit_mem->flags))
  2011				set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
  2012	
  2013			flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
  2014				: NULL;
  2015			nvdimm = __nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
  2016					acpi_nfit_dimm_attribute_groups,
  2017					flags, cmd_mask, flush ? flush->hint_count : 0,
  2018					nfit_mem->flush_wpq, &nfit_mem->id[0],
  2019					acpi_nfit_get_security_ops(nfit_mem->family));
  2020			if (!nvdimm)
  2021				return -ENOMEM;
  2022	
  2023			nfit_mem->nvdimm = nvdimm;
  2024			dimm_count++;
  2025	
  2026			if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
  2027				continue;
  2028	
  2029			dev_info(acpi_desc->dev, "%s flags:%s%s%s%s%s\n",
  2030					nvdimm_name(nvdimm),
  2031			  mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "",
  2032			  mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"",
  2033			  mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "",
  2034			  mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "",
  2035			  mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? " map_fail" : "");
  2036	
  2037		}
  2038	
  2039		rc = nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count);
  2040		if (rc)
  2041			return rc;
  2042	
  2043		/*
  2044		 * Now that dimms are successfully registered, and async registration
  2045		 * is flushed, attempt to enable event notification.
  2046		 */
  2047		list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
  2048			struct kernfs_node *nfit_kernfs;
  2049	
  2050			nvdimm = nfit_mem->nvdimm;
  2051			if (!nvdimm)
  2052				continue;
  2053	
  2054			rc = nvdimm_security_setup_events(nvdimm);
  2055			if (rc < 0)
  2056				dev_warn(acpi_desc->dev,
  2057					"security event setup failed: %d\n", rc);
  2058	
  2059			nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit");
  2060			if (nfit_kernfs)
  2061				nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs,
  2062						"flags");
  2063			sysfs_put(nfit_kernfs);
  2064			if (!nfit_mem->flags_attr)
  2065				dev_warn(acpi_desc->dev, "%s: notifications disabled\n",
  2066						nvdimm_name(nvdimm));
  2067		}
  2068	
  2069		return devm_add_action_or_reset(acpi_desc->dev, shutdown_dimm_notify,
  2070				acpi_desc);
  2071	}
  2072	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Dexuan Cui Feb. 3, 2019, 5:21 p.m. UTC | #3
> From: Dan Williams <dan.j.williams@intel.com>
> Sent: Saturday, February 2, 2019 5:13 PM
> ...
> As Dexuan reports the NVDIMM_FAMILY_HYPERV platform is incompatible with
> the existing Linux namespace implementation because it uses
> NSLABEL_FLAG_LOCAL for x1-width PMEM interleave sets. Quirk it as an
> platform / DIMM that does not provide BLK-aperture access. Allow the
> libnvdimm core to assume no potential for aliasing. In case other
> implementations make the same mistake, provide a "noblk" module
> parameter to force-enable the quirk.

Hi Dan,
Thanks very much for the patch! With it, "ndctl list" shows the following:

root@decui-gen2-u1904:~/ndctl# ndctl list
[
  {
    "dev":"namespace1.0",
    "mode":"raw",
    "size":137438953472,
    "uuid":"c258aaab-f72b-e546-bfa5-be5e07761dbc",
    "blockdev":"pmem1",
    "name":"Microsoft Hyper-V NVDIMM 1 Label"
  },
  {
    "dev":"namespace0.0",
    "mode":"raw",
    "size":34359738368,
    "uuid":"9f0497a7-4453-7c40-ad35-21a791e00345",
    "blockdev":"pmem0",
    "name":"Microsoft Hyper-V NVDIMM 0 Label"
  }
]

And /dev/pmem0 appears, but /dev/pmem0p1 does not, and the "mode" of
"namespace0.0" is not correct. With the Ubuntu 19.04 4.18 kernel, I get the following:

root@decui-gen2-u1904:~/ndctl# ndctl list
[
  {
    "dev":"namespace1.0",
    "mode":"raw",
    "size":137438953472,
    "blockdev":"pmem1"
  },
  {
    "dev":"namespace0.0",
    "mode":"fsdax",
    "map":"dev",
    "size":33820770304,
    "uuid":"ef028c4e-2b1f-4bf8-b92a-1109d7a1c914",
    "blockdev":"pmem0"
  }
]
and /dev/pmem0p1 appears.

It looks like the namespace created by Ubuntu 19.04 (4.18) is incompatible with
the libnvdimm-pending branch + this patch.

It looks like we need to completely disable label usage for
NVDIMM_FAMILY_HYPERV for compatibility?

Thanks,
-- Dexuan
Dexuan Cui Feb. 3, 2019, 5:39 p.m. UTC | #4
> From: kbuild test robot <lkp@intel.com>
> Sent: Sunday, February 3, 2019 6:32 AM
> To: Dan Williams <dan.j.williams@intel.com>
> Cc: kbuild-all@01.org; linux-nvdimm@lists.01.org; Dexuan Cui
> <decui@microsoft.com>; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] libnvdimm/dimm: Add a no-BLK quirk based on NVDIMM
> family
> 
> Hi Dan,
> 
> I love your patch! Yet something to improve:
> 
> [auto build test ERROR on linux-nvdimm/libnvdimm-for-next]
> [also build test ERROR on v5.0-rc4 next-20190201]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system]
> ...
> All errors (new ones prefixed by >>):
> 
>    drivers/acpi//nfit/core.c: In function 'acpi_nfit_register_dimms':
> >> drivers/acpi//nfit/core.c:2003:27: error: 'NVDIMM_FAMILY_HYPERV'
> undeclared (first use in this function); did you mean 'NVDIMM_FAMILY_HPE2'?
>       if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
>                               ^~~~~~~~~~~~~~~~~~~~
>                               NVDIMM_FAMILY_HPE2
>    drivers/acpi//nfit/core.c:2003:27: note: each undeclared identifier is
> reported only once for each function it appears in

This is a false alarm. The patch is only for
https://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git/log/?h=libnvdimm-pending
where we have the dependent commit
1194c4133195 ("nfit: Add Hyper-V NVDIMM DSM command set to white list")

Thanks,
-- Dexuan
Dexuan Cui Feb. 3, 2019, 5:40 p.m. UTC | #5
> From: kbuild test robot <lkp@intel.com>
> Sent: Sunday, February 3, 2019 6:38 AM
>  ...
> Hi Dan,
> 
> I love your patch! Yet something to improve:
> 
> [auto build test ERROR on linux-nvdimm/libnvdimm-for-next]
> [also build test ERROR on v5.0-rc4 next-20190201]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system]
> ...
> config: x86_64-allmodconfig (attached as .config)
> compiler: gcc-8 (Debian 8.2.0-14) 8.2.0
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=x86_64
> 
> All errors (new ones prefixed by >>):
> 
>    drivers/acpi/nfit/core.c: In function 'acpi_nfit_register_dimms':
> >> drivers/acpi/nfit/core.c:2003:27: error: 'NVDIMM_FAMILY_HYPERV'
> undeclared (first use in this function); did you mean 'NVDIMM_FAMILY_HPE1'?
>       if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
>                               ^~~~~~~~~~~~~~~~~~~~
>                               NVDIMM_FAMILY_HPE1
>    drivers/acpi/nfit/core.c:2003:27: note: each undeclared identifier is
> reported only once for each function it appears in

This is a false alarm, too. The patch is only for
https://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git/log/?h=libnvdimm-pending
where we have the dependent commit:
1194c4133195 ("nfit: Add Hyper-V NVDIMM DSM command set to white list")

Thanks,
-- Dexuan
Dan Williams Feb. 3, 2019, 5:49 p.m. UTC | #6
On Sun, Feb 3, 2019 at 9:22 AM Dexuan Cui <decui@microsoft.com> wrote:
>
> > From: Dan Williams <dan.j.williams@intel.com>
> > Sent: Saturday, February 2, 2019 5:13 PM
> > ...
> > As Dexuan reports the NVDIMM_FAMILY_HYPERV platform is incompatible with
> > the existing Linux namespace implementation because it uses
> > NSLABEL_FLAG_LOCAL for x1-width PMEM interleave sets. Quirk it as an
> > platform / DIMM that does not provide BLK-aperture access. Allow the
> > libnvdimm core to assume no potential for aliasing. In case other
> > implementations make the same mistake, provide a "noblk" module
> > parameter to force-enable the quirk.
>
> Hi Dan,
> Thanks very much for the patch! With it, "ndctl list" can show the below:
>
> root@decui-gen2-u1904:~/ndctl# ndctl list
> [
>   {
>     "dev":"namespace1.0",
>     "mode":"raw",
>     "size":137438953472,
>     "uuid":"c258aaab-f72b-e546-bfa5-be5e07761dbc",
>     "blockdev":"pmem1",
>     "name":"Microsoft Hyper-V NVDIMM 1 Label"
>   },
>   {
>     "dev":"namespace0.0",
>     "mode":"raw",
>     "size":34359738368,
>     "uuid":"9f0497a7-4453-7c40-ad35-21a791e00345",
>     "blockdev":"pmem0",
>     "name":"Microsoft Hyper-V NVDIMM 0 Label"
>   }
> ]
>
> And /dev/pmem0 can appear, but /dev/pmem0p1 can not appear, and the "mode" of
> "namespace0.0" is not correct.  With the Ubuntu 19.04 4.18 kenel, I get the below:
>
> root@decui-gen2-u1904:~/ndctl# ndctl list
> [
>   {
>     "dev":"namespace1.0",
>     "mode":"raw",
>     "size":137438953472,
>     "blockdev":"pmem1"
>   },
>   {
>     "dev":"namespace0.0",
>     "mode":"fsdax",
>     "map":"dev",
>     "size":33820770304,
>     "uuid":"ef028c4e-2b1f-4bf8-b92a-1109d7a1c914",
>     "blockdev":"pmem0"
>   }
> ]
> and /dev/pmem0p1 can appear.
>
> It looks the namespace created by Ubuntu 19.04 (4.18) is incompatible with
> the libnvdimm-pending branch + this patch.

This is correct; the configuration switched from label-less by default
to labeled.

> It looks we need to completely disable the label usage for NVDIMM_FAMILY_HYPERV
> due to compability?

It's either going to be a quirk in Linux or a quirk in the Hyper-V
configuration. I think it's more manageable for Hyper-V to ship a
"label disable" switch than to try to ship a workaround in Linux. The
"noblk" quirk in Linux works around the LOCAL bit in the labels.
However, the namespace mode regression can only be resolved by hiding
the label capability until the administrator manually switches from
label-less to labeled operation.
Dexuan Cui Feb. 3, 2019, 6:13 p.m. UTC | #7
> From: Dan Williams <dan.j.williams@intel.com>
> Sent: Sunday, February 3, 2019 9:49 AM
> > ...
> > It looks the namespace created by Ubuntu 19.04 (4.18) is incompatible with
> > the libnvdimm-pending branch + this patch.
> 
> This is correct, the configuration switched from label-less by default
> to labeled.
Thanks for the confirmation! 
 
> > It looks we need to completely disable the label usage for
> NVDIMM_FAMILY_HYPERV
> > due to compability?
> 
> It's either going to be a quirk in Linux or a quirk in the Hyper-V
> configuration. I think it's more manageable for Hyper-V to ship a
> "label disable" switch than to try to ship a workaround in Linux. The
> "noblk" quirk in Linux works around the LOCAL bit in the labels.
> However, the namespace mode regression can only be resolved by hiding
> the label capability until the administrator manually switches from
> label-less to labeled operation.

As I understand it, the essence of the issue is: Hyper-V emulates the
label mechanism (i.e. it supports _LSI and _LSR), but doesn't do it
right (i.e. it doesn't support _LSW).

To manage the namespaces, Linux can choose whether or not to use labels.

If _LSI/_LSR are supported, Linux assumes _LSW is supported as well
and chooses to use labels (i.e. label mode), but since Hyper-V doesn't
support _LSW, Linux fails to change the namespace configuration. In
Ubuntu 19.04 (4.18), the kernel is not aware of Hyper-V _LSI/_LSR, so
the created namespace is in "label-less" mode, and hence can't be used
with the libnvdimm-pending branch + this patch, unless we add a quirk
in Linux to explicitly not use labels.

I agree that ideally Hyper-V should hide the label capability, and I'll ask the
Hyper-V team to do that. I hope they are still able to do that in time so we
won't need a quirk in the Linux kernel.

BTW, I suppose Windows VM should be in "label-less" mode.

Thanks for the help, Dan!

Thanks,
-- Dexuan
Dan Williams Feb. 3, 2019, 7:13 p.m. UTC | #8
On Sun, Feb 3, 2019 at 10:13 AM Dexuan Cui <decui@microsoft.com> wrote:
>
> > From: Dan Williams <dan.j.williams@intel.com>
> > Sent: Sunday, February 3, 2019 9:49 AM
> > > ...
> > > It looks the namespace created by Ubuntu 19.04 (4.18) is incompatible with
> > > the libnvdimm-pending branch + this patch.
> >
> > This is correct, the configuration switched from label-less by default
> > to labeled.
> Thanks for the confirmation!
>
> > > It looks we need to completely disable the label usage for
> > NVDIMM_FAMILY_HYPERV
> > > due to compability?
> >
> > It's either going to be a quirk in Linux or a quirk in the Hyper-V
> > configuration. I think it's more manageable for Hyper-V to ship a
> > "label disable" switch than to try to ship a workaround in Linux. The
> > "noblk" quirk in Linux works around the LOCAL bit in the labels.
> > However, the namespace mode regression can only be resolved by hiding
> > the label capability until the administrator manually switches from
> > label-less to labeled operation.
>
> As I understand, the essence of the issue is: Hyper-V emulates the
> label mechanism (i.e. it supports _LSI and LSR), but doesn't do it
> right (i.e. it doesn't support _LSW).
>
> To manage the namespaces, Linux can choose to use label or not.
>
> If _LSI/_LSR are supported, Linux assumes _LSW is supported as well
> and chooses to use label (i.e. the label mode), but since Hyper-V doesn't
> support _LSW, Linux fails to change the namespace configuration.

No, that's not quite right. The reason Linux does not see the fsdax
mode configuration is due to the missing "address abstraction GUID" in
the label produced by the default Hyper-V configuration. In label-less
mode there is no "address abstraction GUID" to validate, so it falls
back to just using the info-block directly.
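
Roughly, and not the exact upstream code, the difference looks like this
(the guid helpers, claim-class names, and nvdimm_dax_guid symbol here are
assumed from include/linux/uuid.h, include/uapi/linux/ndctl.h, and the
nvdimm label code):

    /* Rough sketch: with v1.2 labels the namespace personality comes from
     * the label's abstraction_guid, so a zeroed GUID leaves the namespace
     * looking "raw" even though a valid fsdax info-block sits at namespace
     * base + 4K. */
    enum nvdimm_claim_class claim_class = NVDIMM_CCLASS_NONE;

    if (guid_equal(&nd_label->abstraction_guid, &nvdimm_dax_guid))
    	claim_class = NVDIMM_CCLASS_DAX;	/* fsdax/devdax personality */
    else if (guid_is_null(&nd_label->abstraction_guid))
    	claim_class = NVDIMM_CCLASS_NONE;	/* default Hyper-V labels land here */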

The _LSW issue arises if you want to reconfigure a raw namespace to
fsdax mode. The current flow tries to delete the label, but that's
only for reconfiguration, not the initial boot-up case that is
currently failing. The deletion would fail on finding no label-write
capability, but to be clear, the boot-up case does not perform any
writes.

> In
> Ubuntu 19.04 (4.18), the kernel is not aware of Hyper-V _LSI/_LSR, so
> the created namespace is in "label-less" mode, and hence can't be used
> with the libnvdimm-pending branch + this patch, unless we add a quirk
> in Linux to explicitly not use the label.
>
> I agree ideally Hyper-V should hide the label capability, and I'll request
> Hyper-V team to do that. I hope Hyper-V guys are still able to do that
> in time so we won't need a quirk in Linux kernel.

After some more thought, the "no regressions" rule means that Linux
should ship a quirk for this by default. I think a good heuristic is
to disable label support by default if no _LSW method is detected. An
opt-in can be specified to accept "read-only" configurations, but
that's an exceptional case. I'll send a patch for this.
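
A minimal sketch of what such a heuristic could look like around the
ND_CMD_*CONFIG* setup in acpi_nfit_register_dimms() (not the actual
follow-up patch; the "force_labels" opt-in parameter and the helper name
here are hypothetical):

    /* Sketch only: only advertise label commands when the platform can also
     * write labels, unless the user explicitly opts in to a read-only
     * (no _LSW) configuration. */
    static bool force_labels;
    module_param(force_labels, bool, 0444);
    MODULE_PARM_DESC(force_labels, "opt in to read-only (no _LSW) label support");

    static void nfit_set_label_cmds(struct nfit_mem *nfit_mem, unsigned long *cmd_mask)
    {
    	if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)
    			&& (test_bit(NFIT_MEM_LSW, &nfit_mem->flags) || force_labels)) {
    		set_bit(ND_CMD_GET_CONFIG_SIZE, cmd_mask);
    		set_bit(ND_CMD_GET_CONFIG_DATA, cmd_mask);
    	}
    	if (test_bit(NFIT_MEM_LSW, &nfit_mem->flags))
    		set_bit(ND_CMD_SET_CONFIG_DATA, cmd_mask);
    }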

> BTW, I suppose Windows VM should be in "label-less" mode.

I expect Windows mandates labeled operation. This label-less concept
was something Linux shipped in advance of a specification being
ratified and to support early NVDIMMs that don't advertise label
space.

> Thanks for the help, Dan!

Thanks for the quick feedback and testing.
Dexuan Cui Feb. 5, 2019, 4:53 p.m. UTC | #9
> From: Dan Williams <dan.j.williams@intel.com>
> Sent: Sunday, February 3, 2019 11:14 AM
> > ...
> > As I understand, the essence of the issue is: Hyper-V emulates the
> > label mechanism (i.e. it supports _LSI and LSR), but doesn't do it
> > right (i.e. it doesn't support _LSW).
> >
> > To manage the namespaces, Linux can choose to use label or not.
> >
> > If _LSI/_LSR are supported, Linux assumes _LSW is supported as well
> > and chooses to use label (i.e. the label mode), but since Hyper-V doesn't
> > support _LSW, Linux fails to change the namespace configuration.
>
> No, that's not quite right. The reason Linux does not see the fsdax
> mode configuration is due to the missing "address abstraction GUID" in
> the label produced the default Hyper-V configuration.
Hi Dan,
Do you mean NVDIMM_DAX_GUID?

> In label-less mode there is no "address abstraction GUID" to validate so
> it falls back to just using the info-block directly.
In the case of not using the label storage area, as I understand it, the
info-block resides in the regular data storage area. Can you please tell me
where exactly the info-block is and how its location is decided?
And I suppose the info-block contains the NVDIMM_DAX_GUID?

I'm asking because I found I lose ~500MB of the 32GB Hyper-V
NVDIMM device when the namespace is in fsdax mode. When it's in
raw mode, I'm able to use all of the 32GB space.

> The _LSW issue is if you wanted to reconfigure a raw namespace to
> fsdax mode. The current flow tries to delete the label, but that's
> only for reconfiguration, not the initial boot-up case that is
> currently failing. The deletion would fail on finding no label-write
> capability, but to be clear the boot-up case does not perform any
> writes.
Thanks for the explanation!

> > In Ubuntu 19.04 (4.18), the kernel is not aware of Hyper-V _LSI/_LSR, so
> > the created namespace is in "label-less" mode, and hence can't be used
> > with the libnvdimm-pending branch + this patch, unless we add a quirk
> > in Linux to explicitly not use the label.
> >
> > I agree ideally Hyper-V should hide the label capability, and I'll request
> > Hyper-V team to do that. I hope Hyper-V guys are still able to do that
> > in time so we won't need a quirk in Linux kernel.
>
> After some more thought, the "no regressions" rule means that Linux
> should ship a quirk for this by default. I think a good heuristic is
> to disable label support by default if no _LSW method is detected. An
> opt-in can be specified to accept "read-only" configurations, but
> that's an exceptional case. I'll send a patch for this.
>
> > BTW, I suppose Windows VM should be in "label-less" mode.
>
> I expect Windows mandates labeled operation. This label-less concept
> was something Linux shipped in advance of a specification being
> ratified and to support early NVDIMMs that don't advertise label
> space.
Since the Hyper-V NVDIMM doesn't support _LSW, IMO a Windows VM
can't update the label storage area either, so I guess a Windows VM can't use
labels either, just like a Linux VM? I guess a Windows VM also stores an
"info block" when the namespace is in dax mode, just like a Linux VM?

BTW, it looks like the dax mode configuration in a Linux VM is incompatible
with that in a Windows VM(?). When I create a DAX-enabled NTFS partition
in a Windows VM on the same Hyper-V NVDIMM, the Windows VM is able to
use all of the 32GB space (as I mentioned above, in Linux I lose ~500MB).
And the "dax" mode namespace created in the Windows VM is detected
as "raw", and vice versa (I think).

I'm trying to understand why I lose ~500MB in Linux dax mode.
Your insights are appreciated!

Thanks,
--Dexuan
Dan Williams Feb. 5, 2019, 5:11 p.m. UTC | #10
On Tue, Feb 5, 2019 at 8:53 AM Dexuan Cui <decui@microsoft.com> wrote:
>
> > From: Dan Williams <dan.j.williams@intel.com>
> > Sent: Sunday, February 3, 2019 11:14 AM
> > > ...
> > > As I understand, the essence of the issue is: Hyper-V emulates the
> > > label mechanism (i.e. it supports _LSI and LSR), but doesn't do it
> > > right (i.e. it doesn't support _LSW).
> > >
> > > To manage the namespaces, Linux can choose to use label or not.
> > >
> > > If _LSI/_LSR are supported, Linux assumes _LSW is supported as well
> > > and chooses to use label (i.e. the label mode), but since Hyper-V doesn't
> > > support _LSW, Linux fails to change the namespace configuration.
> >
> > No, that's not quite right. The reason Linux does not see the fsdax
> > mode configuration is due to the missing "address abstraction GUID" in
> > the label produced the default Hyper-V configuration.
> Hi Dan,
> Do you mean NVDIMM_DAX_GUID?

Correct.

>
> > In label-less mode there is no "address abstraction GUID" to validate so
> > it falls back to just using the info-block directly.
> In the case of not using label storage area, as I understand the info-block
> resides in regular data storage area. Can you please tell me where exactly
> the info-block is and how its location is decided?
> And I suppose the info-block contains the NVDIMM_DAX_GUID?

The info-block always lives in the data-storage area, even with
labels. It's placed at namespace base address + 4K.

When labels are present, the label gives the namespace a uuid, and the
info-block "parent uuid" field must match that value. Without labels,
the "parent uuid" field is unused / filled with zeros. Also, with v1.2
labels, the address abstraction uuid must match the info-block type.
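
For reference, the info-block being described is struct nd_pfn_sb; an
abbreviated, paraphrased view is below (see drivers/nvdimm/pfn.h for the
authoritative layout):

    /* Abbreviated view of the fsdax info-block (struct nd_pfn_sb); padding,
     * alignment, and checksum fields are omitted here. */
    struct nd_pfn_sb {
    	u8 signature[PFN_SIG_LEN];	/* "NVDIMM_PFN_INFO" or "NVDIMM_DAX_INFO" */
    	u8 uuid[16];
    	u8 parent_uuid[16];		/* label uuid, or zeros when label-less */
    	__le32 flags;
    	__le16 version_major;
    	__le16 version_minor;
    	__le64 dataoff;			/* start of data relative to the namespace */
    	__le64 npfn;			/* number of data pages covered */
    	__le32 mode;			/* where struct pages live: pmem or RAM */
    	/* ... start_pad/end_trunc/align and a trailing checksum ... */
    };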

> I'm asking because I found I lose ~500MBytes of the 32GBytes Hyper-V
> NVDIMM device, when the namespace is in fsdax mode. When it's in
> raw mode, I'm able to use all of the 32GB space.

Yes. A portion of the capacity is reserved for page structures.

https://lwn.net/Articles/656197/
Dexuan Cui Feb. 6, 2019, 12:57 a.m. UTC | #11
> From: Dan Williams <dan.j.williams@intel.com>
> Sent: Tuesday, February 5, 2019 9:12 AM
> On Tue, Feb 5, 2019 at 8:53 AM Dexuan Cui <decui@microsoft.com> wrote:
> >
> > > From: Dan Williams <dan.j.williams@intel.com>
> > > Sent: Sunday, February 3, 2019 11:14 AM
> > > > ...
> > > > As I understand, the essence of the issue is: Hyper-V emulates the
> > > > label mechanism (i.e. it supports _LSI and LSR), but doesn't do it
> > > > right (i.e. it doesn't support _LSW).
> > > >
> > > > To manage the namespaces, Linux can choose to use label or not.
> > > >
> > > > If _LSI/_LSR are supported, Linux assumes _LSW is supported as well
> > > > and chooses to use label (i.e. the label mode), but since Hyper-V doesn't
> > > > support _LSW, Linux fails to change the namespace configuration.
> > >
> > > No, that's not quite right. The reason Linux does not see the fsdax
> > > mode configuration is due to the missing "address abstraction GUID" in
> > > the label produced the default Hyper-V configuration.
> > Hi Dan,
> > Do you mean NVDIMM_DAX_GUID?
> 
> Correct.
FYI: in create_namespace_pmem(), label0->abstraction_guid is guid_null in
the case of the Hyper-V NVDIMM. It looks like Hyper-V doesn't use the guid.

> > > In label-less mode there is no "address abstraction GUID" to validate so
> > > it falls back to just using the info-block directly.
> > In the case of not using label storage area, as I understand the info-block
> > resides in regular data storage area. Can you please tell me where exactly
> > the info-block is and how its location is decided?
> > And I suppose the info-block contains the NVDIMM_DAX_GUID?
> 
> The info-block always lives in the data-storage area, even with
> labels. It's placed at namespace base address + 4K.
>
> When labels are present the label gives the namespace a uuid and the
> info-block "parent uuid" field must match that value. Without labels
> the "parent uuid" field is unused / filled with zero's. Also with v1.2
> labels the address abstraction uuid must match the info-block type.
Thanks for elaborating on this!

It looks like the label mechanism is for advanced usages, e.g., a PMEM
namespace can consist of multiple NVDIMMs, or a namespace only uses part of
the capacity of an NVDIMM, or an NVDIMM contains both PMEM and Block-Mode
namespaces.

For a simple usage, e.g. if an NVDIMM only contains one PMEM namespace which
consumes all the storage space of the NVDIMM, it looks like the namespace can
be used normally with the help of the info-block, and we don't really need
the labels. IMO this is the case for Hyper-V, i.e. we don't use the labels
at all, since _LSW is not implemented.

> > I'm asking because I found I lose ~500MBytes of the 32GBytes Hyper-V
> > NVDIMM device, when the namespace is in fsdax mode. When it's in
> > raw mode, I'm able to use all of the 32GB space.
> 
> Yes. A portion of the capacity is reserved for page structures.
Got it. It looks like the size of the info-block (and the related padding?) is 2MB,
and "the overhead is 64-bytes per 4K (16GB per 1TB) on x86", so the total
overhead in my 32GB case is: 2MB + 32GB/(4096/64) = 514MB.
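
As a rough sanity check of that arithmetic (illustrative only; the exact
reservation also depends on alignment padding):

    /* Illustrative only: 64 bytes of struct page per 4K page of namespace,
     * plus roughly 2MB of info-block and alignment padding. */
    unsigned long long ns_size  = 32ULL << 30;			/* 32 GiB namespace    */
    unsigned long long map_size = ns_size / 4096 * 64;		/* = 512 MiB of pages  */
    unsigned long long overhead = map_size + (2ULL << 20);	/* ~= 514 MiB reserved */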

Thanks for sharing the link! Now I realize we can use the -M parameter
to avoid storing the page metadata in the NVDIMM:

-M; --map=
A pmem namespace in “fsdax” or “devdax” mode requires allocation of
per-page metadata. The allocation can be drawn from either:
“mem”: typical system memory
“dev”: persistent memory reserved from the namespace

Thanks,
--Dexuan

Patch

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 4a7e8b1fa43b..811c399a3a76 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2016,6 +2016,10 @@  static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 			cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
 		}
 
+		/* Quirk to ignore LOCAL for labels on HYPERV DIMMs */
+		if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
+			set_bit(NDD_NOBLK, &flags);
+
 		if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) {
 			set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
 			set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 4890310df874..186d63f16434 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -11,6 +11,7 @@ 
  * General Public License for more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/moduleparam.h>
 #include <linux/vmalloc.h>
 #include <linux/device.h>
 #include <linux/ndctl.h>
@@ -25,6 +26,10 @@ 
 
 static DEFINE_IDA(dimm_ida);
 
+static bool noblk;
+module_param(noblk, bool, 0444);
+MODULE_PARM_DESC(noblk, "force disable BLK / local alias support");
+
 /*
  * Retrieve bus and dimm handle and return if this bus supports
  * get_config_data commands
@@ -551,7 +556,7 @@  struct nvdimm *__nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 
 	nvdimm->dimm_id = dimm_id;
 	nvdimm->provider_data = provider_data;
-	nvdimm->flags = flags;
+	nvdimm->flags = flags | (noblk ? (1 << NDD_NOBLK) : 0);
 	nvdimm->cmd_mask = cmd_mask;
 	nvdimm->num_flush = num_flush;
 	nvdimm->flush_wpq = flush_wpq;
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 6d6e9a12150b..f3d753d3169c 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -392,6 +392,7 @@  int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 		return 0; /* no label, nothing to reserve */
 
 	for_each_clear_bit_le(slot, free, nslot) {
+		struct nvdimm *nvdimm = to_nvdimm(ndd->dev);
 		struct nd_namespace_label *nd_label;
 		struct nd_region *nd_region = NULL;
 		u8 label_uuid[NSLABEL_UUID_LEN];
@@ -406,6 +407,8 @@  int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 
 		memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
 		flags = __le32_to_cpu(nd_label->flags);
+		if (test_bit(NDD_NOBLK, &nvdimm->flags))
+			flags &= ~NSLABEL_FLAG_LOCAL;
 		nd_label_gen_id(&label_id, label_uuid, flags);
 		res = nvdimm_allocate_dpa(ndd, &label_id,
 				__le64_to_cpu(nd_label->dpa),
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 4b077555ac70..3677b0c4a33d 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -2492,6 +2492,12 @@  static int init_active_labels(struct nd_region *nd_region)
 			if (!label_ent)
 				break;
 			label = nd_label_active(ndd, j);
+			if (test_bit(NDD_NOBLK, &nvdimm->flags)) {
+				u32 flags = __le32_to_cpu(label->flags);
+
+				flags &= ~NSLABEL_FLAG_LOCAL;
+				label->flags = __cpu_to_le32(flags);
+			}
 			label_ent->label = label;
 
 			mutex_lock(&nd_mapping->lock);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e2818f94f292..3b58baa44b5c 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1003,6 +1003,13 @@  static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 		if (test_bit(NDD_UNARMED, &nvdimm->flags))
 			ro = 1;
+
+		if (test_bit(NDD_NOBLK, &nvdimm->flags)
+				&& dev_type == &nd_blk_device_type) {
+			dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not BLK capable\n",
+					caller, dev_name(&nvdimm->dev), i);
+			return NULL;
+		}
 	}
 
 	if (dev_type == &nd_blk_device_type) {
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 5440f11b0907..7da406ae3a2b 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -42,6 +42,8 @@  enum {
 	NDD_SECURITY_OVERWRITE = 3,
 	/*  tracking whether or not there is a pending device reference */
 	NDD_WORK_PENDING = 4,
+	/* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */
+	NDD_NOBLK = 5,
 
 	/* need to set a limit somewhere, but yes, this is likely overkill */
 	ND_IOCTL_MAX_BUFLEN = SZ_4M,