
[v2,2/3] libnvdimm, pmem: adjust for section collisions with 'System RAM'

Message ID 20160303215315.1014.95661.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived

Commit Message

Dan Williams March 3, 2016, 9:53 p.m. UTC
On a platform where 'Persistent Memory' and 'System RAM' are mixed
within a given sparsemem section, trim the namespace and notify about the
sub-optimal alignment.

Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/namespace_devs.c |    7 ++
 drivers/nvdimm/pfn.h            |   10 ++-
 drivers/nvdimm/pfn_devs.c       |    5 ++
 drivers/nvdimm/pmem.c           |  125 ++++++++++++++++++++++++++++-----------
 4 files changed, 111 insertions(+), 36 deletions(-)

Comments

Dan Williams March 5, 2016, 2:23 a.m. UTC | #1
On Fri, Mar 4, 2016 at 6:48 PM, Toshi Kani <toshi.kani@hpe.com> wrote:
> On Thu, 2016-03-03 at 13:53 -0800, Dan Williams wrote:
>> On a platform where 'Persistent Memory' and 'System RAM' are mixed
>> within a given sparsemem section, trim the namespace and notify about the
>> sub-optimal alignment.
>>
>> Cc: Toshi Kani <toshi.kani@hpe.com>
>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  drivers/nvdimm/namespace_devs.c |    7 ++
>>  drivers/nvdimm/pfn.h            |   10 ++-
>>  drivers/nvdimm/pfn_devs.c       |    5 ++
>>  drivers/nvdimm/pmem.c           |  125 ++++++++++++++++++++++++++++-----
>> ------
>>  4 files changed, 111 insertions(+), 36 deletions(-)
>>
>> diff --git a/drivers/nvdimm/namespace_devs.c
>> b/drivers/nvdimm/namespace_devs.c
>> index 8ebfcaae3f5a..463756ca2d4b 100644
>> --- a/drivers/nvdimm/namespace_devs.c
>> +++ b/drivers/nvdimm/namespace_devs.c
>> @@ -133,6 +133,7 @@ bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
>>  bool pmem_should_map_pages(struct device *dev)
>>  {
>>       struct nd_region *nd_region = to_nd_region(dev->parent);
>> +     struct nd_namespace_io *nsio;
>>
>>       if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
>>               return false;
>> @@ -143,6 +144,12 @@ bool pmem_should_map_pages(struct device *dev)
>>       if (is_nd_pfn(dev) || is_nd_btt(dev))
>>               return false;
>>
>> +     nsio = to_nd_namespace_io(dev);
>> +     if (region_intersects(nsio->res.start, resource_size(&nsio-
>> >res),
>> +                             IORESOURCE_SYSTEM_RAM,
>> +                             IORES_DESC_NONE) == REGION_MIXED)
>
> Should this be != REGION_DISJOINT for safe?

Actually, it's ok.  It doesn't need to be disjoint.  The problem is
mixing an mm-zone within a given section.  If the region intersects
system-ram then devm_memremap_pages() is a no-op and we can use the
existing page allocation and linear mapping.
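
For illustration, here is a minimal sketch of that reasoning against the three
possible region_intersects() results in this kernel; it only restates the
argument above, it is not the patch verbatim:

switch (region_intersects(nsio->res.start, resource_size(&nsio->res),
			IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)) {
case REGION_DISJOINT:
	/* pure pmem sections: devm_memremap_pages() can set up
	 * ZONE_DEVICE struct pages for the whole range */
	break;
case REGION_INTERSECTS:
	/* the range is already 'System RAM': struct pages and the
	 * linear mapping exist, devm_memremap_pages() is a no-op */
	break;
case REGION_MIXED:
	/* pmem and 'System RAM' share a sparsemem section; mapping
	 * pages would mix mm-zones within one section, so bail */
	return false;
}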

>
>> +             return false;
>> +
>
>  :
>
>> @@ -304,21 +311,56 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>>       }
>>
>>       memset(pfn_sb, 0, sizeof(*pfn_sb));
>> -     npfns = (pmem->size - SZ_8K) / SZ_4K;
>> +
>> +     /*
>> +      * Check if pmem collides with 'System RAM' when section aligned
>> and
>> +      * trim it accordingly
>> +      */
>> +     nsio = to_nd_namespace_io(&ndns->dev);
>> +     start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
>> +     size = resource_size(&nsio->res);
>> +     if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
>> +                             IORES_DESC_NONE) == REGION_MIXED) {
>> +
>> +             start = nsio->res.start;
>> +             start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
>> +     }
>> +
>> +     start = nsio->res.start;
>> +     size = PHYS_SECTION_ALIGN_UP(start + size) - start;
>> +     if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
>> +                             IORES_DESC_NONE) == REGION_MIXED) {
>> +             size = resource_size(&nsio->res);
>> +             end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start
>> + size);
>> +     }
>
> This check seems to assume that the guest's regular memory layout does not
> change.  That is, if there is no collision at first, there won't be any
> later.  Is this a valid assumption?

If platform firmware changes the physical alignment during the
lifetime of the namespace there's not much we can do.  Another problem
not addressed by this patch is firmware choosing to hot plug system
ram into the same section as persistent memory.  As far as I can see
all we do is ask firmware implementations to respect Linux section
boundaries and otherwise not change alignments.
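
To make the trimming in the hunk above concrete, consider a hypothetical
layout with 128 MiB (0x8000000) sections, 'System RAM' ending at 0x104000000,
and a 4 GiB pmem namespace starting right there:

/*
 *   'System RAM'       :  ...         - 0x104000000
 *   'Persistent Memory':  0x104000000 - 0x204000000
 *
 * The section holding the namespace start, [0x100000000, 0x108000000),
 * also holds the last 64 MiB of 'System RAM', so nd_pfn_init() pads:
 *
 *   start_pad = PHYS_SECTION_ALIGN_UP(0x104000000) - 0x104000000
 *             = 0x108000000 - 0x104000000 = 0x4000000 (64 MiB)
 *
 * Nothing else lives in the section holding the namespace end,
 * [0x200000000, 0x208000000), so end_trunc stays 0 and only the first
 * 64 MiB of the namespace are sacrificed to avoid the collision.
 */
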
Kani, Toshi March 5, 2016, 2:48 a.m. UTC | #2
On Thu, 2016-03-03 at 13:53 -0800, Dan Williams wrote:
> On a platform where 'Persistent Memory' and 'System RAM' are mixed
> within a given sparsemem section, trim the namespace and notify about the
> sub-optimal alignment.
> 
> Cc: Toshi Kani <toshi.kani@hpe.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/nvdimm/namespace_devs.c |    7 ++
>  drivers/nvdimm/pfn.h            |   10 ++-
>  drivers/nvdimm/pfn_devs.c       |    5 ++
>  drivers/nvdimm/pmem.c           |  125 ++++++++++++++++++++++++++++-----
> ------
>  4 files changed, 111 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/nvdimm/namespace_devs.c
> b/drivers/nvdimm/namespace_devs.c
> index 8ebfcaae3f5a..463756ca2d4b 100644
> --- a/drivers/nvdimm/namespace_devs.c
> +++ b/drivers/nvdimm/namespace_devs.c
> @@ -133,6 +133,7 @@ bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
>  bool pmem_should_map_pages(struct device *dev)
>  {
>  	struct nd_region *nd_region = to_nd_region(dev->parent);
> +	struct nd_namespace_io *nsio;
>  
>  	if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
>  		return false;
> @@ -143,6 +144,12 @@ bool pmem_should_map_pages(struct device *dev)
>  	if (is_nd_pfn(dev) || is_nd_btt(dev))
>  		return false;
>  
> +	nsio = to_nd_namespace_io(dev);
> +	if (region_intersects(nsio->res.start, resource_size(&nsio-
> >res),
> +				IORESOURCE_SYSTEM_RAM,
> +				IORES_DESC_NONE) == REGION_MIXED)

Should this be != REGION_DISJOINT for safe?

> +		return false;
> +

 :

> @@ -304,21 +311,56 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>  	}
>  
>  	memset(pfn_sb, 0, sizeof(*pfn_sb));
> -	npfns = (pmem->size - SZ_8K) / SZ_4K;
> +
> +	/*
> +	 * Check if pmem collides with 'System RAM' when section aligned
> and
> +	 * trim it accordingly
> +	 */
> +	nsio = to_nd_namespace_io(&ndns->dev);
> +	start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
> +	size = resource_size(&nsio->res);
> +	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> +				IORES_DESC_NONE) == REGION_MIXED) {
> +
> +		start = nsio->res.start;
> +		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
> +	}
> +
> +	start = nsio->res.start;
> +	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
> +	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> +				IORES_DESC_NONE) == REGION_MIXED) {
> +		size = resource_size(&nsio->res);
> +		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start
> + size);
> +	}

This check seems to assume that the guest's regular memory layout does not
change.  That is, if there is no collision at first, there won't be any
later.  Is this a valid assumption?

Thanks,
-Toshi
Dan Williams March 7, 2016, 5:18 p.m. UTC | #3
On Mon, Mar 7, 2016 at 9:56 AM, Toshi Kani <toshi.kani@hpe.com> wrote:
> On Fri, 2016-03-04 at 18:23 -0800, Dan Williams wrote:
>> On Fri, Mar 4, 2016 at 6:48 PM, Toshi Kani <toshi.kani@hpe.com> wrote:
[..]
>> As far as I can see
>> all we do is ask firmware implementations to respect Linux section
>> boundaries and otherwise not change alignments.
>
> In addition to the requirement that the pmem range alignment may not change,
> the code also requires that a regular memory range never changes to
> intersect with a pmem section later.  This seems fragile to me since the
> guest config may vary / change as I mentioned above.
>
> So, shouldn't the driver fail to attach when the range is not aligned to
> the section size?  Since we need to place a requirement on firmware anyway,
> we can simply state that it must be aligned to 128MiB (at least) on x86.
> Then, memory and pmem physical layouts can be changed as long as this
> requirement is met.

We can state that it must be aligned, but without a hard specification
I don't see how we can guarantee it.  We will fail the driver load
with a warning if our alignment fixups end up getting invalidated by a
later configuration change, but in the meantime we cover the gap of a
BIOS that has generated a problematic configuration.
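
A rough sketch of what that later "fail with a warning" check could look
like, assuming a hypothetical helper nd_pfn_recompute_fixups() that re-runs
the nd_pfn_init() trimming against the live resource tree (none of this is
in the patch below):

static int nd_pfn_check_fixups(struct nd_pfn *nd_pfn)
{
	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
	u32 now_pad, now_trunc;

	/* assumed helper: recompute start_pad/end_trunc for today's layout */
	nd_pfn_recompute_fixups(nd_pfn->ndns, &now_pad, &now_trunc);

	if (now_pad != __le32_to_cpu(pfn_sb->start_pad)
			|| now_trunc != __le32_to_cpu(pfn_sb->end_trunc)) {
		dev_warn(&nd_pfn->dev,
			"section collision changed since namespace creation\n");
		return -EINVAL;	/* refuse to attach rather than risk data */
	}
	return 0;
}
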
Kani, Toshi March 7, 2016, 5:56 p.m. UTC | #4
On Fri, 2016-03-04 at 18:23 -0800, Dan Williams wrote:
> On Fri, Mar 4, 2016 at 6:48 PM, Toshi Kani <toshi.kani@hpe.com> wrote:
> > On Thu, 2016-03-03 at 13:53 -0800, Dan Williams wrote:
> > > On a platform where 'Persistent Memory' and 'System RAM' are mixed
> > > within a given sparsemem section, trim the namespace and notify about
> > > the
> > > sub-optimal alignment.
> > > 
> > > Cc: Toshi Kani <toshi.kani@hpe.com>
> > > Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > > ---
> > >  drivers/nvdimm/namespace_devs.c |    7 ++
> > >  drivers/nvdimm/pfn.h            |   10 ++-
> > >  drivers/nvdimm/pfn_devs.c       |    5 ++
> > >  drivers/nvdimm/pmem.c           |  125 ++++++++++++++++++++++++++++-
> > > ----
> > > ------
> > >  4 files changed, 111 insertions(+), 36 deletions(-)
> > > 
> > > diff --git a/drivers/nvdimm/namespace_devs.c
> > > b/drivers/nvdimm/namespace_devs.c
> > > index 8ebfcaae3f5a..463756ca2d4b 100644
> > > --- a/drivers/nvdimm/namespace_devs.c
> > > +++ b/drivers/nvdimm/namespace_devs.c
> > > @@ -133,6 +133,7 @@ bool nd_is_uuid_unique(struct device *dev, u8
> > > *uuid)
> > >  bool pmem_should_map_pages(struct device *dev)
> > >  {
> > >       struct nd_region *nd_region = to_nd_region(dev->parent);
> > > +     struct nd_namespace_io *nsio;
> > > 
> > >       if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
> > >               return false;
> > > @@ -143,6 +144,12 @@ bool pmem_should_map_pages(struct device *dev)
> > >       if (is_nd_pfn(dev) || is_nd_btt(dev))
> > >               return false;
> > > 
> > > +     nsio = to_nd_namespace_io(dev);
> > > +     if (region_intersects(nsio->res.start, resource_size(&nsio-
> > > > res),
> > > +                             IORESOURCE_SYSTEM_RAM,
> > > +                             IORES_DESC_NONE) == REGION_MIXED)
> > 
> > Should this be != REGION_DISJOINT for safe?
> 
> Actually, it's ok.  It doesn't need to be disjoint.  The problem is
> mixing an mm-zone within a given section.  If the region intersects
> system-ram then devm_memremap_pages() is a no-op and we can use the
> existing page allocation and linear mapping.

Oh, I see.

> > 
> > > +             return false;
> > > +
> > 
> >  :
> > 
> > > @@ -304,21 +311,56 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
> > >       }
> > > 
> > >       memset(pfn_sb, 0, sizeof(*pfn_sb));
> > > -     npfns = (pmem->size - SZ_8K) / SZ_4K;
> > > +
> > > +     /*
> > > +      * Check if pmem collides with 'System RAM' when section
> > > aligned
> > > and
> > > +      * trim it accordingly
> > > +      */
> > > +     nsio = to_nd_namespace_io(&ndns->dev);
> > > +     start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
> > > +     size = resource_size(&nsio->res);
> > > +     if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> > > +                             IORES_DESC_NONE) == REGION_MIXED) {
> > > +
> > > +             start = nsio->res.start;
> > > +             start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
> > > +     }
> > > +
> > > +     start = nsio->res.start;
> > > +     size = PHYS_SECTION_ALIGN_UP(start + size) - start;
> > > +     if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> > > +                             IORES_DESC_NONE) == REGION_MIXED) {
> > > +             size = resource_size(&nsio->res);
> > > +             end_trunc = start + size -
> > > PHYS_SECTION_ALIGN_DOWN(start
> > > + size);
> > > +     }
> > 
> > This check seems to assume that the guest's regular memory layout does not
> > change.  That is, if there is no collision at first, there won't be any
> > later.  Is this a valid assumption?
> 
> If platform firmware changes the physical alignment during the
> lifetime of the namespace there's not much we can do.  

The physical alignment can be changed as long as it is large enough (see
below).

> Another problem
> not addressed by this patch is firmware choosing to hot plug system
> ram into the same section as persistent memory.  

Yes, and it does not have to be a hot-plug operation.  The memory size may be
changed off-line.  A data image can be copied to different guests for instant
deployment, or may be migrated to a different guest.

> As far as I can see
> all we do is ask firmware implementations to respect Linux section
> boundaries and otherwise not change alignments.

In addition to the requirement that the pmem range alignment may not change,
the code also requires that a regular memory range never changes to intersect
with a pmem section later.  This seems fragile to me since the guest config
may vary / change as I mentioned above.

So, shouldn't the driver fail to attach when the range is not aligned to the
section size?  Since we need to place a requirement on firmware anyway, we
can simply state that it must be aligned to 128MiB (at least) on x86.
Then, memory and pmem physical layouts can be changed as long as this
requirement is met.

Thanks,
-Toshi
Dan Williams March 7, 2016, 6:19 p.m. UTC | #5
On Mon, Mar 7, 2016 at 10:58 AM, Toshi Kani <toshi.kani@hpe.com> wrote:
> On Mon, 2016-03-07 at 09:18 -0800, Dan Williams wrote:
>> On Mon, Mar 7, 2016 at 9:56 AM, Toshi Kani <toshi.kani@hpe.com> wrote:
>> > On Fri, 2016-03-04 at 18:23 -0800, Dan Williams wrote:
>> > > On Fri, Mar 4, 2016 at 6:48 PM, Toshi Kani <toshi.kani@hpe.com>
>> > > wrote:
>> [..]
>> > > As far as I can see
>> > > all we do is ask firmware implementations to respect Linux section
>> > > boundaries and otherwise not change alignments.
>> >
>> > In addition to the requirement that the pmem range alignment may not
>> > change, the code also requires that a regular memory range never changes
>> > to intersect with a pmem section later.  This seems fragile to me since
>> > the guest config may vary / change as I mentioned above.
>> >
>> > So, shouldn't the driver fail to attach when the range is not aligned
>> > to the section size?  Since we need to place a requirement on firmware
>> > anyway, we can simply state that it must be aligned to 128MiB (at
>> > least) on x86.  Then, memory and pmem physical layouts can be changed
>> > as long as this requirement is met.
>>
>> We can state that it must be aligned, but without a hard specification
>> I don't see how we can guarantee it.  We will fail the driver load
>> with a warning if our alignment fixups end up getting invalidated by a
>> later configuration change, but in the meantime we cover the gap of a
>> BIOS that has generated a problematic configuration.
>
> I do not think it has to be stated in the spec (although it may be a good
> idea to state it as an implementation note :-).
>
> This is an OS-unique requirement (and the size is x86-specific): if the
> platform wants to support the Linux pmem pfn feature, then the alignment
> needs to be at least 128MiB.  Regular pmem does not have this restriction,
> but it needs to be aligned to 2MiB or 1GiB to use huge page mappings,
> which does not have to be stated in the spec, either.

We can check that the alignment is correct when the namespace is first
instantiated, but we're still stuck if the configuration ever changes.

> For KVM to support the pmem pfn feature on x86, it needs to guarantee this
> 128MiB alignment.  Otherwise, this feature is not supported.  (I do not
> worry about NVDIMM-N since it is naturally aligned by its size.)
>
> If we allow unaligned cases, then the driver needs to detect a change from
> the initial condition and fail to attach in order to protect data.  I did
> not see such a check in the code, but I may have overlooked it.  We cannot
> check whether KVM guarantees to keep the alignment of the initial setup,
> though.

devm_memremap_pages() will fail if the driver tries to pass in an
unaligned address [1] ...and now that I look again, that patch
mishandles aligning the 'size'; will fix.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2016-February/004729.html
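
The guard being referenced is roughly of the following shape (a sketch only,
with the 'size' aligned from the section-aligned end as the follow-up fix
implies, not the linked patch verbatim):

	resource_size_t align_start = res->start & ~((1UL << PA_SECTION_SHIFT) - 1);
	resource_size_t align_size = ALIGN(res->start + resource_size(res),
			1UL << PA_SECTION_SHIFT) - align_start;
	int is_ram = region_intersects(align_start, align_size,
			IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);

	if (is_ram == REGION_MIXED)
		return ERR_PTR(-ENXIO);	/* pmem shares a section with 'System RAM' */
	if (is_ram == REGION_INTERSECTS)
		return __va(res->start);	/* already mapped RAM, nothing to remap */
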
Dan Williams March 7, 2016, 6:37 p.m. UTC | #6
[ adding Haozhong and Xiao for the alignment concerns below ]

On Mon, Mar 7, 2016 at 10:58 AM, Toshi Kani <toshi.kani@hpe.com> wrote:
> On Mon, 2016-03-07 at 09:18 -0800, Dan Williams wrote:
>> On Mon, Mar 7, 2016 at 9:56 AM, Toshi Kani <toshi.kani@hpe.com> wrote:
>> > On Fri, 2016-03-04 at 18:23 -0800, Dan Williams wrote:
>> > > On Fri, Mar 4, 2016 at 6:48 PM, Toshi Kani <toshi.kani@hpe.com>
>> > > wrote:
>> [..]
>> > > As far as I can see
>> > > all we do is ask firmware implementations to respect Linux section
>> > > boundaries and otherwise not change alignments.
>> >
>> > In addition to the requirement that the pmem range alignment may not
>> > change, the code also requires that a regular memory range never changes
>> > to intersect with a pmem section later.  This seems fragile to me since
>> > the guest config may vary / change as I mentioned above.
>> >
>> > So, shouldn't the driver fail to attach when the range is not aligned
>> > to the section size?  Since we need to place a requirement on firmware
>> > anyway, we can simply state that it must be aligned to 128MiB (at
>> > least) on x86.  Then, memory and pmem physical layouts can be changed
>> > as long as this requirement is met.
>>
>> We can state that it must be aligned, but without a hard specification
>> I don't see how we can guarantee it.  We will fail the driver load
>> with a warning if our alignment fixups end up getting invalidated by a
>> later configuration change, but in the meantime we cover the gap of a
>> BIOS that has generated a problematic configuration.
>
> I do not think it has to be stated in the spec (although it may be a good
> idea to state it as an implementation note :-).
>
> This is an OS-unique requirement (and the size is x86-specific): if the
> platform wants to support the Linux pmem pfn feature, then the alignment
> needs to be at least 128MiB.  Regular pmem does not have this restriction,
> but it needs to be aligned to 2MiB or 1GiB to use huge page mappings,
> which does not have to be stated in the spec, either.
>
> For KVM to support the pmem pfn feature on x86, it needs to guarantee this
> 128MiB alignment.  Otherwise, this feature is not supported.  (I do not
> worry about NVDIMM-N since it is naturally aligned by its size.)
>
> If we allow unaligned cases, then the driver needs to detect a change from
> the initial condition and fail to attach in order to protect data.  I did
> not see such a check in the code, but I may have overlooked it.  We cannot
> check whether KVM guarantees to keep the alignment of the initial setup,
> though.
>
> Thanks,
> -Toshi
Kani, Toshi March 7, 2016, 6:58 p.m. UTC | #7
On Mon, 2016-03-07 at 09:18 -0800, Dan Williams wrote:
> On Mon, Mar 7, 2016 at 9:56 AM, Toshi Kani <toshi.kani@hpe.com> wrote:
> > On Fri, 2016-03-04 at 18:23 -0800, Dan Williams wrote:
> > > On Fri, Mar 4, 2016 at 6:48 PM, Toshi Kani <toshi.kani@hpe.com>
> > > wrote:
> [..]
> > > As far as I can see
> > > all we do is ask firmware implementations to respect Linux section
> > > boundaries and otherwise not change alignments.
> > 
> > In addition to the requirement that pmem range alignment may not
> > change, the code also requires a regular memory range does not change
> > to intersect with a pmem section later.  This seems fragile to me since
> > guest config may vary / change as I mentioned above.
> > 
> > So, shouldn't the driver fails to attach when the range is not aligned
> > by the section size?  Since we need to place a requirement to firmware
> > anyway, we can simply state that it must be aligned by 128MiB (at
> > least) on x86.  Then, memory and pmem physical layouts can be changed
> > as long as this requirement is met.
> 
> We can state that it must be aligned, but without a hard specification
> I don't see how we can guarantee it.  We will fail the driver load
> with a warning if our alignment fixups end up getting invalidated by a
> later configuration change, but in the meantime we cover the gap of a
> BIOS that has generated a problematic configuration.

I do not think it has to be stated in the spec (although it may be a good
idea to state it as an implementation note :-).

This is an OS-unique requirement (and the size is x86-specific): if the
platform wants to support the Linux pmem pfn feature, then the alignment
needs to be at least 128MiB.  Regular pmem does not have this restriction,
but it needs to be aligned to 2MiB or 1GiB to use huge page mappings, which
does not have to be stated in the spec, either.
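
For reference, the arithmetic behind these numbers, assuming the x86_64
defaults of this era:

/*
 *   SECTION_SIZE_BITS == 27  =>  one sparsemem section
 *                                == 1UL << 27 == 0x8000000 == 128 MiB
 *   PMD_SIZE == 2 MiB, PUD_SIZE == 1 GiB  (huge page mapping sizes)
 *
 * So pfn (struct page) mode needs at least section (128 MiB) alignment,
 * while plain pmem only cares about huge-page alignment.
 */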

For KVM to support the pmem pfn feature on x86, it needs to guarantee this
128MiB alignment.  Otherwise, this feature is not supported.  (I do not
worry about NVDIMM-N since it is naturally aligned by its size.)

If we allow unaligned cases, then the driver needs to detect a change from
the initial condition and fail to attach in order to protect data.  I did not
see such a check in the code, but I may have overlooked it.  We cannot check
whether KVM guarantees to keep the alignment of the initial setup, though.

Thanks,
-Toshi

Patch

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 8ebfcaae3f5a..463756ca2d4b 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -133,6 +133,7 @@  bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
 bool pmem_should_map_pages(struct device *dev)
 {
 	struct nd_region *nd_region = to_nd_region(dev->parent);
+	struct nd_namespace_io *nsio;
 
 	if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
 		return false;
@@ -143,6 +144,12 @@  bool pmem_should_map_pages(struct device *dev)
 	if (is_nd_pfn(dev) || is_nd_btt(dev))
 		return false;
 
+	nsio = to_nd_namespace_io(dev);
+	if (region_intersects(nsio->res.start, resource_size(&nsio->res),
+				IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED)
+		return false;
+
 #ifdef ARCH_MEMREMAP_PMEM
 	return ARCH_MEMREMAP_PMEM == MEMREMAP_WB;
 #else
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 6ee707e5b279..8e343a3ca873 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -27,10 +27,13 @@  struct nd_pfn_sb {
 	__le32 flags;
 	__le16 version_major;
 	__le16 version_minor;
-	__le64 dataoff;
+	__le64 dataoff; /* relative to namespace_base + start_pad */
 	__le64 npfns;
 	__le32 mode;
-	u8 padding[4012];
+	/* minor-version-1 additions for section alignment */
+	__le32 start_pad;
+	__le32 end_trunc;
+	u8 padding[4004];
 	__le64 checksum;
 };
 
@@ -45,4 +48,7 @@  struct nd_pfn_sb {
 #define PFN_SECTION_ALIGN_DOWN(x) (x)
 #define PFN_SECTION_ALIGN_UP(x) (x)
 #endif
+
+#define PHYS_SECTION_ALIGN_DOWN(x) PFN_PHYS(PFN_SECTION_ALIGN_DOWN(PHYS_PFN(x)))
+#define PHYS_SECTION_ALIGN_UP(x) PFN_PHYS(PFN_SECTION_ALIGN_UP(PHYS_PFN(x)))
 #endif /* __NVDIMM_PFN_H */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 0cc9048b86e2..14642617a153 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -299,6 +299,11 @@  int nd_pfn_validate(struct nd_pfn *nd_pfn)
 	if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0)
 		return -ENODEV;
 
+	if (__le16_to_cpu(pfn_sb->version_minor) < 1) {
+		pfn_sb->start_pad = 0;
+		pfn_sb->end_trunc = 0;
+	}
+
 	switch (le32_to_cpu(pfn_sb->mode)) {
 	case PFN_MODE_RAM:
 		break;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 68a20b2e3d03..15ec290bd23b 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -43,7 +43,10 @@  struct pmem_device {
 	phys_addr_t		data_offset;
 	unsigned long		pfn_flags;
 	void __pmem		*virt_addr;
+	/* immutable base size of the namespace */
 	size_t			size;
+	/* trim size when namespace capacity has been section aligned */
+	u32			pfn_pad;
 	struct badblocks	bb;
 };
 
@@ -145,7 +148,7 @@  static long pmem_direct_access(struct block_device *bdev, sector_t sector,
 	*kaddr = pmem->virt_addr + offset;
 	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
-	return pmem->size - offset;
+	return pmem->size - pmem->pfn_pad - offset;
 }
 
 static const struct block_device_operations pmem_fops = {
@@ -236,7 +239,8 @@  static int pmem_attach_disk(struct device *dev,
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
 	disk->driverfs_dev = dev;
-	set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
+	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
+			/ 512);
 	pmem->pmem_disk = disk;
 	devm_exit_badblocks(dev, &pmem->bb);
 	if (devm_init_badblocks(dev, &pmem->bb))
@@ -279,6 +283,9 @@  static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
 	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	u32 start_pad = 0, end_trunc = 0;
+	resource_size_t start, size;
+	struct nd_namespace_io *nsio;
 	struct nd_region *nd_region;
 	unsigned long npfns;
 	phys_addr_t offset;
@@ -304,21 +311,56 @@  static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	}
 
 	memset(pfn_sb, 0, sizeof(*pfn_sb));
-	npfns = (pmem->size - SZ_8K) / SZ_4K;
+
+	/*
+	 * Check if pmem collides with 'System RAM' when section aligned and
+	 * trim it accordingly
+	 */
+	nsio = to_nd_namespace_io(&ndns->dev);
+	start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
+	size = resource_size(&nsio->res);
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED) {
+
+		start = nsio->res.start;
+		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
+	}
+
+	start = nsio->res.start;
+	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED) {
+		size = resource_size(&nsio->res);
+		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
+	}
+
+	if (start_pad + end_trunc)
+		dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
+				dev_name(&ndns->dev), start_pad + end_trunc);
+
 	/*
 	 * Note, we use 64 here for the standard size of struct page,
 	 * debugging options may cause it to be larger in which case the
 	 * implementation will limit the pfns advertised through
 	 * ->direct_access() to those that are included in the memmap.
 	 */
+	start += start_pad;
+	npfns = (pmem->size - start_pad - end_trunc - SZ_8K) / SZ_4K;
 	if (nd_pfn->mode == PFN_MODE_PMEM)
-		offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
+		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
+			- start;
 	else if (nd_pfn->mode == PFN_MODE_RAM)
-		offset = ALIGN(SZ_8K, nd_pfn->align);
+		offset = ALIGN(start + SZ_8K, nd_pfn->align) - start;
 	else
 		goto err;
 
-	npfns = (pmem->size - offset) / SZ_4K;
+	if (offset + start_pad + end_trunc >= pmem->size) {
+		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
+				dev_name(&ndns->dev));
+		goto err;
+	}
+
+	npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K;
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
@@ -326,6 +368,9 @@  static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
 	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
 	pfn_sb->version_major = cpu_to_le16(1);
+	pfn_sb->version_minor = cpu_to_le16(1);
+	pfn_sb->start_pad = cpu_to_le32(start_pad);
+	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
 	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
 	pfn_sb->checksum = cpu_to_le64(checksum);
 
@@ -376,41 +421,36 @@  static unsigned long init_altmap_reserve(resource_size_t base)
 	return reserve;
 }
 
-static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
+static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 {
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
-	struct device *dev = &nd_pfn->dev;
-	struct nd_region *nd_region;
-	struct vmem_altmap *altmap;
-	struct nd_pfn_sb *pfn_sb;
-	struct pmem_device *pmem;
-	struct request_queue *q;
-	phys_addr_t offset;
 	int rc;
+	struct resource res;
+	struct request_queue *q;
+	struct pmem_device *pmem;
+	struct vmem_altmap *altmap;
+	struct device *dev = &nd_pfn->dev;
+	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+	u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	resource_size_t base = nsio->res.start + start_pad;
 	struct vmem_altmap __altmap = {
-		.base_pfn = init_altmap_base(nsio->res.start),
-		.reserve = init_altmap_reserve(nsio->res.start),
+		.base_pfn = init_altmap_base(base),
+		.reserve = init_altmap_reserve(base),
 	};
 
-	if (!nd_pfn->uuid || !nd_pfn->ndns)
-		return -ENODEV;
-
-	nd_region = to_nd_region(dev->parent);
-	rc = nd_pfn_init(nd_pfn);
-	if (rc)
-		return rc;
-
-	pfn_sb = nd_pfn->pfn_sb;
-	offset = le64_to_cpu(pfn_sb->dataoff);
+	pmem = dev_get_drvdata(dev);
+	pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
+	pmem->pfn_pad = start_pad + end_trunc;
 	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
 	if (nd_pfn->mode == PFN_MODE_RAM) {
-		if (offset < SZ_8K)
+		if (pmem->data_offset < SZ_8K)
 			return -EINVAL;
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
 		altmap = NULL;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = (resource_size(&nsio->res) - offset)
+		nd_pfn->npfns = (pmem->size - pmem->pfn_pad - pmem->data_offset)
 			/ PAGE_SIZE;
 		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
 			dev_info(&nd_pfn->dev,
@@ -418,7 +458,7 @@  static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 					le64_to_cpu(nd_pfn->pfn_sb->npfns),
 					nd_pfn->npfns);
 		altmap = & __altmap;
-		altmap->free = __phys_to_pfn(offset - SZ_8K);
+		altmap->free = __phys_to_pfn(pmem->data_offset - SZ_8K);
 		altmap->alloc = 0;
 	} else {
 		rc = -ENXIO;
@@ -426,10 +466,12 @@  static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 	}
 
 	/* establish pfn range for lookup, and switch to direct map */
-	pmem = dev_get_drvdata(dev);
 	q = pmem->pmem_queue;
+	memcpy(&res, &nsio->res, sizeof(res));
+	res.start += start_pad;
+	res.end -= end_trunc;
 	devm_memunmap(dev, (void __force *) pmem->virt_addr);
-	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
+	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res,
 			&q->q_usage_counter, altmap);
 	pmem->pfn_flags |= PFN_MAP;
 	if (IS_ERR(pmem->virt_addr)) {
@@ -438,7 +480,6 @@  static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 	}
 
 	/* attach pmem disk in "pfn-mode" */
-	pmem->data_offset = offset;
 	rc = pmem_attach_disk(dev, ndns, pmem);
 	if (rc)
 		goto err;
@@ -447,6 +488,22 @@  static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
  err:
 	nvdimm_namespace_detach_pfn(ndns);
 	return rc;
+
+}
+
+static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
+{
+	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
+	int rc;
+
+	if (!nd_pfn->uuid || !nd_pfn->ndns)
+		return -ENODEV;
+
+	rc = nd_pfn_init(nd_pfn);
+	if (rc)
+		return rc;
+	/* we need a valid pfn_sb before we can init a vmem_altmap */
+	return __nvdimm_namespace_attach_pfn(nd_pfn);
 }
 
 static int nd_pmem_probe(struct device *dev)