
[v5,2/5] allow mapping page-less memremaped areas into KVA

Message ID 20150813030109.36703.21738.stgit@otcpl-skl-sds-2.jf.intel.com (mailing list archive)
State Superseded

Commit Message

Dan Williams Aug. 13, 2015, 3:01 a.m. UTC
Introduce a type that encapsulates a page-frame-number that can also be
used to encode other information.  This other information is the
traditional "page_link" encoding in a scatterlist, but can also denote
"device memory", i.e. a set of pfns that are not part of the kernel's
linear mapping but are accessed via the same memory controller as ram.
The motivation for this conversion is large capacity persistent memory
that, by default, does not enjoy 'struct page' coverage (entries in
'memmap').

This type will be used to replace 'struct page *' usage in cases where
only a pfn is required, i.e. scatterlists for drivers, the dma mapping
api, and potentially biovecs for the block layer.  The operations in
those i/o paths that formerly required a 'struct page *' are converted
to use __pfn_t aware equivalent helpers.

It turns out that while 'struct page' references are used broadly in the
kernel I/O stacks the usage of 'struct page' based capabilities is very
shallow for block-i/o.  It is only used for populating bio_vecs and
scatterlists for the retrieval of dma addresses, and for temporary
kernel mappings (kmap).  Aside from kmap, these usages can be trivially
converted to operate on a pfn.

kmap_atomic(), however, is more problematic as it uses mm
infrastructure, via struct page, to set up and track temporary kernel
mappings.  It would be unfortunate if the kmap infrastructure escaped
its 32-bit/HIGHMEM bonds and leaked into 64-bit code.  Thankfully, it
seems all that is needed here is to convert kmap_atomic() callers that
want to opt in to supporting persistent memory to use a new
kmap_atomic_pfn_t(), which re-uses the existing ioremap() mapping
established by the driver for persistent memory.
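
As a sketch of the intended opt-in flow (a hypothetical pmem-style
driver; error handling is elided, and the memremap() call merely
stands in for however the driver established its linear mapping):

    /* at probe: advertise the pre-existing linear mapping */
    base = memremap(res->start, resource_size(res), MEMREMAP_WB);
    rc = devm_register_kmap_pfn_range(dev, res, base);

    /* in the i/o path, for a __pfn_t that may lack a struct page */
    void *addr = kmap_atomic_pfn_t(pfn);

    if (addr) {
        memcpy(addr + offset, buf, len);
        kunmap_atomic_pfn_t(addr);
    }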

Note that, as far as conceptually understanding __pfn_t is concerned,
'persistent memory' is really any address range in host memory not
covered by memmap.  Contrast this with pure iomem that is on an
mmio-mapped bus like PCI and cannot be converted to a dma_addr_t by
"pfn << PAGE_SHIFT".

To keep kmap out of 64-bit code, when the user has enabled
CONFIG_KMAP_PFN we direct the kmap_atomic_pfn_t() implementation to
scan a list of pre-mapped persistent memory address ranges inserted by
the pmem driver.

The __pfn_t to resource lookup is admittedly an inefficient walk of a
linked list, but there are two mitigating factors:

1/ The number of persistent memory ranges is bounded by the number of
   DIMMs, which is on the order of 10s of DIMMs, not hundreds.

2/ The lookup yields the entire range; if it becomes inefficient to do
   a kmap_atomic_pfn_t() one PAGE_SIZE at a time, the caller can take
   advantage of the fact that the lookup can be amortized for all kmap
   operations it needs to perform in a given range (see the sketch
   below).
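
As a sketch of that amortization (caller code, not part of this patch;
it assumes the pfns are contiguous within a single registered range,
whose mapping is linear, and relies on the single rcu_read_lock()
being held from map to unmap):

    void *addr = kmap_atomic_pfn_t(pfn);    /* one list walk */

    for (i = 0; addr && i < npages; i++)
        memcpy(dst + i * PAGE_SIZE, addr + i * PAGE_SIZE, PAGE_SIZE);
    kunmap_atomic_pfn_t(addr);              /* drops rcu_read_lock() */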

[hch: various changes]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/kmap_pfn.h |   31 ++++++++++++
 include/linux/mm.h       |   57 ++++++++++++++++++++++
 mm/Kconfig               |    3 +
 mm/Makefile              |    1 
 mm/kmap_pfn.c            |  117 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 209 insertions(+)
 create mode 100644 include/linux/kmap_pfn.h
 create mode 100644 mm/kmap_pfn.c

Comments

Boaz Harrosh Aug. 13, 2015, 5:58 a.m. UTC | #1
On 08/13/2015 06:01 AM, Dan Williams wrote:
[..]
> +void *kmap_atomic_pfn_t(__pfn_t pfn)
> +{
> +	struct page *page = __pfn_t_to_page(pfn);
> +	resource_size_t addr;
> +	struct kmap *kmap;
> +
> +	rcu_read_lock();
> +	if (page)
> +		return kmap_atomic(page);

Right, so even with pages I pay rcu_read_lock() for every access?

> +	addr = __pfn_t_to_phys(pfn);
> +	list_for_each_entry_rcu(kmap, &ranges, list)
> +		if (addr >= kmap->res->start && addr <= kmap->res->end)
> +			return kmap->base + addr - kmap->res->start;
> +

Good god! This loop is a real *joke*. You have just dropped memory
access performance by 10 fold.

The whole point of pages and memory_model.h was to have a one-to-one
relationship between kernel-virtual vs physical vs page *.

There is already an object that holds a relationship of physical to
kernel-virtual. It is called a memory-section. Why not just widen its
definition?

If you are willing to accept this loop in a current 2015 Linux kernel,
then I have nothing further to say.

Boaz - go mourning for the death of the Linux Kernel alone in the corner ;-(

> +	/* only unlock in the error case */
> +	rcu_read_unlock();
> +	return NULL;
> +}
> +EXPORT_SYMBOL(kmap_atomic_pfn_t);
> +
> +void kunmap_atomic_pfn_t(void *addr)
> +{
> +	struct kmap *kmap;
> +	bool dev_pfn = false;
> +
> +	if (!addr)
> +		return;
> +
> +	/*
> +	 * If the original __pfn_t had an entry in the memmap (i.e.
> +	 * !PFN_DEV) then 'addr' will be outside of the registered
> +	 * ranges and we'll need to kunmap_atomic() it.
> +	 */
> +	list_for_each_entry_rcu(kmap, &ranges, list)
> +		if (addr < kmap->base + resource_size(kmap->res)
> +				&& addr >= kmap->base) {
> +			dev_pfn = true;
> +			break;
> +		}
> +
> +	if (!dev_pfn)
> +		kunmap_atomic(addr);
> +
> +	/* signal that we are done with the range */
> +	rcu_read_unlock();
> +}
> +EXPORT_SYMBOL(kunmap_atomic_pfn_t);
Dan Williams Aug. 13, 2015, 12:57 p.m. UTC | #2
On Wed, Aug 12, 2015 at 10:58 PM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 08/13/2015 06:01 AM, Dan Williams wrote:
[..]
>> +void *kmap_atomic_pfn_t(__pfn_t pfn)
>> +{
>> +     struct page *page = __pfn_t_to_page(pfn);
>> +     resource_size_t addr;
>> +     struct kmap *kmap;
>> +
>> +     rcu_read_lock();
>> +     if (page)
>> +             return kmap_atomic(page);
>
> Right, so even with pages I pay rcu_read_lock() for every access?
>
>> +     addr = __pfn_t_to_phys(pfn);
>> +     list_for_each_entry_rcu(kmap, &ranges, list)
>> +             if (addr >= kmap->res->start && addr <= kmap->res->end)
>> +                     return kmap->base + addr - kmap->res->start;
>> +
>
> Good god! This loop is a real *joke*. You have just dropped memory
> access performance by 10 fold.
>
> The whole point of pages and memory_model.h was to have a one-to-one
> relationship between kernel-virtual vs physical vs page *.
>
> There is already an object that holds a relationship of physical to
> kernel-virtual. It is called a memory-section. Why not just widen its
> definition?
>
> If you are willing to accept this loop in a current 2015 Linux kernel,
> then I have nothing further to say.
>
> Boaz - go mourning for the death of the Linux Kernel alone in the corner ;-(
>

This is explicitly addressed in the changelog, repeated here:

> The __pfn_t to resource lookup is admittedly an inefficient walk of a
> linked list, but there are two mitigating factors:
>
> 1/ The number of persistent memory ranges is bounded by the number of
>    DIMMs, which is on the order of 10s of DIMMs, not hundreds.
>
> 2/ The lookup yields the entire range; if it becomes inefficient to do
>    a kmap_atomic_pfn_t() one PAGE_SIZE at a time, the caller can take
>    advantage of the fact that the lookup can be amortized for all kmap
>    operations it needs to perform in a given range.

DAX as-is races against pmem unbind.  A synchronization cost must
be paid somewhere to make sure the memremap() mapping is still valid.
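
Concretely, that cost here is the rcu pairing in this patch (the same
code as above, restated as a protocol):

    /* reader (i/o path) */
    kmap_atomic_pfn_t(pfn);        /* rcu_read_lock() + range lookup */
    ...                            /* access while the mapping is pinned */
    kunmap_atomic_pfn_t(addr);     /* rcu_read_unlock() */

    /* writer (device unbind) */
    list_del_rcu(&kmap->list);     /* new lookups miss the range */
    synchronize_rcu();             /* wait out in-flight kmaps */
    kfree(kmap);                   /* now safe to tear down the mapping */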
Boaz Harrosh Aug. 13, 2015, 1:23 p.m. UTC | #3
On 08/13/2015 03:57 PM, Dan Williams wrote:
<>
> This is explicitly addressed in the changelog, repeated here:
> 
>> The __pfn_t to resource lookup is admittedly an inefficient walk of
>> a linked list, but there are two mitigating factors:
>>
>> 1/ The number of persistent memory ranges is bounded by the number of
>>    DIMMs, which is on the order of 10s of DIMMs, not hundreds.
>>

You do not get where I'm coming from. It used to be a
[ptr - ONE_BASE + OTHER_BASE] (in 64-bit); it is now a call and a loop
and a search. However you look at it, it is *not* the instantaneous
address translation we have today.

I have memory and I want memory speeds. You keep thinking HD speeds,
where whatever you do will not matter.

>> 2/ The lookup yields the entire range; if it becomes inefficient to do
>>    a kmap_atomic_pfn_t() one PAGE_SIZE at a time, the caller can take
>>    advantage of the fact that the lookup can be amortized for all kmap
>>    operations it needs to perform in a given range.
> 

What "given range" how can a bdev assume that the all sg-list belongs to the
same "range". In fact our code does multple-pmem devices for a long time.
What about say md-of-pmems for example, or btrfs

> DAX as-is races against pmem unbind.  A synchronization cost must
> be paid somewhere to make sure the memremap() mapping is still valid.

Sorry for being so slow, but that is what I asked: what exactly is
"pmem unbind"?

Currently in my 4.1 kernel the ioremap is done at modprobe time and
released at modprobe --remove time. The --remove cannot happen with a
mounted FS, dax or not. So what exactly is "pmem unbind"? And if there
is a new knob, then make it refuse with a raised refcount.

Cheers
Boaz
Christoph Hellwig Aug. 13, 2015, 2:37 p.m. UTC | #4
Hi Boaz,

can you please fix your quoting?  I read down about 10 pages but still
couldn't find a comment from you.  For now I gave up on this mail.
Christoph Hellwig Aug. 13, 2015, 2:41 p.m. UTC | #5
On Thu, Aug 13, 2015 at 04:23:38PM +0300, Boaz Harrosh wrote:
> > DAX as-is races against pmem unbind.  A synchronization cost must
> > be paid somewhere to make sure the memremap() mapping is still valid.
> 
> Sorry for being so slow, but that is what I asked: what exactly is
> "pmem unbind"?
> 
> Currently in my 4.1 kernel the ioremap is done at modprobe time and
> released at modprobe --remove time. The --remove cannot happen with a
> mounted FS, dax or not. So what exactly is "pmem unbind"? And if there
> is a new knob, then make it refuse with a raised refcount.

Surprise removal of a PCIe card which is mapped to provide non-volatile
memory for example.  Or memory hot swap.
Boaz Harrosh Aug. 13, 2015, 2:48 p.m. UTC | #6
On 08/13/2015 05:37 PM, Christoph Hellwig wrote:
> Hi Boaz,
> 
> can you please fix your quoting?  I read down about 10 pages but still
> couldn't find a comment from you.  For now I gave up on this mail.
> 

Sorry here:

> +void *kmap_atomic_pfn_t(__pfn_t pfn)
> +{
> +	struct page *page = __pfn_t_to_page(pfn);
> +	resource_size_t addr;
> +	struct kmap *kmap;
> +
> +	rcu_read_lock();
> +	if (page)
> +		return kmap_atomic(page);

Right, so even with pages I pay rcu_read_lock() for every access?

> +	addr = __pfn_t_to_phys(pfn);
> +	list_for_each_entry_rcu(kmap, &ranges, list)
> +		if (addr >= kmap->res->start && addr <= kmap->res->end)
> +			return kmap->base + addr - kmap->res->start;
> +

Good god! This loop is a real *joke*. You have just dropped memory
access performance by 10 fold.

The whole point of pages and memory_model.h was to have a one-to-one
relationship between kernel-virtual vs physical vs page *.

There is already an object that holds a relationship of physical to
kernel-virtual. It is called a memory-section. Why not just widen its
definition?

If you are willing to accept this loop in a current 2015 Linux kernel,
then I have nothing further to say.

Boaz - go mourning for the death of the Linux Kernel alone in the corner ;-(

> +	/* only unlock in the error case */
> +	rcu_read_unlock();
> +	return NULL;
> +}
> +EXPORT_SYMBOL(kmap_atomic_pfn_t);
> +
Boaz Harrosh Aug. 13, 2015, 3:01 p.m. UTC | #7
On 08/13/2015 05:41 PM, Christoph Hellwig wrote:
> On Thu, Aug 13, 2015 at 04:23:38PM +0300, Boaz Harrosh wrote:
>>> DAX as-is races against pmem unbind.  A synchronization cost must
>>> be paid somewhere to make sure the memremap() mapping is still valid.
>>
>> Sorry for being so slow, but that is what I asked: what exactly is
>> "pmem unbind"?
>>
>> Currently in my 4.1 kernel the ioremap is done at modprobe time and
>> released at modprobe --remove time. The --remove cannot happen with a
>> mounted FS, dax or not. So what exactly is "pmem unbind"? And if there
>> is a new knob, then make it refuse with a raised refcount.
> 
> Surprise removal of a PCIe card which is mapped to provide non-volatile
> memory for example.  Or memory hot swap.
> 

Then the mapping is just there and you get garbage. Just the same as
with "memory hot swap": the kernel will not let you HOT-REMOVE
referenced memory. It will just refuse. If you forcefully remove a
swappable memory chip without a HOT-REMOVE first, what will happen?
The same here.

SW-wise you refuse to HOT-REMOVE. HW-wise, BTW, the kernel will not
die; further reads will return all 1s and writes will go to the
ether.

The whole kmap thing was for highmem. That is not the case here.

Again see my other comment at dax mmap:

- you do a pfn_map and take a pfn
- kpfn_unmap
- put the pfn in a user mmap vma
- then what happens to user access after that? Nothing, not even a
  page_fault. It will have a vm-mapping to a now non-existent physical
  address, that's it.

Thanks
Boaz
Boaz Harrosh Aug. 13, 2015, 3:29 p.m. UTC | #8
On 08/13/2015 05:48 PM, Boaz Harrosh wrote:
<>
> There is already an object that holds a relationship of physical
> to Kernel-virtual. It is called a memory-section. Why not just
> widen its definition?
> 

BTW: Regarding the "widen its definition"

I was thinking of two possible new models here:

[1 - A page-less memory section]
- Keep the 64-bit physical-to-kernel-virtual hard-coded relationship
- Allocate a section object, but this section object does not have any
  pages, it is only the header. (You need it for the pmd/pmt thing)

  Lots of things just work now if you make sure you do not go through
  a page struct. This needs no extra work; I have done this in the
  past. All you need is to do your ioremap through
  map_kernel_range_noflush(__va(), ....)

[2 - A small page-struct]
- Like above, but each entry in the new section object is a small,
  one-ulong-sized entry holding just flags.

 Then if !(p->flags & PAGE_SPECIAL) page = container_of(p, struct page, flags)

 This model is good because you actually have your pfn_to_page and
 page_to_pfn and need not touch the sg-list or bio. And it is only 8
 bytes per frame instead of 64 bytes.
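
 Roughly, as a sketch (the section_entry() helper and PAGE_SPECIAL
 flag here are made-up illustrations, not existing kernel API):

    unsigned long *p = section_entry(pfn);  /* one ulong per frame */

    /* works because 'flags' is the first member of struct page */
    if (!(*p & PAGE_SPECIAL))
        page = container_of(p, struct page, flags);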


But I still think that the best long-term model is variable-size
pages, where a page* can be 2M or 1G. Again, an extra flag and a
widened section definition. It is about time we moved to bigger pages
throughout, but still kept the 4k page-cache-dirty granularity.

Thanks
Boaz
Matthew Wilcox Aug. 13, 2015, 5:35 p.m. UTC | #9
On Wed, Aug 12, 2015 at 11:01:09PM -0400, Dan Williams wrote:
> +static inline __pfn_t page_to_pfn_t(struct page *page)
> +{
> +	__pfn_t pfn = { .val = page_to_pfn(page) << PAGE_SHIFT, };
> +
> +	return pfn;
> +}

static inline __pfn_t page_to_pfn_t(struct page *page)
{
	__pfn_t __pfn;
	unsigned long pfn = page_to_pfn(page);
	BUG_ON(pfn > (-1UL >> PFN_SHIFT));
	__pfn.val = pfn << PFN_SHIFT;

	return __pfn;
}

I have a problem with PFN_SHIFT being equal to PAGE_SHIFT.  Consider a
32-bit kernel; you're asserting that no memory represented by a struct
page can have a physical address above 4GB.
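
The arithmetic, spelled out for a 32-bit unsigned long: with
PFN_SHIFT == PAGE_SHIFT == 12 the stored pfn gets 20 bits, so the
largest encodable physical address is 2^20 * 2^12 = 2^32 = 4GB; with
PFN_SHIFT == 6 the pfn gets 26 bits, for 2^26 * 2^12 = 2^38 = 256GB.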

You only need three bits for flags so far ... how about making PFN_SHIFT
be 6?  That supports physical addresses up to 2^38 (256GB).  That should
be enough, but hardware designers have done some strange things in the
past (I know that HP made PA-RISC hardware that can run 32-bit kernels
with memory between 64GB and 68GB, and they can't be the only strange
hardware people out there).
Dave Hansen Aug. 13, 2015, 5:37 p.m. UTC | #10
On 08/13/2015 07:48 AM, Boaz Harrosh wrote:
> There is already an object that holds a relationship of physical
> to Kernel-virtual. It is called a memory-section. Why not just
> widen its definition?

Memory sections are purely there to map physical address ranges back to
metadata about them.  *Originally* for 'struct page', but widened a bit
subsequently.  But, it's *never* been connected to kernel-virtual
addresses in any way that I can think of.

So, that's a curious statement.
Dan Williams Aug. 13, 2015, 6:15 p.m. UTC | #11
On Thu, Aug 13, 2015 at 10:35 AM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Wed, Aug 12, 2015 at 11:01:09PM -0400, Dan Williams wrote:
>> +static inline __pfn_t page_to_pfn_t(struct page *page)
>> +{
>> +     __pfn_t pfn = { .val = page_to_pfn(page) << PAGE_SHIFT, };
>> +
>> +     return pfn;
>> +}
>
> static inline __pfn_t page_to_pfn_t(struct page *page)
> {
>         __pfn_t __pfn;
>         unsigned long pfn = page_to_pfn(page);
>         BUG_ON(pfn > (-1UL >> PFN_SHIFT));
>         __pfn.val = pfn << PFN_SHIFT;
>
>         return __pfn;
> }
>
> I have a problem with PFN_SHIFT being equal to PAGE_SHIFT.  Consider a
> 32-bit kernel; you're asserting that no memory represented by a struct
> page can have a physical address above 4GB.
>
> You only need three bits for flags so far ... how about making PFN_SHIFT
> be 6?  That supports physical addresses up to 2^38 (256GB).  That should
> be enough, but hardware designers have done some strange things in the
> past (I know that HP made PA-RISC hardware that can run 32-bit kernels
> with memory between 64GB and 68GB, and they can't be the only strange
> hardware people out there).

Sounds good, especially given we only use 4 bits today.
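
As a sketch of what the decoupled encoding might look like in a
follow-up (illustrative only, not part of this v5 patch):

    enum {
        PFN_SHIFT = 6,                  /* flags confined to bits 0-5 */
        PFN_MASK = (1UL << PFN_SHIFT) - 1,
        PFN_SG_CHAIN = (1UL << 0),
        PFN_SG_LAST = (1UL << 1),
        PFN_DEV = (1UL << 2),
    };

    static inline unsigned long __pfn_t_to_pfn(__pfn_t pfn)
    {
        /* 26-bit pfn on 32-bit => up to 2^38 bytes (256GB) physical */
        return pfn.val >> PFN_SHIFT;
    }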

Patch

diff --git a/include/linux/kmap_pfn.h b/include/linux/kmap_pfn.h
new file mode 100644
index 000000000000..fa44971d8e95
--- /dev/null
+++ b/include/linux/kmap_pfn.h
@@ -0,0 +1,31 @@ 
+#ifndef _LINUX_KMAP_PFN_H
+#define _LINUX_KMAP_PFN_H 1
+
+#include <linux/highmem.h>
+
+struct device;
+struct resource;
+#ifdef CONFIG_KMAP_PFN
+extern void *kmap_atomic_pfn_t(__pfn_t pfn);
+extern void kunmap_atomic_pfn_t(void *addr);
+extern int devm_register_kmap_pfn_range(struct device *dev,
+		struct resource *res, void *base);
+#else
+static inline void *kmap_atomic_pfn_t(__pfn_t pfn)
+{
+	return kmap_atomic(__pfn_t_to_page(pfn));
+}
+
+static inline void kunmap_atomic_pfn_t(void *addr)
+{
+	__kunmap_atomic(addr);
+}
+
+static inline int devm_register_kmap_pfn_range(struct device *dev,
+		struct resource *res, void *base)
+{
+	return 0;
+}
+#endif /* CONFIG_KMAP_PFN */
+
+#endif /* _LINUX_KMAP_PFN_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 84b05ebedb2d..57ba5ca6be72 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -924,6 +924,63 @@  static inline void set_page_links(struct page *page, enum zone_type zone,
 }
 
 /*
+ * __pfn_t: encapsulates a page-frame number that is optionally backed
+ * by memmap (struct page).  This type will be used in place of a
+ * 'struct page *' instance in contexts where unmapped memory (usually
+ * persistent memory) is being referenced (scatterlists for drivers,
+ * biovecs for the block layer, etc).  Whether a __pfn_t has a struct
+ * page backing is indicated by flags in the low bits of the value;
+ */
+typedef struct {
+	unsigned long val;
+} __pfn_t;
+
+/*
+ * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry
+ * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
+ * PFN_DEV - pfn is not covered by system memmap
+ */
+enum {
+	PFN_MASK = (1UL << PAGE_SHIFT) - 1,
+	PFN_SG_CHAIN = (1UL << 0),
+	PFN_SG_LAST = (1UL << 1),
+#ifdef CONFIG_KMAP_PFN
+	PFN_DEV = (1UL << 2),
+#else
+	PFN_DEV = 0,
+#endif
+};
+
+static inline bool __pfn_t_has_page(__pfn_t pfn)
+{
+	return (pfn.val & PFN_DEV) == 0;
+}
+
+static inline unsigned long __pfn_t_to_pfn(__pfn_t pfn)
+{
+	return pfn.val >> PAGE_SHIFT;
+}
+
+static inline struct page *__pfn_t_to_page(__pfn_t pfn)
+{
+	if (!__pfn_t_has_page(pfn))
+		return NULL;
+	return pfn_to_page(__pfn_t_to_pfn(pfn));
+}
+
+static inline dma_addr_t __pfn_t_to_phys(__pfn_t pfn)
+{
+	return __pfn_to_phys(__pfn_t_to_pfn(pfn));
+}
+
+static inline __pfn_t page_to_pfn_t(struct page *page)
+{
+	__pfn_t pfn = { .val = page_to_pfn(page) << PAGE_SHIFT, };
+
+	return pfn;
+}
+
+/*
  * Some inline functions in vmstat.h depend on page_zone()
  */
 #include <linux/vmstat.h>
diff --git a/mm/Kconfig b/mm/Kconfig
index e79de2bd12cd..ed1be8ff982e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -654,3 +654,6 @@  config DEFERRED_STRUCT_PAGE_INIT
 	  when kswapd starts. This has a potential performance impact on
 	  processes running early in the lifetime of the systemm until kswapd
 	  finishes the initialisation.
+
+config KMAP_PFN
+	bool
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eaeabdcb..f7b27958ea69 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@  obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_KMAP_PFN) += kmap_pfn.o
diff --git a/mm/kmap_pfn.c b/mm/kmap_pfn.c
new file mode 100644
index 000000000000..2d58e167dfbc
--- /dev/null
+++ b/mm/kmap_pfn.c
@@ -0,0 +1,117 @@ 
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/highmem.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+static LIST_HEAD(ranges);
+static DEFINE_MUTEX(register_lock);
+
+struct kmap {
+	struct list_head list;
+	struct resource *res;
+	struct device *dev;
+	void *base;
+};
+
+static void teardown_kmap(void *data)
+{
+	struct kmap *kmap = data;
+
+	dev_dbg(kmap->dev, "kmap unregister %pr\n", kmap->res);
+	mutex_lock(&register_lock);
+	list_del_rcu(&kmap->list);
+	mutex_unlock(&register_lock);
+	synchronize_rcu();
+	kfree(kmap);
+}
+
+int devm_register_kmap_pfn_range(struct device *dev, struct resource *res,
+		void *base)
+{
+	struct kmap *kmap = kzalloc(sizeof(*kmap), GFP_KERNEL);
+	int rc;
+
+	if (!kmap)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&kmap->list);
+	kmap->res = res;
+	kmap->base = base;
+	kmap->dev = dev;
+	rc = devm_add_action(dev, teardown_kmap, kmap);
+	if (rc) {
+		kfree(kmap);
+		return rc;
+	}
+	dev_dbg(kmap->dev, "kmap register %pr\n", kmap->res);
+
+	mutex_lock(&register_lock);
+	list_add_rcu(&kmap->list, &ranges);
+	mutex_unlock(&register_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_register_kmap_pfn_range);
+
+void *kmap_atomic_pfn_t(__pfn_t pfn)
+{
+	struct page *page = __pfn_t_to_page(pfn);
+	resource_size_t addr;
+	struct kmap *kmap;
+
+	rcu_read_lock();
+	if (page)
+		return kmap_atomic(page);
+	addr = __pfn_t_to_phys(pfn);
+	list_for_each_entry_rcu(kmap, &ranges, list)
+		if (addr >= kmap->res->start && addr <= kmap->res->end)
+			return kmap->base + addr - kmap->res->start;
+
+	/* only unlock in the error case */
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL(kmap_atomic_pfn_t);
+
+void kunmap_atomic_pfn_t(void *addr)
+{
+	struct kmap *kmap;
+	bool dev_pfn = false;
+
+	if (!addr)
+		return;
+
+	/*
+	 * If the original __pfn_t had an entry in the memmap (i.e.
+	 * !PFN_DEV) then 'addr' will be outside of the registered
+	 * ranges and we'll need to kunmap_atomic() it.
+	 */
+	list_for_each_entry_rcu(kmap, &ranges, list)
+		if (addr < kmap->base + resource_size(kmap->res)
+				&& addr >= kmap->base) {
+			dev_pfn = true;
+			break;
+		}
+
+	if (!dev_pfn)
+		kunmap_atomic(addr);
+
+	/* signal that we are done with the range */
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(kunmap_atomic_pfn_t);