diff mbox

[v2,4/4] arch, x86: cache management apis for persistent memory

Message ID 20150530185940.32590.37804.stgit@dwillia2-desk3.amr.corp.intel.com
State Superseded
Headers show

Commit Message

Dan Williams May 30, 2015, 6:59 p.m. UTC
From: Ross Zwisler <ross.zwisler@linux.intel.com>

Based on an original patch by Ross Zwisler [1].

Writes to persistent memory have the potential to be posted to cpu
cache, cpu write buffers, and platform write buffers (memory controller)
before being committed to persistent media.  Provide apis
(persistent_copy() and persistent_sync()) to copy data and assert that
it is durable in PMEM (a persistent linear address range).

[1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
[djbw: s/arch_persistent_flush()/io_flush_cache_range()/]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/Kconfig                  |    1 
 arch/x86/include/asm/cacheflush.h |   24 ++++++++++
 arch/x86/include/asm/io.h         |    6 ++
 drivers/block/pmem.c              |   58 ++++++++++++++++++++++-
 include/linux/pmem.h              |   93 +++++++++++++++++++++++++++++++++++++
 lib/Kconfig                       |    3 +
 6 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/pmem.h

Comments

Paul Bolle June 1, 2015, 9:19 a.m. UTC | #1
On Sat, 2015-05-30 at 14:59 -0400, Dan Williams wrote:
> --- a/lib/Kconfig
> +++ b/lib/Kconfig

> +config ARCH_HAS_PMEM_API
> +	def_bool n

'n' is the default anyway, so I think 

config ARCH_HAS_PMEM_API
	bool

should work just as well.


Paul Bolle
Boaz Harrosh June 1, 2015, 11:39 a.m. UTC | #2
On 05/30/2015 09:59 PM, Dan Williams wrote:
> From: Ross Zwisler <ross.zwisler@linux.intel.com>
> 
> Based on an original patch by Ross Zwisler [1].
> 
> Writes to persistent memory have the potential to be posted to cpu
> cache, cpu write buffers, and platform write buffers (memory controller)
> before being committed to persistent media.  Provide apis
> (persistent_copy() and persistent_sync()) to copy data and assert that
> it is durable in PMEM (a persistent linear address range).
> 
> [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html
> 
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> [djbw: s/arch_persistent_flush()/io_flush_cache_range()/]
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  arch/x86/Kconfig                  |    1 
>  arch/x86/include/asm/cacheflush.h |   24 ++++++++++
>  arch/x86/include/asm/io.h         |    6 ++
>  drivers/block/pmem.c              |   58 ++++++++++++++++++++++-
>  include/linux/pmem.h              |   93 +++++++++++++++++++++++++++++++++++++
>  lib/Kconfig                       |    3 +
>  6 files changed, 183 insertions(+), 2 deletions(-)
>  create mode 100644 include/linux/pmem.h
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 73a4d0330ad0..6412d92e6f1e 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -102,6 +102,7 @@ config X86
>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>  	select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
>  	select ARCH_HAS_SG_CHAIN
> +	select ARCH_HAS_PMEM_API
>  	select CLKEVT_I8253
>  	select ARCH_HAVE_NMI_SAFE_CMPXCHG
>  	select GENERIC_IOMAP
> diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
> index b6f7457d12e4..6b8bd5c43bf6 100644
> --- a/arch/x86/include/asm/cacheflush.h
> +++ b/arch/x86/include/asm/cacheflush.h
> @@ -4,6 +4,7 @@
>  /* Caches aren't brain-dead on the intel. */
>  #include <asm-generic/cacheflush.h>
>  #include <asm/special_insns.h>
> +#include <asm/uaccess.h>
>  
>  /*
>   * The set_memory_* API can be used to change various attributes of a virtual
> @@ -108,4 +109,27 @@ static inline int rodata_test(void)
>  }
>  #endif
>  
> +static inline void arch_persistent_copy(void *dst, const void *src, size_t n)
> +{
> +	/*
> +	 * We are copying between two kernel buffers, if
> +	 * __copy_from_user_inatomic_nocache() returns an error (page
> +	 * fault) we would have already taken an unhandled fault before
> +	 * the BUG_ON.  The BUG_ON is simply here to satisfy
> +	 * __must_check and allow reuse of the common non-temporal store
> +	 * implementation for persistent_copy().
> +	 */
> +	BUG_ON(__copy_from_user_inatomic_nocache(dst, src, n));
> +}
> +
> +static inline void arch_persistent_sync(void)
> +{
> +	wmb();
> +	pcommit_sfence();
> +}
> +
> +static inline bool __arch_has_persistent_sync(void)
> +{
> +	return boot_cpu_has(X86_FEATURE_PCOMMIT);
> +}
>  #endif /* _ASM_X86_CACHEFLUSH_H */
> diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
> index 956f2768bdc1..f3c32bb207cf 100644
> --- a/arch/x86/include/asm/io.h
> +++ b/arch/x86/include/asm/io.h
> @@ -250,6 +250,12 @@ static inline void flush_write_buffers(void)
>  #endif
>  }
>  
> +static inline void *arch_persistent_remap(resource_size_t offset,
> +	unsigned long size)
> +{
> +	return (void __force *) ioremap_cache(offset, size);
> +}
> +
>  #endif /* __KERNEL__ */
>  
>  extern void native_io_delay(void);
> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
> index 799acff6bd7c..10cbe557165c 100644
> --- a/drivers/block/pmem.c
> +++ b/drivers/block/pmem.c
> @@ -23,9 +23,16 @@
>  #include <linux/module.h>
>  #include <linux/moduleparam.h>
>  #include <linux/slab.h>
> +#include <linux/pmem.h>
>  
>  #define PMEM_MINORS		16
>  
> +struct pmem_ops {
> +	void *(*remap)(resource_size_t offset, unsigned long size);
> +	void (*copy)(void *dst, const void *src, size_t size);
> +	void (*sync)(void);

What? why the ops at all see below ...

> +};
> +
>  struct pmem_device {
>  	struct request_queue	*pmem_queue;
>  	struct gendisk		*pmem_disk;
> @@ -34,11 +41,54 @@ struct pmem_device {
>  	phys_addr_t		phys_addr;
>  	void			*virt_addr;
>  	size_t			size;
> +	struct pmem_ops		ops;
>  };
>  
>  static int pmem_major;
>  static atomic_t pmem_index;
>  
> +static void default_pmem_sync(void)
> +{
> +	wmb();
> +}
> +
> +static void default_pmem_copy(void *dst, const void *src, size_t size)
> +{
> +	memcpy(dst, src, size);
> +}
> +
> +static void pmem_ops_default_init(struct pmem_device *pmem)
> +{
> +	/*
> +	 * These defaults seek to offer decent performance and minimize
> +	 * the window between i/o completion and writes being durable on
> +	 * media.  However, it is undefined / architecture specific
> +	 * whether acknowledged data may be lost in transit if a power
> +	 * fail occurs after bio_endio().
> +	 */
> +	pmem->ops.remap = memremap_wt;
> +	pmem->ops.copy = default_pmem_copy;
> +	pmem->ops.sync = default_pmem_sync;
> +}
> +
> +static bool pmem_ops_init(struct pmem_device *pmem)
> +{
> +	if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) &&
> +			arch_has_persistent_sync()) {

I must be slow and stupid but it looks to me like:

if arch_has_persistent_sync == false then persistent_sync
can then just be the default above wmb, and
	if (something_always_off())
		do
Will be always faster then a function pointer. This
if can be in the generic implementation of persistent_sync
and be done with.

Then persistent_copy() can just have an inline generic
implementation of your memcpy above, and the WARN_ON_ONCE
or what ever you want.

And no need for any opt vector and function pointers call out.

And also for me the all arch_has_persistent_sync() is mute,
the arches that have members of its family not support some
fixture can do the if (always_false_or_true) thing and do
not pollute the global name space. All we need is a single
switch as above
#ifdef CONFIG_ARCH_HAS_PMEM_API
	=> arch_persistent_sync
#else
	=> wmb
#endif

> +		/*
> +		 * This arch + cpu guarantees that bio_endio() == data
> +		 * durable on media.
> +		 */
> +		pmem->ops.remap = persistent_remap;
> +		pmem->ops.copy = persistent_copy;
> +		pmem->ops.sync = persistent_sync;
> +		return true;
> +	}
> +
> +	pmem_ops_default_init(pmem);
> +	return false;
> +}
> +
>  static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
>  			unsigned int len, unsigned int off, int rw,
>  			sector_t sector)
> @@ -51,7 +101,7 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
>  		flush_dcache_page(page);
>  	} else {
>  		flush_dcache_page(page);
> -		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
> +		pmem->ops.copy(pmem->virt_addr + pmem_off, mem + off, len);
>  	}
>  
>  	kunmap_atomic(mem);
> @@ -82,6 +132,8 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
>  		sector += bvec.bv_len >> 9;
>  	}
>  
> +	if (rw)
> +		pmem->ops.sync();
>  out:
>  	bio_endio(bio, err);
>  }
> @@ -131,6 +183,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>  
>  	pmem->phys_addr = res->start;
>  	pmem->size = resource_size(res);
> +	if (!pmem_ops_init(pmem))

fine just #ifndef CONFIG_ARCH_HAS_PMEM_API

> +		dev_warn(dev, "unable to guarantee persistence of writes\n");

But I wouldn't even bother I'd just put a WARN_ON_ONCE inside
persistent_copy() on any real use pmem or not and be done with it.

>  
>  	err = -EINVAL;
>  	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
> @@ -143,7 +197,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>  	 * of the CPU caches in case of a crash.
>  	 */
>  	err = -ENOMEM;
> -	pmem->virt_addr = memremap_wt(pmem->phys_addr, pmem->size);
> +	pmem->virt_addr = pmem->ops.remap(pmem->phys_addr, pmem->size);
>  	if (!pmem->virt_addr)
>  		goto out_release_region;
>  
> diff --git a/include/linux/pmem.h b/include/linux/pmem.h
> new file mode 100644
> index 000000000000..e9a63ee1d361
> --- /dev/null
> +++ b/include/linux/pmem.h
> @@ -0,0 +1,93 @@
> +/*
> + * Copyright(c) 2015 Intel Corporation. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of version 2 of the GNU General Public License as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + */
> +#ifndef __PMEM_H__
> +#define __PMEM_H__
> +
> +#include <linux/io.h>
> +#include <asm/cacheflush.h>
> +
> +/*
> + * Architectures that define ARCH_HAS_PMEM_API must provide
> + * implementations for arch_persistent_remap(), arch_persistent_copy(),
> + * arch_persistent_sync(), and __arch_has_persistent_sync().
> + */
> +
> +#ifdef CONFIG_ARCH_HAS_PMEM_API
> +/**
> + * persistent_remap - map physical persistent memory for pmem api
> + * @offset: physical address of persistent memory
> + * @size: size of the mapping
> + *
> + * Establish a mapping of the architecture specific memory type expected
> + * by persistent_copy() and persistent_sync().  For example, it may be
> + * the case that an uncacheable or writethrough mapping is sufficient,
> + * or a writeback mapping provided persistent_copy() and
> + * persistent_sync() arrange for the data to be written through the
> + * cache to persistent media.
> + */
> +static inline void *persistent_remap(resource_size_t offset, unsigned long size)
> +{
> +	return arch_persistent_remap(offset, size);
> +}
> +
> +/**
> + * persistent_copy - copy data to persistent memory
> + * @dst: destination buffer for the copy
> + * @src: source buffer for the copy
> + * @n: length of the copy in bytes
> + *
> + * Perform a memory copy that results in the destination of the copy
> + * being effectively evicted from, or never written to, the processor
> + * cache hierarchy after the copy completes.  After persistent_copy()
> + * data may still reside in cpu or platform buffers, so this operation
> + * must be followed by a persistent_sync().
> + */
> +static inline void persistent_copy(void *dst, const void *src, size_t n)
> +{
> +	arch_persistent_copy(dst, src, n);
> +}
> +
> +/**
> + * persistent_sync - synchronize writes to persistent memory
> + *
> + * After a series of persistent_copy() operations this drains data from
> + * cpu write buffers and any platform (memory controller) buffers to
> + * ensure that written data is durable on persistent memory media.
> + */
> +static inline void persistent_sync(void)
> +{
> +	arch_persistent_sync();
> +}
> +
> +/**
> + * arch_has_persistent_sync - true if persistent_sync() ensures durability
> + *
> + * For a given cpu implementation within an architecture it is possible
> + * that persistent_sync() resolves to a nop.  In the case this returns
> + * false, pmem api users are unable to ensure durabilty and may want to
> + * fall back to a different data consistency model, or otherwise notify
> + * the user.
> + */
> +static inline bool arch_has_persistent_sync(void)

Again this can just go inside the arch in question.
Those arches without a choice need not bother

> +{
> +	return __arch_has_persistent_sync();
> +}
> +#else
> +/* undefined symbols */
> +extern void *persistent_remap(resource_size_t offet, unsigned long size);
> +extern void persistent_copy(void *dst, const void *src, size_t n);
> +extern void persistent_sync(void);

Define these to the generic imp you have in pmem.c (memcpy && wmb)
After all drivers/block/pmem.c is just as generic as this here place

> +extern bool arch_has_persistent_sync(void);
> +#endif /* CONFIG_ARCH_HAS_PMEM_API */
> +
> +#endif /* __PMEM_H__ */
> diff --git a/lib/Kconfig b/lib/Kconfig
> index 601965a948e8..e6a3c892d514 100644
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -522,4 +522,7 @@ source "lib/fonts/Kconfig"
>  config ARCH_HAS_SG_CHAIN
>  	def_bool n
>  
> +config ARCH_HAS_PMEM_API
> +	def_bool n
> +
>  endmenu
> 

Cheers
Boaz
Boaz Harrosh June 1, 2015, 11:44 a.m. UTC | #3
Forgot one thing

On 06/01/2015 02:39 PM, Boaz Harrosh wrote:
>> +static inline void persistent_copy(void *dst, const void *src, size_t n)

Could we please make this
memcpy_persistent

Same as:
copy_from_user_nocache

The generic name of what it does first then the special override.
copy_from_user_XXX is same as copy_from_user but with XXX applied

Same here exactly as memcpy_ but with persistent applied.

<>

Thanks
Boaz
Dan Williams June 1, 2015, 4:07 p.m. UTC | #4
On Mon, Jun 1, 2015 at 4:44 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> Forgot one thing
>
> On 06/01/2015 02:39 PM, Boaz Harrosh wrote:
>>> +static inline void persistent_copy(void *dst, const void *src, size_t n)
>
> Could we please make this
> memcpy_persistent
>
> Same as:
> copy_from_user_nocache
>
> The generic name of what it does first then the special override.
> copy_from_user_XXX is same as copy_from_user but with XXX applied
>
> Same here exactly as memcpy_ but with persistent applied.

Ok, makes sense.
Dan Williams June 1, 2015, 4:22 p.m. UTC | #5
On Mon, Jun 1, 2015 at 4:39 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 05/30/2015 09:59 PM, Dan Williams wrote:
>> From: Ross Zwisler <ross.zwisler@linux.intel.com>
>>
>> Based on an original patch by Ross Zwisler [1].
>>
>> Writes to persistent memory have the potential to be posted to cpu
>> cache, cpu write buffers, and platform write buffers (memory controller)
>> before being committed to persistent media.  Provide apis
>> (persistent_copy() and persistent_sync()) to copy data and assert that
>> it is durable in PMEM (a persistent linear address range).
>>
>> [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html
>>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
>> [djbw: s/arch_persistent_flush()/io_flush_cache_range()/]
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  arch/x86/Kconfig                  |    1
>>  arch/x86/include/asm/cacheflush.h |   24 ++++++++++
>>  arch/x86/include/asm/io.h         |    6 ++
>>  drivers/block/pmem.c              |   58 ++++++++++++++++++++++-
>>  include/linux/pmem.h              |   93 +++++++++++++++++++++++++++++++++++++
>>  lib/Kconfig                       |    3 +
>>  6 files changed, 183 insertions(+), 2 deletions(-)
>>  create mode 100644 include/linux/pmem.h
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 73a4d0330ad0..6412d92e6f1e 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -102,6 +102,7 @@ config X86
>>       select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>>       select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
>>       select ARCH_HAS_SG_CHAIN
>> +     select ARCH_HAS_PMEM_API
>>       select CLKEVT_I8253
>>       select ARCH_HAVE_NMI_SAFE_CMPXCHG
>>       select GENERIC_IOMAP
>> diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
>> index b6f7457d12e4..6b8bd5c43bf6 100644
>> --- a/arch/x86/include/asm/cacheflush.h
>> +++ b/arch/x86/include/asm/cacheflush.h
>> @@ -4,6 +4,7 @@
>>  /* Caches aren't brain-dead on the intel. */
>>  #include <asm-generic/cacheflush.h>
>>  #include <asm/special_insns.h>
>> +#include <asm/uaccess.h>
>>
>>  /*
>>   * The set_memory_* API can be used to change various attributes of a virtual
>> @@ -108,4 +109,27 @@ static inline int rodata_test(void)
>>  }
>>  #endif
>>
>> +static inline void arch_persistent_copy(void *dst, const void *src, size_t n)
>> +{
>> +     /*
>> +      * We are copying between two kernel buffers, if
>> +      * __copy_from_user_inatomic_nocache() returns an error (page
>> +      * fault) we would have already taken an unhandled fault before
>> +      * the BUG_ON.  The BUG_ON is simply here to satisfy
>> +      * __must_check and allow reuse of the common non-temporal store
>> +      * implementation for persistent_copy().
>> +      */
>> +     BUG_ON(__copy_from_user_inatomic_nocache(dst, src, n));
>> +}
>> +
>> +static inline void arch_persistent_sync(void)
>> +{
>> +     wmb();
>> +     pcommit_sfence();
>> +}
>> +
>> +static inline bool __arch_has_persistent_sync(void)
>> +{
>> +     return boot_cpu_has(X86_FEATURE_PCOMMIT);
>> +}
>>  #endif /* _ASM_X86_CACHEFLUSH_H */
>> diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
>> index 956f2768bdc1..f3c32bb207cf 100644
>> --- a/arch/x86/include/asm/io.h
>> +++ b/arch/x86/include/asm/io.h
>> @@ -250,6 +250,12 @@ static inline void flush_write_buffers(void)
>>  #endif
>>  }
>>
>> +static inline void *arch_persistent_remap(resource_size_t offset,
>> +     unsigned long size)
>> +{
>> +     return (void __force *) ioremap_cache(offset, size);
>> +}
>> +
>>  #endif /* __KERNEL__ */
>>
>>  extern void native_io_delay(void);
>> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
>> index 799acff6bd7c..10cbe557165c 100644
>> --- a/drivers/block/pmem.c
>> +++ b/drivers/block/pmem.c
>> @@ -23,9 +23,16 @@
>>  #include <linux/module.h>
>>  #include <linux/moduleparam.h>
>>  #include <linux/slab.h>
>> +#include <linux/pmem.h>
>>
>>  #define PMEM_MINORS          16
>>
>> +struct pmem_ops {
>> +     void *(*remap)(resource_size_t offset, unsigned long size);
>> +     void (*copy)(void *dst, const void *src, size_t size);
>> +     void (*sync)(void);
>
> What? why the ops at all see below ...
>
>> +};
>> +
>>  struct pmem_device {
>>       struct request_queue    *pmem_queue;
>>       struct gendisk          *pmem_disk;
>> @@ -34,11 +41,54 @@ struct pmem_device {
>>       phys_addr_t             phys_addr;
>>       void                    *virt_addr;
>>       size_t                  size;
>> +     struct pmem_ops         ops;
>>  };
>>
>>  static int pmem_major;
>>  static atomic_t pmem_index;
>>
>> +static void default_pmem_sync(void)
>> +{
>> +     wmb();
>> +}
>> +
>> +static void default_pmem_copy(void *dst, const void *src, size_t size)
>> +{
>> +     memcpy(dst, src, size);
>> +}
>> +
>> +static void pmem_ops_default_init(struct pmem_device *pmem)
>> +{
>> +     /*
>> +      * These defaults seek to offer decent performance and minimize
>> +      * the window between i/o completion and writes being durable on
>> +      * media.  However, it is undefined / architecture specific
>> +      * whether acknowledged data may be lost in transit if a power
>> +      * fail occurs after bio_endio().
>> +      */
>> +     pmem->ops.remap = memremap_wt;
>> +     pmem->ops.copy = default_pmem_copy;
>> +     pmem->ops.sync = default_pmem_sync;
>> +}
>> +
>> +static bool pmem_ops_init(struct pmem_device *pmem)
>> +{
>> +     if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) &&
>> +                     arch_has_persistent_sync()) {
>
> I must be slow and stupid but it looks to me like:
>
> if arch_has_persistent_sync == false then persistent_sync
> can then just be the default above wmb, and
>         if (something_always_off())
>                 do
> Will be always faster then a function pointer. This
> if can be in the generic implementation of persistent_sync
> and be done with.

A few things... that "something_always_off()" check is a cpuid call on
x86.  There's no need to keep asking we can resolve the answer once at
init.  If you can get this indirect branch to show up in a meaningful
way on a profile I'd reconsider, but branch-target-buffers are fairly
good these days.  The wmb() is neither necessary nor sufficient on x86
and it's simply a guess that it helps on other archs().

> Then persistent_copy() can just have an inline generic
> implementation of your memcpy above, and the WARN_ON_ONCE
> or what ever you want.

No, persistent_copy() does not have the proper context to tell you
which device is encountering the problem.  And again, we'd be taking a
conditional branch every i/o for this determination that could be made
once at init.

> And no need for any opt vector and function pointers call out.

There's a third case to consider, driver local persistence guarantees.
ADR platforms need not call pcommit_sfence() to guarantee persistence.
In that case I would expect ADR platform enabling developers to extend
pmem_ops_init() to turn off the warning with ops that are specific to
their platform.  We also have directed flush support through registers
that could be turned into another op option.

> And also for me the all arch_has_persistent_sync() is mute,
> the arches that have members of its family not support some
> fixture can do the if (always_false_or_true) thing and do
> not pollute the global name space. All we need is a single
> switch as above
> #ifdef CONFIG_ARCH_HAS_PMEM_API
>         => arch_persistent_sync
> #else
>         => wmb

wmb() is not a substitute for pcommit, I think it is dangerous to
imply that you can silently fall back to wmb().

> #endif
>
>> +             /*
>> +              * This arch + cpu guarantees that bio_endio() == data
>> +              * durable on media.
>> +              */
>> +             pmem->ops.remap = persistent_remap;
>> +             pmem->ops.copy = persistent_copy;
>> +             pmem->ops.sync = persistent_sync;
>> +             return true;
>> +     }
>> +
>> +     pmem_ops_default_init(pmem);
>> +     return false;
>> +}
>> +
>>  static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
>>                       unsigned int len, unsigned int off, int rw,
>>                       sector_t sector)
>> @@ -51,7 +101,7 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
>>               flush_dcache_page(page);
>>       } else {
>>               flush_dcache_page(page);
>> -             memcpy(pmem->virt_addr + pmem_off, mem + off, len);
>> +             pmem->ops.copy(pmem->virt_addr + pmem_off, mem + off, len);
>>       }
>>
>>       kunmap_atomic(mem);
>> @@ -82,6 +132,8 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
>>               sector += bvec.bv_len >> 9;
>>       }
>>
>> +     if (rw)
>> +             pmem->ops.sync();
>>  out:
>>       bio_endio(bio, err);
>>  }
>> @@ -131,6 +183,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
>>
>>       pmem->phys_addr = res->start;
>>       pmem->size = resource_size(res);
>> +     if (!pmem_ops_init(pmem))
>
> fine just #ifndef CONFIG_ARCH_HAS_PMEM_API

No because the presence of the API is not sufficient to determine
whether persistence durability can be guaranteed.

>> +             dev_warn(dev, "unable to guarantee persistence of writes\n");
>
> But I wouldn't even bother I'd just put a WARN_ON_ONCE inside
> persistent_copy() on any real use pmem or not and be done with it.

Again, we can make this determination once, and not forever for every
future usage.
diff mbox

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73a4d0330ad0..6412d92e6f1e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@  config X86
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
 	select ARCH_HAS_SG_CHAIN
+	select ARCH_HAS_PMEM_API
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b6f7457d12e4..6b8bd5c43bf6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,6 +4,7 @@ 
 /* Caches aren't brain-dead on the intel. */
 #include <asm-generic/cacheflush.h>
 #include <asm/special_insns.h>
+#include <asm/uaccess.h>
 
 /*
  * The set_memory_* API can be used to change various attributes of a virtual
@@ -108,4 +109,27 @@  static inline int rodata_test(void)
 }
 #endif
 
+static inline void arch_persistent_copy(void *dst, const void *src, size_t n)
+{
+	/*
+	 * We are copying between two kernel buffers, if
+	 * __copy_from_user_inatomic_nocache() returns an error (page
+	 * fault) we would have already taken an unhandled fault before
+	 * the BUG_ON.  The BUG_ON is simply here to satisfy
+	 * __must_check and allow reuse of the common non-temporal store
+	 * implementation for persistent_copy().
+	 */
+	BUG_ON(__copy_from_user_inatomic_nocache(dst, src, n));
+}
+
+static inline void arch_persistent_sync(void)
+{
+	wmb();
+	pcommit_sfence();
+}
+
+static inline bool __arch_has_persistent_sync(void)
+{
+	return boot_cpu_has(X86_FEATURE_PCOMMIT);
+}
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 956f2768bdc1..f3c32bb207cf 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -250,6 +250,12 @@  static inline void flush_write_buffers(void)
 #endif
 }
 
+static inline void *arch_persistent_remap(resource_size_t offset,
+	unsigned long size)
+{
+	return (void __force *) ioremap_cache(offset, size);
+}
+
 #endif /* __KERNEL__ */
 
 extern void native_io_delay(void);
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 799acff6bd7c..10cbe557165c 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -23,9 +23,16 @@ 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
+#include <linux/pmem.h>
 
 #define PMEM_MINORS		16
 
+struct pmem_ops {
+	void *(*remap)(resource_size_t offset, unsigned long size);
+	void (*copy)(void *dst, const void *src, size_t size);
+	void (*sync)(void);
+};
+
 struct pmem_device {
 	struct request_queue	*pmem_queue;
 	struct gendisk		*pmem_disk;
@@ -34,11 +41,54 @@  struct pmem_device {
 	phys_addr_t		phys_addr;
 	void			*virt_addr;
 	size_t			size;
+	struct pmem_ops		ops;
 };
 
 static int pmem_major;
 static atomic_t pmem_index;
 
+static void default_pmem_sync(void)
+{
+	wmb();
+}
+
+static void default_pmem_copy(void *dst, const void *src, size_t size)
+{
+	memcpy(dst, src, size);
+}
+
+static void pmem_ops_default_init(struct pmem_device *pmem)
+{
+	/*
+	 * These defaults seek to offer decent performance and minimize
+	 * the window between i/o completion and writes being durable on
+	 * media.  However, it is undefined / architecture specific
+	 * whether acknowledged data may be lost in transit if a power
+	 * fail occurs after bio_endio().
+	 */
+	pmem->ops.remap = memremap_wt;
+	pmem->ops.copy = default_pmem_copy;
+	pmem->ops.sync = default_pmem_sync;
+}
+
+static bool pmem_ops_init(struct pmem_device *pmem)
+{
+	if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) &&
+			arch_has_persistent_sync()) {
+		/*
+		 * This arch + cpu guarantees that bio_endio() == data
+		 * durable on media.
+		 */
+		pmem->ops.remap = persistent_remap;
+		pmem->ops.copy = persistent_copy;
+		pmem->ops.sync = persistent_sync;
+		return true;
+	}
+
+	pmem_ops_default_init(pmem);
+	return false;
+}
+
 static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 			unsigned int len, unsigned int off, int rw,
 			sector_t sector)
@@ -51,7 +101,7 @@  static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 		flush_dcache_page(page);
 	} else {
 		flush_dcache_page(page);
-		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+		pmem->ops.copy(pmem->virt_addr + pmem_off, mem + off, len);
 	}
 
 	kunmap_atomic(mem);
@@ -82,6 +132,8 @@  static void pmem_make_request(struct request_queue *q, struct bio *bio)
 		sector += bvec.bv_len >> 9;
 	}
 
+	if (rw)
+		pmem->ops.sync();
 out:
 	bio_endio(bio, err);
 }
@@ -131,6 +183,8 @@  static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
 
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
+	if (!pmem_ops_init(pmem))
+		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
 	err = -EINVAL;
 	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
@@ -143,7 +197,7 @@  static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
 	 * of the CPU caches in case of a crash.
 	 */
 	err = -ENOMEM;
-	pmem->virt_addr = memremap_wt(pmem->phys_addr, pmem->size);
+	pmem->virt_addr = pmem->ops.remap(pmem->phys_addr, pmem->size);
 	if (!pmem->virt_addr)
 		goto out_release_region;
 
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
new file mode 100644
index 000000000000..e9a63ee1d361
--- /dev/null
+++ b/include/linux/pmem.h
@@ -0,0 +1,93 @@ 
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __PMEM_H__
+#define __PMEM_H__
+
+#include <linux/io.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Architectures that define ARCH_HAS_PMEM_API must provide
+ * implementations for arch_persistent_remap(), arch_persistent_copy(),
+ * arch_persistent_sync(), and __arch_has_persistent_sync().
+ */
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+/**
+ * persistent_remap - map physical persistent memory for pmem api
+ * @offset: physical address of persistent memory
+ * @size: size of the mapping
+ *
+ * Establish a mapping of the architecture specific memory type expected
+ * by persistent_copy() and persistent_sync().  For example, it may be
+ * the case that an uncacheable or writethrough mapping is sufficient,
+ * or a writeback mapping provided persistent_copy() and
+ * persistent_sync() arrange for the data to be written through the
+ * cache to persistent media.
+ */
+static inline void *persistent_remap(resource_size_t offset, unsigned long size)
+{
+	return arch_persistent_remap(offset, size);
+}
+
+/**
+ * persistent_copy - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Perform a memory copy that results in the destination of the copy
+ * being effectively evicted from, or never written to, the processor
+ * cache hierarchy after the copy completes.  After persistent_copy()
+ * data may still reside in cpu or platform buffers, so this operation
+ * must be followed by a persistent_sync().
+ */
+static inline void persistent_copy(void *dst, const void *src, size_t n)
+{
+	arch_persistent_copy(dst, src, n);
+}
+
+/**
+ * persistent_sync - synchronize writes to persistent memory
+ *
+ * After a series of persistent_copy() operations this drains data from
+ * cpu write buffers and any platform (memory controller) buffers to
+ * ensure that written data is durable on persistent memory media.
+ */
+static inline void persistent_sync(void)
+{
+	arch_persistent_sync();
+}
+
+/**
+ * arch_has_persistent_sync - true if persistent_sync() ensures durability
+ *
+ * For a given cpu implementation within an architecture it is possible
+ * that persistent_sync() resolves to a nop.  In the case this returns
+ * false, pmem api users are unable to ensure durabilty and may want to
+ * fall back to a different data consistency model, or otherwise notify
+ * the user.
+ */
+static inline bool arch_has_persistent_sync(void)
+{
+	return __arch_has_persistent_sync();
+}
+#else
+/* undefined symbols */
+extern void *persistent_remap(resource_size_t offet, unsigned long size);
+extern void persistent_copy(void *dst, const void *src, size_t n);
+extern void persistent_sync(void);
+extern bool arch_has_persistent_sync(void);
+#endif /* CONFIG_ARCH_HAS_PMEM_API */
+
+#endif /* __PMEM_H__ */
diff --git a/lib/Kconfig b/lib/Kconfig
index 601965a948e8..e6a3c892d514 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -522,4 +522,7 @@  source "lib/fonts/Kconfig"
 config ARCH_HAS_SG_CHAIN
 	def_bool n
 
+config ARCH_HAS_PMEM_API
+	def_bool n
+
 endmenu