[PATCHV2,3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks

Message ID 23b2515da9d06b198044ad83ca0a15ba38c24e6e.1449861203.git.tony.luck@intel.com (mailing list archive)
State Superseded

Commit Message

Luck, Tony Dec. 11, 2015, 12:21 a.m. UTC
Using __copy_user_nocache() as inspiration, create a memory copy
routine for use by kernel code, with annotations that allow
recovery from machine checks.

Notes:
1) Unlike the original we make no attempt to copy all the bytes
   up to the faulting address. The original achieves that by
   re-executing the failing part as a byte-by-byte copy,
   which will take another page fault. We don't want to have
   a second machine check!
2) Likewise, the return value of the original indicates exactly
   how many bytes were not copied. Instead we provide the physical
   address of the fault (thanks to help from do_machine_check()).
3) Provide helpful macros to decode the return value.
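
For illustration only (not part of this patch), a caller might decode
the return value with the new macros roughly as below; the wrapper name
and the -EIO policy are made up for the example:

	/*
	 * Sketch: mcsafe_memcpy() returns 0 on success.  If a machine
	 * check was hit while reading the source, bit 63 is set and the
	 * low bits hold the physical address that was poisoned.
	 */
	static int example_copy(void *dst, const void __user *src, unsigned size)
	{
		u64 ret = mcsafe_memcpy(dst, src, size);

		if (!COPY_HAD_MCHECK(ret))
			return 0;
		pr_err("uncorrected error at paddr %llx\n",
		       (unsigned long long)COPY_MCHECK_PADDR(ret));
		return -EIO;
	}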

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/uaccess_64.h |  5 +++
 arch/x86/kernel/x8664_ksyms_64.c  |  2 +
 arch/x86/lib/copy_user_64.S       | 91 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+)

Comments

Andy Lutomirski Dec. 11, 2015, 8:09 p.m. UTC | #1
On Thu, Dec 10, 2015 at 4:21 PM, Tony Luck <tony.luck@intel.com> wrote:
> Using __copy_user_nocache() as inspiration create a memory copy
> routine for use by kernel code with annotations to allow for
> recovery from machine checks.
>
> Notes:
> 1) Unlike the original we make no attempt to copy all the bytes
>    up to the faulting address. The original achieves that by
>    re-executing the failing part as a byte-by-byte copy,
>    which will take another page fault. We don't want to have
>    a second machine check!
> 2) Likewise the return value for the original indicates exactly
>    how many bytes were not copied. Instead we provide the physical
>    address of the fault (thanks to help from do_machine_check()
> 3) Provide helpful macros to decode the return value.
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/uaccess_64.h |  5 +++
>  arch/x86/kernel/x8664_ksyms_64.c  |  2 +
>  arch/x86/lib/copy_user_64.S       | 91 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 98 insertions(+)
>
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index f2f9b39b274a..779cb0e77ecc 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -216,6 +216,11 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
>  extern long __copy_user_nocache(void *dst, const void __user *src,
>                                 unsigned size, int zerorest);
>
> +extern u64 mcsafe_memcpy(void *dst, const void __user *src,
> +                               unsigned size);
> +#define COPY_HAD_MCHECK(ret)   ((ret) & BIT(63))
> +#define        COPY_MCHECK_PADDR(ret)  ((ret) & ~BIT(63))
> +
>  static inline int
>  __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
>  {
> diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
> index a0695be19864..ec988c92c055 100644
> --- a/arch/x86/kernel/x8664_ksyms_64.c
> +++ b/arch/x86/kernel/x8664_ksyms_64.c
> @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
>  EXPORT_SYMBOL(_copy_from_user);
>  EXPORT_SYMBOL(_copy_to_user);
>
> +EXPORT_SYMBOL(mcsafe_memcpy);
> +
>  EXPORT_SYMBOL(copy_page);
>  EXPORT_SYMBOL(clear_page);
>
> diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
> index 982ce34f4a9b..ffce93cbc9a5 100644
> --- a/arch/x86/lib/copy_user_64.S
> +++ b/arch/x86/lib/copy_user_64.S
> @@ -319,3 +319,94 @@ ENTRY(__copy_user_nocache)
>         _ASM_EXTABLE(21b,50b)
>         _ASM_EXTABLE(22b,50b)
>  ENDPROC(__copy_user_nocache)
> +
> +/*
> + * mcsafe_memcpy - Uncached memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + * This will force destination/source out of cache for more performance.
> + */
> +ENTRY(mcsafe_memcpy)
> +       cmpl $8,%edx
> +       jb 20f          /* less than 8 bytes, go to byte copy loop */
> +
> +       /* check for bad alignment of destination */
> +       movl %edi,%ecx
> +       andl $7,%ecx
> +       jz 102f                         /* already aligned */
> +       subl $8,%ecx
> +       negl %ecx
> +       subl %ecx,%edx
> +0:     movb (%rsi),%al
> +       movb %al,(%rdi)
> +       incq %rsi
> +       incq %rdi
> +       decl %ecx
> +       jnz 0b
> +102:
> +       movl %edx,%ecx
> +       andl $63,%edx
> +       shrl $6,%ecx
> +       jz 17f
> +1:     movq (%rsi),%r8
> +2:     movq 1*8(%rsi),%r9
> +3:     movq 2*8(%rsi),%r10
> +4:     movq 3*8(%rsi),%r11
> +       movnti %r8,(%rdi)
> +       movnti %r9,1*8(%rdi)
> +       movnti %r10,2*8(%rdi)
> +       movnti %r11,3*8(%rdi)
> +9:     movq 4*8(%rsi),%r8
> +10:    movq 5*8(%rsi),%r9
> +11:    movq 6*8(%rsi),%r10
> +12:    movq 7*8(%rsi),%r11
> +       movnti %r8,4*8(%rdi)
> +       movnti %r9,5*8(%rdi)
> +       movnti %r10,6*8(%rdi)
> +       movnti %r11,7*8(%rdi)
> +       leaq 64(%rsi),%rsi
> +       leaq 64(%rdi),%rdi
> +       decl %ecx
> +       jnz 1b
> +17:    movl %edx,%ecx
> +       andl $7,%edx
> +       shrl $3,%ecx
> +       jz 20f
> +18:    movq (%rsi),%r8
> +       movnti %r8,(%rdi)
> +       leaq 8(%rsi),%rsi
> +       leaq 8(%rdi),%rdi
> +       decl %ecx
> +       jnz 18b
> +20:    andl %edx,%edx
> +       jz 23f
> +       movl %edx,%ecx
> +21:    movb (%rsi),%al
> +       movb %al,(%rdi)
> +       incq %rsi
> +       incq %rdi
> +       decl %ecx
> +       jnz 21b
> +23:    xorl %eax,%eax
> +       sfence
> +       ret
> +
> +       .section .fixup,"ax"
> +30:
> +       sfence
> +       /* do_machine_check() sets %eax return value */
> +       ret
> +       .previous
> +
> +       _ASM_MCEXTABLE(0b,30b)
> +       _ASM_MCEXTABLE(1b,30b)
> +       _ASM_MCEXTABLE(2b,30b)
> +       _ASM_MCEXTABLE(3b,30b)
> +       _ASM_MCEXTABLE(4b,30b)
> +       _ASM_MCEXTABLE(9b,30b)
> +       _ASM_MCEXTABLE(10b,30b)
> +       _ASM_MCEXTABLE(11b,30b)
> +       _ASM_MCEXTABLE(12b,30b)
> +       _ASM_MCEXTABLE(18b,30b)
> +       _ASM_MCEXTABLE(21b,30b)
> +ENDPROC(mcsafe_memcpy)

I still don't get the BIT(63) thing.  Can you explain it?

--Andy
Luck, Tony Dec. 11, 2015, 9:19 p.m. UTC | #2
> I still don't get the BIT(63) thing.  Can you explain it?

It will be more obvious when I get around to writing copy_from_user().

Then we will have a function that can take page faults if there are pages
that are not present.  If the page faults can't be fixed we have a -EFAULT
condition. We can also take machine checks if we read from a location with an
uncorrected error.

We need to distinguish these two cases because the action we take is
different. For the unresolved page fault we already have the ABI that the
copy_to/from_user() functions return zero for success, and a non-zero
return is the number of not-copied bytes.

So for my new case I'm setting bit63 ... this is never going to be set for
a failed page fault.

copy_from_user() conceptually will look like this:

int copy_from_user(void *to, void *from, unsigned long n)
{
	u64 ret = mcsafe_memcpy(to, from, n);

	if (COPY_HAD_MCHECK(ret)) {
		if (memory_failure(COPY_MCHECK_PADDR(ret) >> PAGE_SHIFT, ...))
			force_sig(SIGBUS, current);
		return something;
	} else
		return ret;
}

-Tony
Konrad Rzeszutek Wilk Dec. 11, 2015, 9:32 p.m. UTC | #3
On Fri, Dec 11, 2015 at 09:19:17PM +0000, Luck, Tony wrote:
> > I still don't get the BIT(63) thing.  Can you explain it?
> 
> It will be more obvious when I get around to writing copy_from_user().
> 
> Then we will have a function that can take page faults if there are pages
> that are not present.  If the page faults can't be fixed we have a -EFAULT
> condition. We can also take machine checks if we reads from a location with an
> uncorrected error.
> 
> We need to distinguish these two cases because the action we take is
> different. For the unresolved page fault we already have the ABI that the
> copy_to/from_user() functions return zero for success, and a non-zero
> return is the number of not-copied bytes.
> 
> So for my new case I'm setting bit63 ... this is never going to be set for
> a failed page fault.

Isn't 63 NX?

> 
> copy_from_user() conceptually will look like this:
> 
> int copy_from_user(void *to, void *from, unsigned long n)
> {
> 	u64 ret = mcsafe_memcpy(to, from, n);
> 
> 	if (COPY_HAD_MCHECK(r)) {
> 		if (memory_failure(COPY_MCHECK_PADDR(ret) >> PAGE_SIZE, ...))
> 			force_sig(SIGBUS, current);
> 		return something;
> 	} else
> 		return ret;
> }
> 
> -Tony
Andy Lutomirski Dec. 11, 2015, 9:50 p.m. UTC | #4
On Fri, Dec 11, 2015 at 1:19 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> I still don't get the BIT(63) thing.  Can you explain it?
>
> It will be more obvious when I get around to writing copy_from_user().
>
> Then we will have a function that can take page faults if there are pages
> that are not present.  If the page faults can't be fixed we have a -EFAULT
> condition. We can also take machine checks if we reads from a location with an
> uncorrected error.
>
> We need to distinguish these two cases because the action we take is
> different. For the unresolved page fault we already have the ABI that the
> copy_to/from_user() functions return zero for success, and a non-zero
> return is the number of not-copied bytes.

I'm missing something, though.  The normal fixup_exception path
doesn't touch rax at all.  The memory_failure path does.  But couldn't
you distinguish them by just pointing the exception handlers at
different landing pads?

Also, would it be more straightforward if the mcexception landing pad
looked up the va -> pa mapping by itself?  Or is that somehow not
reliable?

--Andy
Luck, Tony Dec. 11, 2015, 10:17 p.m. UTC | #5
> I'm missing something, though.  The normal fixup_exception path
> doesn't touch rax at all.  The memory_failure path does.  But couldn't
> you distinguish them by just pointing the exception handlers at
> different landing pads?

Perhaps I'm just trying to take a short cut to avoid writing
some clever fixup code for the target ip that goes into the
exception table.

For __copy_user_nocache() we have four possible targets
for fixup depending on where we were in the function.

        .section .fixup,"ax"
30:     shll $6,%ecx
        addl %ecx,%edx
        jmp 60f
40:     lea (%rdx,%rcx,8),%rdx
        jmp 60f
50:     movl %ecx,%edx
60:     sfence
        jmp copy_user_handle_tail
        .previous

Note that this code also takes a shortcut
by jumping to copy_user_handle_tail() to
finish up the copy a byte at a time ... and
running back into the same page fault a 2nd
time to make sure the byte count is exactly
right.
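
For readers who haven't seen it, that tail handler conceptually looks
like the sketch below (an illustration, not the kernel's actual
copy_user_handle_tail() code or signature):

	/*
	 * Finish the copy one byte at a time; the access that faulted
	 * before will fault again here, which is how the exact count of
	 * uncopied bytes is produced.
	 */
	static unsigned long tail_sketch(char *to, const char *from, unsigned long len)
	{
		char c;

		for (; len; to++, from++, len--) {
			if (__get_user(c, from) || __put_user(c, to))
				break;
		}
		return len;	/* bytes NOT copied */
	}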

I really, really, don't want to run back into
the poison again.  It would probably work, but
because current generation Intel cpus broadcast machine
checks to every logical cpu, it is a lot of overhead,
and potentially risky.

> Also, would it be more straightforward if the mcexception landing pad
> looked up the va -> pa mapping by itself?  Or is that somehow not
> reliable?

If we did get all the above right, then we could have
target use virt_to_phys() to convert to physical ...
I don't see that this part would be a problem.

-Tony
Dan Williams Dec. 11, 2015, 10:20 p.m. UTC | #6
On Fri, Dec 11, 2015 at 2:17 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> Also, would it be more straightforward if the mcexception landing pad
>> looked up the va -> pa mapping by itself?  Or is that somehow not
>> reliable?
>
> If we did get all the above right, then we could have
> target use virt_to_phys() to convert to physical ...
> I don't see that this part would be a problem.

virt_to_phys() implies a linear address.  In the case of the use in
the pmem driver we'll be using an ioremap()'d address off somewhere in
vmalloc space.
Andy Lutomirski Dec. 11, 2015, 10:26 p.m. UTC | #7
On Fri, Dec 11, 2015 at 2:20 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Fri, Dec 11, 2015 at 2:17 PM, Luck, Tony <tony.luck@intel.com> wrote:
>>> Also, would it be more straightforward if the mcexception landing pad
>>> looked up the va -> pa mapping by itself?  Or is that somehow not
>>> reliable?
>>
>> If we did get all the above right, then we could have
>> target use virt_to_phys() to convert to physical ...
>> I don't see that this part would be a problem.
>
> virt_to_phys() implies a linear address.  In the case of the use in
> the pmem driver we'll be using an ioremap()'d address off somewherein
> vmalloc space.

There's always slow_virt_to_phys.
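
A rough sketch of that idea (illustrative only: the helper name is made
up, and whether it is safe to walk page tables from the fixup path is an
assumption, not something established here):

	#include <linux/bitops.h>		/* BIT() */
	#include <asm/pgtable_types.h>		/* slow_virt_to_phys() */

	/*
	 * Derive the return value from the faulting virtual address in a
	 * C-level fixup helper, instead of having do_machine_check() pass
	 * the physical address in %rax.  slow_virt_to_phys() walks the
	 * page tables, so unlike virt_to_phys() it also handles the
	 * ioremap()/vmalloc mappings the pmem driver uses.
	 */
	static u64 mcsafe_fault_retval(void *fault_va)
	{
		return BIT(63) | (u64)slow_virt_to_phys(fault_va);
	}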

Note that I don't fundamentally object to passing the pa to the fixup
handler.  I just think we should try to disentangle that from figuring
out what exactly the failure was.

Also, are there really PCOMMIT-capable CPUs that still forcibly
broadcast MCE?  If so, that's unfortunate.

--Andy
Luck, Tony Dec. 11, 2015, 10:35 p.m. UTC | #8
> Also, are there really PCOMMIT-capable CPUs that still forcibly
> broadcast MCE?  If, so, that's unfortunate.

PCOMMIT and LMCE arrive together ... though BIOS is in the decision
path to enable LMCE, so it is possible that some systems could still
broadcast if the BIOS writer decides to not allow local.

But a machine check safe copy_from_user() would be useful on
current generation cpus that broadcast all the time.

-Tony
Andy Lutomirski Dec. 11, 2015, 10:38 p.m. UTC | #9
On Fri, Dec 11, 2015 at 2:35 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> Also, are there really PCOMMIT-capable CPUs that still forcibly
>> broadcast MCE?  If, so, that's unfortunate.
>
> PCOMMIT and LMCE arrive together ... though BIOS is in the decision
> path to enable LMCE, so it is possible that some systems could still
> broadcast if the BIOS writer decides to not allow local.

I really wish Intel would stop doing that.

>
> But a machine check safe copy_from_user() would be useful
> current generation cpus that broadcast all the time.

Fair enough.

--Andy
Luck, Tony Dec. 11, 2015, 10:45 p.m. UTC | #10
>> But a machine check safe copy_from_user() would be useful
>> current generation cpus that broadcast all the time.
>
> Fair enough.

Thanks for spending the time to look at this.  Coaxing me to re-write the
tail of do_machine_check() has made that code much better. Too many
years of one patch on top of another without looking at the whole context.

Cogitate on this series over the weekend and see if you can give me
an Acked-by or Reviewed-by (I'll be adding a #define for BIT(63)).

-Tony
Andy Lutomirski Dec. 11, 2015, 10:55 p.m. UTC | #11
On Fri, Dec 11, 2015 at 2:45 PM, Luck, Tony <tony.luck@intel.com> wrote:
>>> But a machine check safe copy_from_user() would be useful
>>> current generation cpus that broadcast all the time.
>>
>> Fair enough.
>
> Thanks for spending the time to look at this.  Coaxing me to re-write the
> tail of do_machine_check() has made that code much better. Too many
> years of one patch on top of another without looking at the whole context.
>
> Cogitate on this series over the weekend and see if you can give me
> an Acked-by or Reviewed-by (I'll be adding a #define for BIT(63)).

I can't review the MCE decoding part, because I don't understand it
nearly well enough.  The interaction with the core fault handling
looks fine, modulo any need to bikeshed on the macro naming (which
I'll refrain from doing).

I still think it would be better if you get rid of BIT(63) and use a
pair of landing pads, though.  They could be as simple as:

.Lpage_fault_goes_here:
    xorq %rax, %rax
    jmp .Lbad

.Lmce_goes_here:
    /* set high bit of rax or whatever */
    /* fall through */

.Lbad:
    /* deal with it */

That way the magic is isolated to the function that needs the magic.

Also, at least renaming the macro to EXTABLE_MC_PA_IN_AX might be
nice.  It'll keep future users honest.  Maybe some day there'll be a
PA_IN_AX flag, and, heck, maybe some day there'll be ways to get info
for non-MCE faults delivered through fixup_exception.

--Andy
Ingo Molnar Dec. 14, 2015, 8:36 a.m. UTC | #12
* Andy Lutomirski <luto@amacapital.net> wrote:

> I still think it would be better if you get rid of BIT(63) and use a
> pair of landing pads, though.  They could be as simple as:
> 
> .Lpage_fault_goes_here:
>     xorq %rax, %rax
>     jmp .Lbad
> 
> .Lmce_goes_here:
>     /* set high bit of rax or whatever */
>     /* fall through */
> 
> .Lbad:
>     /* deal with it */
> 
> That way the magic is isolated to the function that needs the magic.

Seconded - this is the usual pattern we use in all assembly functions.

Thanks,

	Ingo
Luck, Tony Dec. 14, 2015, 7:46 p.m. UTC | #13
On Mon, Dec 14, 2015 at 09:36:25AM +0100, Ingo Molnar wrote:
> >     /* deal with it */
> > 
> > That way the magic is isolated to the function that needs the magic.
> 
> Seconded - this is the usual pattern we use in all assembly functions.

Ok - you want me to write some x86 assembly code (you may regret that).

Initial question ... here's the fixup for __copy_user_nocache()

		.section .fixup,"ax"
	30:     shll $6,%ecx
		addl %ecx,%edx
		jmp 60f
	40:     lea (%rdx,%rcx,8),%rdx
		jmp 60f
	50:     movl %ecx,%edx
	60:     sfence
		jmp copy_user_handle_tail
		.previous

Are %ecx and %rcx synonyms for the same register? Is there some
super subtle reason we use the 'r' names in the "40" fixup, but
the 'e' names everywhere else in this code (and the 'e' names in
the body of the original function)?

-Tony
Andy Lutomirski Dec. 14, 2015, 8:11 p.m. UTC | #14
On Mon, Dec 14, 2015 at 11:46 AM, Luck, Tony <tony.luck@intel.com> wrote:
> On Mon, Dec 14, 2015 at 09:36:25AM +0100, Ingo Molnar wrote:
>> >     /* deal with it */
>> >
>> > That way the magic is isolated to the function that needs the magic.
>>
>> Seconded - this is the usual pattern we use in all assembly functions.
>
> Ok - you want me to write some x86 assembly code (you may regret that).
>

All you have to do is erase all of the ia64 asm knowledge from your
brain and repurpose 1% of that space for x86 asm.  You'll be a
world-class expert!

> Initial question ... here's the fixup for __copy_user_nocache()
>
>                 .section .fixup,"ax"
>         30:     shll $6,%ecx
>                 addl %ecx,%edx
>                 jmp 60f
>         40:     lea (%rdx,%rcx,8),%rdx
>                 jmp 60f
>         50:     movl %ecx,%edx
>         60:     sfence
>                 jmp copy_user_handle_tail
>                 .previous
>
> Are %ecx and %rcx synonyms for the same register? Is there some
> super subtle reason we use the 'r' names in the "40" fixup, but
> the 'e' names everywhere else in this code (and the 'e' names in
> the body of the original function)?

rcx is a 64-bit register.  ecx is the low 32 bits of it.  If you read
from ecx, you get the low 32 bits, but if you write to ecx, you zero
the high bits as a side-effect.
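
A tiny user-space illustration of that zero-extension behaviour (plain
C with inline asm, nothing kernel specific):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t out;

		/* Writing the 32-bit alias %ecx clears bits 63:32 of %rcx. */
		asm("movabsq $0xdeadbeefcafef00d, %%rcx\n\t"
		    "movl $1, %%ecx\n\t"
		    "movq %%rcx, %0"
		    : "=r" (out) : : "rcx");

		printf("%#llx\n", (unsigned long long)out);	/* prints 0x1 */
		return 0;
	}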

--Andy
Borislav Petkov Dec. 15, 2015, 1:11 p.m. UTC | #15
On Thu, Dec 10, 2015 at 04:21:50PM -0800, Tony Luck wrote:
> Using __copy_user_nocache() as inspiration create a memory copy
> routine for use by kernel code with annotations to allow for
> recovery from machine checks.
> 
> Notes:
> 1) Unlike the original we make no attempt to copy all the bytes
>    up to the faulting address. The original achieves that by
>    re-executing the failing part as a byte-by-byte copy,
>    which will take another page fault. We don't want to have
>    a second machine check!
> 2) Likewise the return value for the original indicates exactly
>    how many bytes were not copied. Instead we provide the physical
>    address of the fault (thanks to help from do_machine_check()
> 3) Provide helpful macros to decode the return value.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/uaccess_64.h |  5 +++
>  arch/x86/kernel/x8664_ksyms_64.c  |  2 +
>  arch/x86/lib/copy_user_64.S       | 91 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 98 insertions(+)

...

> + * mcsafe_memcpy - Uncached memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + * This will force destination/source out of cache for more performance.

... and the non-temporal version is the optimal one even though we're
defaulting to copy_user_enhanced_fast_string for memcpy on modern Intel
CPUs...?

Btw, it should also be inside an ifdef if we're going to ifdef
CONFIG_MCE_KERNEL_RECOVERY everywhere else.
Dan Williams Dec. 15, 2015, 5:45 p.m. UTC | #16
On Tue, Dec 15, 2015 at 5:11 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Thu, Dec 10, 2015 at 04:21:50PM -0800, Tony Luck wrote:
>> Using __copy_user_nocache() as inspiration create a memory copy
>> routine for use by kernel code with annotations to allow for
>> recovery from machine checks.
>>
>> Notes:
>> 1) Unlike the original we make no attempt to copy all the bytes
>>    up to the faulting address. The original achieves that by
>>    re-executing the failing part as a byte-by-byte copy,
>>    which will take another page fault. We don't want to have
>>    a second machine check!
>> 2) Likewise the return value for the original indicates exactly
>>    how many bytes were not copied. Instead we provide the physical
>>    address of the fault (thanks to help from do_machine_check()
>> 3) Provide helpful macros to decode the return value.
>>
>> Signed-off-by: Tony Luck <tony.luck@intel.com>
>> ---
>>  arch/x86/include/asm/uaccess_64.h |  5 +++
>>  arch/x86/kernel/x8664_ksyms_64.c  |  2 +
>>  arch/x86/lib/copy_user_64.S       | 91 +++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 98 insertions(+)
>
> ...
>
>> + * mcsafe_memcpy - Uncached memory copy with machine check exception handling
>> + * Note that we only catch machine checks when reading the source addresses.
>> + * Writes to target are posted and don't generate machine checks.
>> + * This will force destination/source out of cache for more performance.
>
> ... and the non-temporal version is the optimal one even though we're
> defaulting to copy_user_enhanced_fast_string for memcpy on modern Intel
> CPUs...?

At least the pmem driver use case does not want caching of the
source-buffer since that is the raw "disk" media.  I.e. in
pmem_do_bvec() we'd use this to implement memcpy_from_pmem().
However, caching the destination-buffer may prove beneficial since
that data is likely to be consumed immediately by the thread that
submitted the i/o.
Luck, Tony Dec. 15, 2015, 5:53 p.m. UTC | #17
>> ... and the non-temporal version is the optimal one even though we're
>> defaulting to copy_user_enhanced_fast_string for memcpy on modern Intel
>> CPUs...?

My current generation cpu has a bit of an issue with recovering from a
machine check in a "rep mov" ... so I'm working with a version of memcpy
that unrolls into individual mov instructions for now.

> At least the pmem driver use case does not want caching of the
> source-buffer since that is the raw "disk" media.  I.e. in
> pmem_do_bvec() we'd use this to implement memcpy_from_pmem().
> However, caching the destination-buffer may prove beneficial since
> that data is likely to be consumed immediately by the thread that
> submitted the i/o.

I can drop the "nti" from the destination moves.  Does "nti" work
on the load from source address side to avoid cache allocation?

On another topic raised by Boris ... is there some CONFIG_PMEM*
that I should use as a dependency to enable all this?

-Tony
Borislav Petkov Dec. 15, 2015, 6:21 p.m. UTC | #18
On Tue, Dec 15, 2015 at 05:53:31PM +0000, Luck, Tony wrote:
> My current generation cpu has a bit of an issue with recovering from a
> machine check in a "rep mov" ... so I'm working with a version of memcpy
> that unrolls into individual mov instructions for now.

Ah.

> I can drop the "nti" from the destination moves.  Does "nti" work
> on the load from source address side to avoid cache allocation?

I don't think so:

+1:     movq (%rsi),%r8
+2:     movq 1*8(%rsi),%r9
+3:     movq 2*8(%rsi),%r10
+4:     movq 3*8(%rsi),%r11
...

You need to load the data into registers first because MOVNTI needs them
there as it does reg -> mem movement. That first load from memory into
registers with a normal MOV will pull the data into the cache.

Perhaps the first thing to try would be to see what slowdown normal MOVs
bring and if not really noticeable, use those instead.

> On another topic raised by Boris ... is there some CONFIG_PMEM*
> that I should use as a dependency to enable all this?

I found CONFIG_LIBNVDIMM only today:

drivers/nvdimm/Kconfig:1:menuconfig LIBNVDIMM
drivers/nvdimm/Kconfig:2:       tristate "NVDIMM (Non-Volatile Memory Device) Support"
Dan Williams Dec. 15, 2015, 6:27 p.m. UTC | #19
On Tue, Dec 15, 2015 at 9:53 AM, Luck, Tony <tony.luck@intel.com> wrote:
>>> ... and the non-temporal version is the optimal one even though we're
>>> defaulting to copy_user_enhanced_fast_string for memcpy on modern Intel
>>> CPUs...?
>
> My current generation cpu has a bit of an issue with recovering from a
> machine check in a "rep mov" ... so I'm working with a version of memcpy
> that unrolls into individual mov instructions for now.
>
>> At least the pmem driver use case does not want caching of the
>> source-buffer since that is the raw "disk" media.  I.e. in
>> pmem_do_bvec() we'd use this to implement memcpy_from_pmem().
>> However, caching the destination-buffer may prove beneficial since
>> that data is likely to be consumed immediately by the thread that
>> submitted the i/o.
>
> I can drop the "nti" from the destination moves.  Does "nti" work
> on the load from source address side to avoid cache allocation?

My mistake, I don't think we have an uncached load capability, only store.

> On another topic raised by Boris ... is there some CONFIG_PMEM*
> that I should use as a dependency to enable all this?

I'd rather make this a "select ARCH_MCSAFE_MEMCPY", since it's not a
hard dependency and the details will be hidden behind
memcpy_from_pmem().  Specifically, the details will be handled by a
new arch_memcpy_from_pmem() in arch/x86/include/asm/pmem.h to
supplement the existing arch_memcpy_to_pmem().
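
A hedged sketch of what such a wrapper might look like (the Kconfig
symbol, error code and exact naming here are assumptions for
illustration, not something from a posted patch):

	/* arch/x86/include/asm/pmem.h, illustrative sketch only;
	 * assumes the COPY_HAD_MCHECK()/mcsafe_memcpy() additions
	 * from uaccess_64.h in this series. */
	static inline int arch_memcpy_from_pmem(void *dst, const void *src,
						size_t n)
	{
	#ifdef CONFIG_ARCH_MCSAFE_MEMCPY	/* assumed symbol */
		u64 ret = mcsafe_memcpy(dst, (const void __user *)src, n);

		if (COPY_HAD_MCHECK(ret))
			return -EIO;	/* caller reports a media error */
		return 0;
	#else
		memcpy(dst, src, n);
		return 0;
	#endif
	}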
Dan Williams Dec. 15, 2015, 6:35 p.m. UTC | #20
On Tue, Dec 15, 2015 at 10:27 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Dec 15, 2015 at 9:53 AM, Luck, Tony <tony.luck@intel.com> wrote:
>>>> ... and the non-temporal version is the optimal one even though we're
>>>> defaulting to copy_user_enhanced_fast_string for memcpy on modern Intel
>>>> CPUs...?
>>
>> My current generation cpu has a bit of an issue with recovering from a
>> machine check in a "rep mov" ... so I'm working with a version of memcpy
>> that unrolls into individual mov instructions for now.
>>
>>> At least the pmem driver use case does not want caching of the
>>> source-buffer since that is the raw "disk" media.  I.e. in
>>> pmem_do_bvec() we'd use this to implement memcpy_from_pmem().
>>> However, caching the destination-buffer may prove beneficial since
>>> that data is likely to be consumed immediately by the thread that
>>> submitted the i/o.
>>
>> I can drop the "nti" from the destination moves.  Does "nti" work
>> on the load from source address side to avoid cache allocation?
>
> My mistake, I don't think we have an uncached load capability, only store.

Correction we have MOVNTDQA, but that requires saving the fpu state
and marking the memory as WC, i.e. probably not worth it.
Borislav Petkov Dec. 15, 2015, 6:39 p.m. UTC | #21
On Tue, Dec 15, 2015 at 10:35:49AM -0800, Dan Williams wrote:
> Correction we have MOVNTDQA, but that requires saving the fpu state
> and marking the memory as WC, i.e. probably not worth it.

Not really. Last time I tried an SSE3 memcpy in the kernel like glibc
does, it wasn't worth it. The enhanced REP; MOVSB is hands down faster.
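
For reference, a "REP; MOVSB" memcpy boils down to the sketch below;
the kernel's memcpy_erms variant does effectively this in assembly
(sketch only, not kernel code):

	static void *rep_movsb_copy(void *dst, const void *src, unsigned long n)
	{
		void *ret = dst;

		/* one instruction; microcode picks the optimal chunking */
		asm volatile("rep movsb"
			     : "+D" (dst), "+S" (src), "+c" (n)
			     : : "memory");
		return ret;
	}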
Elliott, Robert (Servers) Dec. 15, 2015, 7:19 p.m. UTC | #22
> -----Original Message-----
> From: Linux-nvdimm [mailto:linux-nvdimm-bounces@lists.01.org] On Behalf
> Of Borislav Petkov
> Sent: Tuesday, December 15, 2015 12:39 PM
> To: Dan Williams <dan.j.williams@intel.com>
> Cc: Luck, Tony <tony.luck@intel.com>; linux-nvdimm <linux-
> nvdimm@ml01.01.org>; X86 ML <x86@kernel.org>; linux-
> kernel@vger.kernel.org; Linux MM <linux-mm@kvack.org>; Andy Lutomirski
> <luto@kernel.org>; Andrew Morton <akpm@linux-foundation.org>; Ingo Molnar
> <mingo@kernel.org>
> Subject: Re: [PATCHV2 3/3] x86, ras: Add mcsafe_memcpy() function to
> recover from machine checks
> 
> On Tue, Dec 15, 2015 at 10:35:49AM -0800, Dan Williams wrote:
> > Correction we have MOVNTDQA, but that requires saving the fpu state
> > and marking the memory as WC, i.e. probably not worth it.
> 
> Not really. Last time I tried an SSE3 memcpy in the kernel like glibc
> does, it wasn't worth it. The enhanced REP; MOVSB is hands down faster.

Reading from NVDIMM, rep movsb is efficient, but it 
fills the CPU caches with the NVDIMM addresses. For
large data moves (not uncommon for storage) this
will crowd out more important cacheable data.

For normal block device reads made through the pmem
block device driver, this CPU cache consumption is
wasteful, since it is unlikely the application will
ask pmem to read the same addresses anytime soon.
Due to the historic long latency of storage devices,
applications don't re-read from storage again; they
save the results.  So, the streaming-load
instructions are beneficial:
* movntdqa (16-byte xmm registers) 
* vmovntdqa (32-byte ymm registers)
* vmovntdqa (64-byte zmm registers)

Dan Williams wrote:
> Correction we have MOVNTDQA, but that requires
> saving the fpu state and marking the memory as WC
> i.e. probably not worth it.

Although the WC memory type is described in the SDM
in the most detail:
    "An implementation may also make use of the
    non-temporal hint associated with this instruction
    if the memory source is WB (write back) memory
    type. ... may optimize cache reads generated by 
    (V)MOVNTDQA on WB memory type to reduce cache 
    evictions."

For applications doing loads from mmap() DAX memory, 
the CPU cache usage could be worthwhile, because
applications expect mmap() regions to consist of
traditional writeback-cached memory and might do
lots of loads/stores.

Writing to the NVDIMM requires either:
* non-temporal stores; or
* normal stores + cache flushes + fences

movnti is OK for small transfers, but these are
better for bulk moves:
* movntdq (16-byte xmm registers)
* vmovntdq (32-byte ymm registers)
* vmovntdq (64-byte zmm registers)
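
As a deliberately simplified illustration of the second option above
(normal stores, then cache-line flushes and a fence; the hard-coded
64-byte line size and plain CLFLUSH instead of CLFLUSHOPT/CLWB are
simplifications for the example):

	static void flush_after_stores(void *addr, unsigned long len)
	{
		char *p = (char *)((unsigned long)addr & ~63UL);
		char *end = (char *)addr + len;

		/* push the just-written lines out of the cache hierarchy */
		for (; p < end; p += 64)
			asm volatile("clflush %0" : "+m" (*p));
		asm volatile("sfence" ::: "memory");
	}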

---
Robert Elliott, HPE Persistent Memory
Borislav Petkov Dec. 15, 2015, 7:28 p.m. UTC | #23
On Tue, Dec 15, 2015 at 07:19:58PM +0000, Elliott, Robert (Persistent Memory) wrote:

...

> Due to the historic long latency of storage devices,
> applications don't re-read from storage again; they
> save the results.
> So, the streaming-load instructions are beneficial:

That's the theory...

Do you also have some actual performance numbers where non-temporal
operations are better than the REP; MOVSB and *actually* show
improvements? And no microbenchmarks please.

Thanks.
Elliott, Robert (Servers) Dec. 15, 2015, 8:25 p.m. UTC | #24
---
Robert Elliott, HPE Persistent Memory


> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Tuesday, December 15, 2015 1:29 PM
> To: Elliott, Robert (Persistent Memory) <elliott@hpe.com>
> Cc: Dan Williams <dan.j.williams@intel.com>; Luck, Tony
> <tony.luck@intel.com>; linux-nvdimm <linux-nvdimm@ml01.01.org>; X86 ML
> <x86@kernel.org>; linux-kernel@vger.kernel.org; Linux MM <linux-
> mm@kvack.org>; Andy Lutomirski <luto@kernel.org>; Andrew Morton
> <akpm@linux-foundation.org>; Ingo Molnar <mingo@kernel.org>
> Subject: Re: [PATCHV2 3/3] x86, ras: Add mcsafe_memcpy() function to
> recover from machine checks
> 
> On Tue, Dec 15, 2015 at 07:19:58PM +0000, Elliott, Robert (Persistent
> Memory) wrote:
> 
> ...
> 
> > Due to the historic long latency of storage devices,
> > applications don't re-read from storage again; they
> > save the results.
> > So, the streaming-load instructions are beneficial:
> 
> That's the theory...
> 
> Do you also have some actual performance numbers where non-temporal
> operations are better than the REP; MOVSB and *actually* show
> improvements? And no microbenchmarks please.
> 
> Thanks.
> 

This isn't exactly what you're looking for, but here is 
an example of fio doing reads from pmem devices (reading
from NVDIMMs, writing to DIMMs) with various transfer
sizes.

At 256 KiB, all the main memory buffers fit in the CPU
caches, so no write traffic appears on DDR (just the reads
from the NVDIMMs).  At 1 MiB, the data spills out of the
caches, and writes to the DIMMs end up on DDR.

Although DDR is busier, fio gets a lot less work done:
* 256 KiB: 90 GiB/s by fio
*   1 MiB: 49 GiB/s by fio

We could try modifying pmem to use its own non-temporal
memcpy functions (I've posted experimental patches
before that did this) to see if that transition point
shifts.  We can also watch the CPU cache statistics
while running.

Here are statistics from Intel's pcm-memory.x 
(pardon the wide formatting):

256 KiB
=======
pmem0: (groupid=0, jobs=40): err= 0: pid=20867: Tue Nov 24 18:20:08 2015
  read : io=5219.1GB, bw=89079MB/s, iops=356314, runt= 60006msec
  cpu          : usr=1.74%, sys=96.16%, ctx=49576, majf=0, minf=21997

Run status group 0 (all jobs):
   READ: io=5219.1GB, aggrb=89079MB/s, minb=89079MB/s, maxb=89079MB/s, mint=60006msec, maxt=60006msec

|---------------------------------------||---------------------------------------|
|--             Socket  0             --||--             Socket  1             --|
|---------------------------------------||---------------------------------------|
|--     Memory Channel Monitoring     --||--     Memory Channel Monitoring     --|
|---------------------------------------||---------------------------------------|
|-- Mem Ch  0: Reads (MB/s): 11778.11 --||-- Mem Ch  0: Reads (MB/s): 11743.99 --|
|--            Writes(MB/s):    51.83 --||--            Writes(MB/s):    43.25 --|
|-- Mem Ch  1: Reads (MB/s): 11779.90 --||-- Mem Ch  1: Reads (MB/s): 11736.06 --|
|--            Writes(MB/s):    48.73 --||--            Writes(MB/s):    37.86 --|
|-- Mem Ch  4: Reads (MB/s): 11784.79 --||-- Mem Ch  4: Reads (MB/s): 11746.94 --|
|--            Writes(MB/s):    52.90 --||--            Writes(MB/s):    43.73 --|
|-- Mem Ch  5: Reads (MB/s): 11778.48 --||-- Mem Ch  5: Reads (MB/s): 11741.55 --|
|--            Writes(MB/s):    47.62 --||--            Writes(MB/s):    37.80 --|
|-- NODE 0 Mem Read (MB/s) : 47121.27 --||-- NODE 1 Mem Read (MB/s) : 46968.53 --|
|-- NODE 0 Mem Write(MB/s) :   201.08 --||-- NODE 1 Mem Write(MB/s) :   162.65 --|
|-- NODE 0 P. Write (T/s):     190927 --||-- NODE 1 P. Write (T/s):     182961 --|
|-- NODE 0 Memory (MB/s):    47322.36 --||-- NODE 1 Memory (MB/s):    47131.17 --|
|---------------------------------------||---------------------------------------|
|---------------------------------------||---------------------------------------|
|--                   System Read Throughput(MB/s):  94089.80                  --|
|--                  System Write Throughput(MB/s):    363.73                  --|
|--                 System Memory Throughput(MB/s):  94453.52                  --|
|---------------------------------------||---------------------------------------|

1 MiB
=====
|---------------------------------------||---------------------------------------|
|--             Socket  0             --||--             Socket  1             --|
|---------------------------------------||---------------------------------------|
|--     Memory Channel Monitoring     --||--     Memory Channel Monitoring     --|
|---------------------------------------||---------------------------------------|
|-- Mem Ch  0: Reads (MB/s):  7227.83 --||-- Mem Ch  0: Reads (MB/s):  7047.45 --|
|--            Writes(MB/s):  5894.47 --||--            Writes(MB/s):  6010.66 --|
|-- Mem Ch  1: Reads (MB/s):  7229.32 --||-- Mem Ch  1: Reads (MB/s):  7041.79 --|
|--            Writes(MB/s):  5891.38 --||--            Writes(MB/s):  6003.19 --|
|-- Mem Ch  4: Reads (MB/s):  7230.70 --||-- Mem Ch  4: Reads (MB/s):  7052.44 --|
|--            Writes(MB/s):  5888.63 --||--            Writes(MB/s):  6012.49 --|
|-- Mem Ch  5: Reads (MB/s):  7229.16 --||-- Mem Ch  5: Reads (MB/s):  7047.19 --|
|--            Writes(MB/s):  5882.45 --||--            Writes(MB/s):  6008.11 --|
|-- NODE 0 Mem Read (MB/s) : 28917.01 --||-- NODE 1 Mem Read (MB/s) : 28188.87 --|
|-- NODE 0 Mem Write(MB/s) : 23556.93 --||-- NODE 1 Mem Write(MB/s) : 24034.46 --|
|-- NODE 0 P. Write (T/s):     238713 --||-- NODE 1 P. Write (T/s):     228040 --|
|-- NODE 0 Memory (MB/s):    52473.94 --||-- NODE 1 Memory (MB/s):    52223.33 --|
|---------------------------------------||---------------------------------------|
|---------------------------------------||---------------------------------------|
|--                   System Read Throughput(MB/s):  57105.87                  --|
|--                  System Write Throughput(MB/s):  47591.39                  --|
|--                 System Memory Throughput(MB/s): 104697.27                  --|
|---------------------------------------||---------------------------------------|
Borislav Petkov Dec. 21, 2015, 5:33 p.m. UTC | #25
On Tue, Dec 15, 2015 at 08:25:37PM +0000, Elliott, Robert (Persistent Memory) wrote:
> This isn't exactly what you're looking for, but here is 
> an example of fio doing reads from pmem devices (reading
> from NVDIMMs, writing to DIMMs) with various transfer
> sizes.

... and "fio" is?

> At 256 KiB, all the main memory buffers fit in the CPU
> caches, so no write traffic appears on DDR (just the reads
> from the NVDIMMs).  At 1 MiB, the data spills out of the
> caches, and writes to the DIMMs end up on DDR.
> 
> Although DDR is busier, fio gets a lot less work done:
> * 256 KiB: 90 GiB/s by fio
> *   1 MiB: 49 GiB/s by fio

Yeah, I don't think that answers the question I had: whether REP; MOVSB
is faster/better than using non-temporal stores. But you say that
already above.

Also, if you do non-temporal stores then you're expected to have *more*
memory controller and DIMM traffic as you're pushing everything out
through the WCC.

What would need to be measured instead is, IMO, two things:

* compare NTI vs REP; MOVSB data movement to see the differences in
performance aspects

* run a benchmark (no idea which one) which would measure the positive
impact of the NTI versions which do not pollute the cache and thus do
not hurt other workloads' working set being pushed out of the cache.

Also, we don't really know (at least I don't) what REP; MOVSB
improvements hide behind those enhanced fast string optimizations.
It could be that microcode is doing some aggregation into cachelines
and doing much bigger writes which could compensate for the cache
pollution.

Questions over questions...

> We could try modifying pmem to use its own non-temporal
> memcpy functions (I've posted experimental patches
> before that did this) to see if that transition point
> shifts.  We can also watch the CPU cache statistics
> while running.
> 
> Here are statistics from Intel's pcm-memory.x 
> (pardon the wide formatting):
> 
> 256 KiB
> =======
> pmem0: (groupid=0, jobs=40): err= 0: pid=20867: Tue Nov 24 18:20:08 2015
>   read : io=5219.1GB, bw=89079MB/s, iops=356314, runt= 60006msec
>   cpu          : usr=1.74%, sys=96.16%, ctx=49576, majf=0, minf=21997
> 
> Run status group 0 (all jobs):
>    READ: io=5219.1GB, aggrb=89079MB/s, minb=89079MB/s, maxb=89079MB/s, mint=60006msec, maxt=60006msec
> 
> |---------------------------------------||---------------------------------------|
> |--             Socket  0             --||--             Socket  1             --|
> |---------------------------------------||---------------------------------------|
> |--     Memory Channel Monitoring     --||--     Memory Channel Monitoring     --|
> |---------------------------------------||---------------------------------------|
> |-- Mem Ch  0: Reads (MB/s): 11778.11 --||-- Mem Ch  0: Reads (MB/s): 11743.99 --|
> |--            Writes(MB/s):    51.83 --||--            Writes(MB/s):    43.25 --|
> |-- Mem Ch  1: Reads (MB/s): 11779.90 --||-- Mem Ch  1: Reads (MB/s): 11736.06 --|
> |--            Writes(MB/s):    48.73 --||--            Writes(MB/s):    37.86 --|
> |-- Mem Ch  4: Reads (MB/s): 11784.79 --||-- Mem Ch  4: Reads (MB/s): 11746.94 --|
> |--            Writes(MB/s):    52.90 --||--            Writes(MB/s):    43.73 --|
> |-- Mem Ch  5: Reads (MB/s): 11778.48 --||-- Mem Ch  5: Reads (MB/s): 11741.55 --|
> |--            Writes(MB/s):    47.62 --||--            Writes(MB/s):    37.80 --|
> |-- NODE 0 Mem Read (MB/s) : 47121.27 --||-- NODE 1 Mem Read (MB/s) : 46968.53 --|
> |-- NODE 0 Mem Write(MB/s) :   201.08 --||-- NODE 1 Mem Write(MB/s) :   162.65 --|
> |-- NODE 0 P. Write (T/s):     190927 --||-- NODE 1 P. Write (T/s):     182961 --|

What does T/s mean?

> |-- NODE 0 Memory (MB/s):    47322.36 --||-- NODE 1 Memory (MB/s):    47131.17 --|
> |---------------------------------------||---------------------------------------|
> |---------------------------------------||---------------------------------------|
> |--                   System Read Throughput(MB/s):  94089.80                  --|
> |--                  System Write Throughput(MB/s):    363.73                  --|
> |--                 System Memory Throughput(MB/s):  94453.52                  --|
> |---------------------------------------||---------------------------------------|
> 
> 1 MiB
> =====
> |---------------------------------------||---------------------------------------|
> |--             Socket  0             --||--             Socket  1             --|
> |---------------------------------------||---------------------------------------|
> |--     Memory Channel Monitoring     --||--     Memory Channel Monitoring     --|
> |---------------------------------------||---------------------------------------|
> |-- Mem Ch  0: Reads (MB/s):  7227.83 --||-- Mem Ch  0: Reads (MB/s):  7047.45 --|
> |--            Writes(MB/s):  5894.47 --||--            Writes(MB/s):  6010.66 --|
> |-- Mem Ch  1: Reads (MB/s):  7229.32 --||-- Mem Ch  1: Reads (MB/s):  7041.79 --|
> |--            Writes(MB/s):  5891.38 --||--            Writes(MB/s):  6003.19 --|
> |-- Mem Ch  4: Reads (MB/s):  7230.70 --||-- Mem Ch  4: Reads (MB/s):  7052.44 --|
> |--            Writes(MB/s):  5888.63 --||--            Writes(MB/s):  6012.49 --|
> |-- Mem Ch  5: Reads (MB/s):  7229.16 --||-- Mem Ch  5: Reads (MB/s):  7047.19 --|
> |--            Writes(MB/s):  5882.45 --||--            Writes(MB/s):  6008.11 --|
> |-- NODE 0 Mem Read (MB/s) : 28917.01 --||-- NODE 1 Mem Read (MB/s) : 28188.87 --|
> |-- NODE 0 Mem Write(MB/s) : 23556.93 --||-- NODE 1 Mem Write(MB/s) : 24034.46 --|
> |-- NODE 0 P. Write (T/s):     238713 --||-- NODE 1 P. Write (T/s):     228040 --|
> |-- NODE 0 Memory (MB/s):    52473.94 --||-- NODE 1 Memory (MB/s):    52223.33 --|
> |---------------------------------------||---------------------------------------|
> |---------------------------------------||---------------------------------------|
> |--                   System Read Throughput(MB/s):  57105.87                  --|
> |--                  System Write Throughput(MB/s):  47591.39                  --|
> |--                 System Memory Throughput(MB/s): 104697.27                  --|
> |---------------------------------------||---------------------------------------|

Looks to me like, because writes have increased, the read bandwidth has
dropped too, which makes sense.

Patch

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f2f9b39b274a..779cb0e77ecc 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -216,6 +216,11 @@  __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 extern long __copy_user_nocache(void *dst, const void __user *src,
 				unsigned size, int zerorest);
 
+extern u64 mcsafe_memcpy(void *dst, const void __user *src,
+				unsigned size);
+#define COPY_HAD_MCHECK(ret)	((ret) & BIT(63))
+#define	COPY_MCHECK_PADDR(ret)	((ret) & ~BIT(63))
+
 static inline int
 __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
 {
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..ec988c92c055 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,8 @@  EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+EXPORT_SYMBOL(mcsafe_memcpy);
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 982ce34f4a9b..ffce93cbc9a5 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -319,3 +319,94 @@  ENTRY(__copy_user_nocache)
 	_ASM_EXTABLE(21b,50b)
 	_ASM_EXTABLE(22b,50b)
 ENDPROC(__copy_user_nocache)
+
+/*
+ * mcsafe_memcpy - Uncached memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ * This will force destination/source out of cache for more performance.
+ */
+ENTRY(mcsafe_memcpy)
+	cmpl $8,%edx
+	jb 20f		/* less than 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	movnti %r8,(%rdi)
+	movnti %r9,1*8(%rdi)
+	movnti %r10,2*8(%rdi)
+	movnti %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	movnti %r8,4*8(%rdi)
+	movnti %r9,5*8(%rdi)
+	movnti %r10,6*8(%rdi)
+	movnti %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	movnti %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorl %eax,%eax
+	sfence
+	ret
+
+	.section .fixup,"ax"
+30:
+	sfence
+	/* do_machine_check() sets %eax return value */
+	ret
+	.previous
+
+	_ASM_MCEXTABLE(0b,30b)
+	_ASM_MCEXTABLE(1b,30b)
+	_ASM_MCEXTABLE(2b,30b)
+	_ASM_MCEXTABLE(3b,30b)
+	_ASM_MCEXTABLE(4b,30b)
+	_ASM_MCEXTABLE(9b,30b)
+	_ASM_MCEXTABLE(10b,30b)
+	_ASM_MCEXTABLE(11b,30b)
+	_ASM_MCEXTABLE(12b,30b)
+	_ASM_MCEXTABLE(18b,30b)
+	_ASM_MCEXTABLE(21b,30b)
+ENDPROC(mcsafe_memcpy)