
[v10,3/4] x86, mce: Add __mcsafe_copy()

Message ID 6b63a88e925bbc821dc87f209909c3c1166b3261.1454618190.git.tony.luck@intel.com (mailing list archive)
State New, archived

Commit Message

Luck, Tony Jan. 8, 2016, 9:18 p.m. UTC
Make use of the EXTABLE_FAULT exception table entries. This routine
returns a structure to indicate the result of the copy:

struct mcsafe_ret {
        u64 trapnr;
        u64 remain;
};

If the copy is successful, then both 'trapnr' and 'remain' are zero.

If we faulted during the copy, then 'trapnr' will say which type
of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
bytes were not copied.

Note that this is probably the first of several copy functions.
We can make new ones for non-temporal cache handling etc.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/string_64.h |   8 +++
 arch/x86/kernel/x8664_ksyms_64.c |   2 +
 arch/x86/lib/memcpy_64.S         | 134 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+)
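
For context, a caller consumes the returned structure along these lines; the copy_from_pmem() wrapper below is purely illustrative and not part of this series:

	/* Illustrative only -- not part of this patch. */
	#include <linux/errno.h>
	#include <linux/string.h>	/* pulls in struct mcsafe_ret / __mcsafe_copy() on x86_64 */
	#include <asm/traps.h>		/* X86_TRAP_MC */

	static int copy_from_pmem(void *dst, const void __user *src, size_t cnt)
	{
		struct mcsafe_ret ret = __mcsafe_copy(dst, src, cnt);

		if (ret.trapnr == 0)
			return 0;		/* all of 'cnt' was copied */
		if (ret.trapnr == X86_TRAP_MC)
			return -EIO;		/* poison consumed while reading 'src' */
		return -EFAULT;			/* ordinary #PF; 'ret.remain' bytes were not copied */
	}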

Comments

Borislav Petkov Feb. 7, 2016, 4:49 p.m. UTC | #1
On Fri, Jan 08, 2016 at 01:18:03PM -0800, Tony Luck wrote:
> Make use of the EXTABLE_FAULT exception table entries. This routine
> returns a structure to indicate the result of the copy:
> 
> struct mcsafe_ret {
>         u64 trapnr;
>         u64 remain;
> };
> 
> If the copy is successful, then both 'trapnr' and 'remain' are zero.
> 
> If we faulted during the copy, then 'trapnr' will say which type
> of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
> bytes were not copied.
> 
> Note that this is probably the first of several copy functions.
> We can make new ones for non-temporal cache handling etc.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/string_64.h |   8 +++
>  arch/x86/kernel/x8664_ksyms_64.c |   2 +
>  arch/x86/lib/memcpy_64.S         | 134 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 144 insertions(+)

...

> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 16698bba87de..f576acad485e 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -177,3 +177,137 @@ ENTRY(memcpy_orig)
>  .Lend:
>  	retq
>  ENDPROC(memcpy_orig)
> +
> +#ifndef CONFIG_UML
> +/*
> + * __mcsafe_copy - memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + */
> +ENTRY(__mcsafe_copy)
> +	cmpl $8,%edx
> +	jb 20f		/* less than 8 bytes, go to byte copy loop */
> +
> +	/* check for bad alignment of source */
> +	movl %esi,%ecx

You can save yourself this MOV here in what is, I'm assuming, the
general likely case where @src is aligned and do:

        /* check for bad alignment of source */
        testl $7, %esi
        /* already aligned? */
        jz 102f

        movl %esi,%ecx
        subl $8,%ecx
        negl %ecx
        subl %ecx,%edx
0:      movb (%rsi),%al
        movb %al,(%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz 0b

> +	andl $7,%ecx
> +	jz 102f				/* already aligned */

Please move side-comments over the line they're referring to.

> +	subl $8,%ecx
> +	negl %ecx
> +	subl %ecx,%edx
> +0:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 0b
> +102:
> +	movl %edx,%ecx
> +	andl $63,%edx
> +	shrl $6,%ecx
> +	jz 17f

Please add a \n after the JMPs for better readability - those blocks are
dense as it is. They could use some comments too.

> +1:	movq (%rsi),%r8
> +2:	movq 1*8(%rsi),%r9
> +3:	movq 2*8(%rsi),%r10
> +4:	movq 3*8(%rsi),%r11
> +	mov %r8,(%rdi)
> +	mov %r9,1*8(%rdi)
> +	mov %r10,2*8(%rdi)
> +	mov %r11,3*8(%rdi)

You can say "movq" too here, for consistency.

> +9:	movq 4*8(%rsi),%r8
> +10:	movq 5*8(%rsi),%r9
> +11:	movq 6*8(%rsi),%r10
> +12:	movq 7*8(%rsi),%r11

Why aren't we pushing %r12-%r15 on the stack after the "jz 17f" above
and using them too and thus copying a whole cacheline in one go?

We would need to restore them when we're done with the cacheline-wise
shuffle, of course.

> +	mov %r8,4*8(%rdi)
> +	mov %r9,5*8(%rdi)
> +	mov %r10,6*8(%rdi)
> +	mov %r11,7*8(%rdi)
> +	leaq 64(%rsi),%rsi
> +	leaq 64(%rdi),%rdi
> +	decl %ecx
> +	jnz 1b

...
Borislav Petkov Feb. 7, 2016, 4:55 p.m. UTC | #2
On Fri, Jan 08, 2016 at 01:18:03PM -0800, Tony Luck wrote:
> Make use of the EXTABLE_FAULT exception table entries. This routine
> returns a structure to indicate the result of the copy:
> 
> struct mcsafe_ret {
>         u64 trapnr;
>         u64 remain;
> };
> 
> If the copy is successful, then both 'trapnr' and 'remain' are zero.
> 
> If we faulted during the copy, then 'trapnr' will say which type
> of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
> bytes were not copied.
> 
> Note that this is probably the first of several copy functions.
> We can make new ones for non-temporal cache handling etc.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/string_64.h |   8 +++
>  arch/x86/kernel/x8664_ksyms_64.c |   2 +
>  arch/x86/lib/memcpy_64.S         | 134 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 144 insertions(+)
> 
> diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
> index ff8b9a17dc4b..5b24039463a4 100644
> --- a/arch/x86/include/asm/string_64.h
> +++ b/arch/x86/include/asm/string_64.h
> @@ -78,6 +78,14 @@ int strcmp(const char *cs, const char *ct);
>  #define memset(s, c, n) __memset(s, c, n)
>  #endif
>  
> +struct mcsafe_ret {
> +	u64 trapnr;
> +	u64 remain;
> +};
> +
> +struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt);
> +extern void __mcsafe_copy_end(void);
> +
>  #endif /* __KERNEL__ */
>  
>  #endif /* _ASM_X86_STRING_64_H */
> diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
> index a0695be19864..fff245462a8c 100644
> --- a/arch/x86/kernel/x8664_ksyms_64.c
> +++ b/arch/x86/kernel/x8664_ksyms_64.c
> @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
>  EXPORT_SYMBOL(_copy_from_user);
>  EXPORT_SYMBOL(_copy_to_user);
>  
> +EXPORT_SYMBOL_GPL(__mcsafe_copy);
> +
>  EXPORT_SYMBOL(copy_page);
>  EXPORT_SYMBOL(clear_page);
>  
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 16698bba87de..f576acad485e 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -177,3 +177,137 @@ ENTRY(memcpy_orig)
>  .Lend:
>  	retq
>  ENDPROC(memcpy_orig)
> +
> +#ifndef CONFIG_UML

So this is because we get a link failure on UML:

arch/x86/um/built-in.o:(__ex_table+0x8): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x14): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x20): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x2c): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x38): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x44): more undefined references to `ex_handler_fault' follow
collect2: error: ld returned 1 exit status
make: *** [vmlinux] Error 1

due to those

> +     _ASM_EXTABLE_FAULT(0b,30b)
> +     _ASM_EXTABLE_FAULT(1b,31b)
> +     _ASM_EXTABLE_FAULT(2b,32b)
> +     _ASM_EXTABLE_FAULT(3b,33b)
> +     _ASM_EXTABLE_FAULT(4b,34b)

things below and that's because ex_handler_fault() is defined in
arch/x86/mm/extable.c and UML doesn't include that file in the build. It
takes kernel/extable.c and lib/extable.c only.
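
For reference, that handler is tiny; on x86 proper it looks roughly like this (sketch, for context only). Note that it parks the trap number in %rax, which is what the .fixup code in this patch relies on:

	/* context only -- roughly what arch/x86/mm/extable.c provides on x86 proper */
	bool ex_handler_fault(const struct exception_table_entry *fixup,
			      struct pt_regs *regs, int trapnr)
	{
		regs->ip = ex_fixup_addr(fixup);	/* resume at the fixup label */
		regs->ax = trapnr;			/* fixup code reads the trap number from %rax */
		return true;
	}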

Richi, what's the usual way to address that in UML? I.e., make an
x86-only symbol visible to the UML build too? Define a dummy one, just
so that it builds?

> + * __mcsafe_copy - memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + */
> +ENTRY(__mcsafe_copy)
> +	cmpl $8,%edx
> +	jb 20f		/* less than 8 bytes, go to byte copy loop */
> +
> +	/* check for bad alignment of source */
> +	movl %esi,%ecx
> +	andl $7,%ecx
> +	jz 102f				/* already aligned */
> +	subl $8,%ecx
> +	negl %ecx
> +	subl %ecx,%edx
> +0:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 0b
> +102:
> +	movl %edx,%ecx
> +	andl $63,%edx
> +	shrl $6,%ecx
> +	jz 17f
> +1:	movq (%rsi),%r8
> +2:	movq 1*8(%rsi),%r9
> +3:	movq 2*8(%rsi),%r10
> +4:	movq 3*8(%rsi),%r11
> +	mov %r8,(%rdi)
> +	mov %r9,1*8(%rdi)
> +	mov %r10,2*8(%rdi)
> +	mov %r11,3*8(%rdi)
> +9:	movq 4*8(%rsi),%r8
> +10:	movq 5*8(%rsi),%r9
> +11:	movq 6*8(%rsi),%r10
> +12:	movq 7*8(%rsi),%r11
> +	mov %r8,4*8(%rdi)
> +	mov %r9,5*8(%rdi)
> +	mov %r10,6*8(%rdi)
> +	mov %r11,7*8(%rdi)
> +	leaq 64(%rsi),%rsi
> +	leaq 64(%rdi),%rdi
> +	decl %ecx
> +	jnz 1b
> +17:	movl %edx,%ecx
> +	andl $7,%edx
> +	shrl $3,%ecx
> +	jz 20f
> +18:	movq (%rsi),%r8
> +	mov %r8,(%rdi)
> +	leaq 8(%rsi),%rsi
> +	leaq 8(%rdi),%rdi
> +	decl %ecx
> +	jnz 18b
> +20:	andl %edx,%edx
> +	jz 23f
> +	movl %edx,%ecx
> +21:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 21b
> +23:	xorq %rax, %rax
> +	xorq %rdx, %rdx
> +	/* copy successful. return 0 */
> +	ret
> +
> +	.section .fixup,"ax"
> +	/*
> +	 * machine check handler loaded %rax with trap number
> +	 * We just need to make sure %edx has the number of
> +	 * bytes remaining
> +	 */
> +30:
> +	add %ecx,%edx
> +	ret
> +31:
> +	shl $6,%ecx
> +	add %ecx,%edx
> +	ret
> +32:
> +	shl $6,%ecx
> +	lea -8(%ecx,%edx),%edx
> +	ret
> +33:
> +	shl $6,%ecx
> +	lea -16(%ecx,%edx),%edx
> +	ret
> +34:
> +	shl $6,%ecx
> +	lea -24(%ecx,%edx),%edx
> +	ret
> +35:
> +	shl $6,%ecx
> +	lea -32(%ecx,%edx),%edx
> +	ret
> +36:
> +	shl $6,%ecx
> +	lea -40(%ecx,%edx),%edx
> +	ret
> +37:
> +	shl $6,%ecx
> +	lea -48(%ecx,%edx),%edx
> +	ret
> +38:
> +	shl $6,%ecx
> +	lea -56(%ecx,%edx),%edx
> +	ret
> +39:
> +	lea (%rdx,%rcx,8),%rdx
> +	ret
> +40:
> +	mov %ecx,%edx
> +	ret
> +	.previous
> +
> +	_ASM_EXTABLE_FAULT(0b,30b)
> +	_ASM_EXTABLE_FAULT(1b,31b)
> +	_ASM_EXTABLE_FAULT(2b,32b)
> +	_ASM_EXTABLE_FAULT(3b,33b)
> +	_ASM_EXTABLE_FAULT(4b,34b)
> +	_ASM_EXTABLE_FAULT(9b,35b)
> +	_ASM_EXTABLE_FAULT(10b,36b)
> +	_ASM_EXTABLE_FAULT(11b,37b)
> +	_ASM_EXTABLE_FAULT(12b,38b)
> +	_ASM_EXTABLE_FAULT(18b,39b)
> +	_ASM_EXTABLE_FAULT(21b,40b)
> +#endif
> -- 
> 2.5.0
Richard Weinberger Feb. 7, 2016, 8:54 p.m. UTC | #3
On 07.02.2016 at 17:55, Borislav Petkov wrote:
> due to those
> 
>> +     _ASM_EXTABLE_FAULT(0b,30b)
>> +     _ASM_EXTABLE_FAULT(1b,31b)
>> +     _ASM_EXTABLE_FAULT(2b,32b)
>> +     _ASM_EXTABLE_FAULT(3b,33b)
>> +     _ASM_EXTABLE_FAULT(4b,34b)
> 
> things below and that's because ex_handler_fault() is defined in
> arch/x86/mm/extable.c and UML doesn't include that file in the build. It
> takes kernel/extable.c and lib/extable.c only.
> 
> Richi, what's the usual way to address that in UML? I.e., make an
> x86-only symbol visible to the UML build too? Define a dummy one, just
> so that it builds?

As discussed on IRC with Boris, UML offers only minimal extable support.
To get rid of that #ifndef, UML would have to provide its own
extable.c (mostly copy&paste from arch/x86) and an advanced
struct exception_table_entry which includes the trap number.
This also implies that UML can no longer use uaccess.h from asm-generic,
or has to add a new ifdef into uaccess.h to mask out the minimal
struct exception_table_entry there.
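
For context, the extended x86 entry added earlier in this series looks roughly like the sketch below; strictly speaking the trap number is not stored in the entry itself, it is handed to the per-entry handler at fault time:

	/* rough sketch of the extended x86 layout, for context only */
	struct exception_table_entry {
		int insn;	/* relative offset of the faulting instruction */
		int fixup;	/* relative offset of the fixup code */
		int handler;	/* relative offset of the handler, e.g. ex_handler_fault() */
	};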

So, I'd vote to keep the #ifndef CONFIG_UML in memcpy_64.S.
As soon as you need another #ifndef, please ping me and I'll happily
bite the bullet and implement the advanced extable stuff for UML.
Deal?

Thanks,
//richard
Luck, Tony Feb. 9, 2016, 11:15 p.m. UTC | #4
> You can save yourself this MOV here in what is, I'm assuming, the
> general likely case where @src is aligned and do:
> 
>         /* check for bad alignment of source */
>         testl $7, %esi
>         /* already aligned? */
>         jz 102f
> 
>         movl %esi,%ecx
>         subl $8,%ecx
>         negl %ecx
>         subl %ecx,%edx
> 0:      movb (%rsi),%al
>         movb %al,(%rdi)
>         incq %rsi
>         incq %rdi
>         decl %ecx
>         jnz 0b

The "testl $7, %esi" just checks the low three bits ... it doesn't
change %esi.  But the code from the "subl $8" on down assumes that
%ecx is a number in [1..7] as the count of bytes to copy until we
achieve alignment.

So your "movl %esi,%ecx" needs to be something that just copies the
low three bits and zeroes the high part of %ecx.  Is there a cute
way to do that in x86 assembler?

> Why aren't we pushing %r12-%r15 on the stack after the "jz 17f" above
> and using them too and thus copying a whole cacheline in one go?
> 
> We would need to restore them when we're done with the cacheline-wise
> shuffle, of course.

I copied that loop from arch/x86/lib/copy_user_64.S:__copy_user_nocache()
I guess the answer depends on whether you generally copy enough
cache lines to save enough time to cover the cost of saving and
restoring those registers.

-Tony
Borislav Petkov Feb. 10, 2016, 10:58 a.m. UTC | #5
On Tue, Feb 09, 2016 at 03:15:57PM -0800, Luck, Tony wrote:
> > You can save yourself this MOV here in what is, I'm assuming, the
> > general likely case where @src is aligned and do:
> > 
> >         /* check for bad alignment of source */
> >         testl $7, %esi
> >         /* already aligned? */
> >         jz 102f
> > 
> >         movl %esi,%ecx
> >         subl $8,%ecx
> >         negl %ecx
> >         subl %ecx,%edx
> > 0:      movb (%rsi),%al
> >         movb %al,(%rdi)
> >         incq %rsi
> >         incq %rdi
> >         decl %ecx
> >         jnz 0b
> 
> The "testl $7, %esi" just checks the low three bits ... it doesn't
> change %esi.  But the code from the "subl $8" on down assumes that
> %ecx is a number in [1..7] as the count of bytes to copy until we
> achieve alignment.

Grr, sorry about that, I actually forgot to copy-paste the AND:

        /* check for bad alignment of source */
        testl $7, %esi
        jz 102f                         /* already aligned */

        movl %esi,%ecx
        andl $7,%ecx
        subl $8,%ecx
        negl %ecx
        subl %ecx,%edx
0:      movb (%rsi),%al
        movb %al,(%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz 0b

Basically, I'm proposing to move the unlikely case out of line and
optimize the likely one.

> So your "movl %esi,%ecx" needs to be something that just copies the
> low three bits and zeroes the high part of %ecx.  Is there a cute
> way to do that in x86 assembler?

We could do some funky games with byte-sized moves but those are
generally slower anyway so doing the default operand size thing should
be ok.

> I copied that loop from arch/x86/lib/copy_user_64.S:__copy_user_nocache()
> I guess the answer depends on whether you generally copy enough
> cache lines to save enough time to cover the cost of saving and
> restoring those registers.

Well, that function will run on modern hw with a stack engine so I'd
assume those 4 pushes and pops would be paid for by the increased
register count for the data shuffling.

But one could take that function out and do some microbenchmarking with
different sizes, once with the current version and once with the
pushes and pops of r1[2-5], to see where the breakeven is.
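
A minimal user-space harness for that could look roughly like this (sketch only; copy_4regs()/copy_8regs() are assumed to be the cacheline loop lifted out of memcpy_64.S, built once as-is and once using %r12-%r15 as well):

	/* sketch of a user-space harness; copy_4regs()/copy_8regs() are assumed
	 * to be the cacheline loop extracted from memcpy_64.S in its two variants */
	#include <stdint.h>
	#include <stdio.h>
	#include <stddef.h>
	#include <x86intrin.h>		/* __rdtscp(), _mm_clflush(), _mm_mfence() */

	extern void copy_4regs(void *dst, const void *src, size_t cnt);
	extern void copy_8regs(void *dst, const void *src, size_t cnt);

	static char src[4096], dst[4096];

	static uint64_t measure(void (*copy)(void *, const void *, size_t))
	{
		uint64_t best = ~0ULL;
		unsigned int aux;
		int i, off;

		for (i = 0; i < 10000; i++) {
			/* evict the source before each run so we measure the cold case */
			for (off = 0; off < (int)sizeof(src); off += 64)
				_mm_clflush(src + off);
			_mm_mfence();

			uint64_t t0 = __rdtscp(&aux);
			copy(dst, src, sizeof(src));
			uint64_t t1 = __rdtscp(&aux);

			if (t1 - t0 < best)
				best = t1 - t0;
		}
		return best;
	}

	int main(void)
	{
		printf("r8-r11 only: %llu cycles\n", (unsigned long long)measure(copy_4regs));
		printf("r8-r15     : %llu cycles\n", (unsigned long long)measure(copy_8regs));
		return 0;
	}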
Luck, Tony Feb. 10, 2016, 7:39 p.m. UTC | #6
On Wed, Feb 10, 2016 at 11:58:43AM +0100, Borislav Petkov wrote:
> > But one could take that function out and do some microbenchmarking with
> > different sizes, once with the current version and once with the
> > pushes and pops of r1[2-5], to see where the breakeven is.

On a 4K page copy from a source address that isn't in the
cache I see all sorts of answers.

On my desktop (i7-3960X) it is ~50 cycles slower to push and pop the four
registers.

On my latest Xeon - I can't post benchmarks ... but also a bit slower.

On an older Xeon it is a few cycles faster (but even though I'm
looking at the median of 10,000 runs, I see more run-to-run variation
than I see difference between register choices).

Here's what I tested:

	push %r12
	push %r13
	push %r14
	push %r15

	/* Loop copying whole cache lines */
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
9:	movq 4*8(%rsi),%r12
10:	movq 5*8(%rsi),%r13
11:	movq 6*8(%rsi),%r14
12:	movq 7*8(%rsi),%r15
	movq %r8,(%rdi)
	movq %r9,1*8(%rdi)
	movq %r10,2*8(%rdi)
	movq %r11,3*8(%rdi)
	movq %r12,4*8(%rdi)
	movq %r13,5*8(%rdi)
	movq %r14,6*8(%rdi)
	movq %r15,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b

	pop %r15
	pop %r14
	pop %r13
	pop %r12
-Tony
Borislav Petkov Feb. 10, 2016, 8:50 p.m. UTC | #7
On Wed, Feb 10, 2016 at 11:39:05AM -0800, Luck, Tony wrote:
> On Wed, Feb 10, 2016 at 11:58:43AM +0100, Borislav Petkov wrote:
> > > But one could take that function out and do some microbenchmarking with
> > > different sizes, once with the current version and once with the
> > > pushes and pops of r1[2-5], to see where the breakeven is.
> 
> On a 4K page copy from a source address that isn't in the
> cache I see all sorts of answers.
> 
> On my desktop (i7-3960X) it is ~50 cycles slower to push and pop the four
> registers.
> 
> On my latest Xeon - I can't post benchmarks ... but also a bit slower.
> 
> On an older Xeon it is a few cycles faster (but even though I'm
> looking at the median of 10,000 runs, I see more run-to-run variation
> than I see difference between register choices).

Hmm, strange. Can you check whether perf shows any significant
differences too? Something like:

perf stat --repeat 100 --sync --pre 'echo 3 > /proc/sys/vm/drop_caches' -- ./mcsafe_copy_1

and then

perf stat --repeat 100 --sync --pre 'echo 3 > /proc/sys/vm/drop_caches' -- ./mcsafe_copy_2

That'll be interesting...

Thanks.

Patch

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ff8b9a17dc4b..5b24039463a4 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,14 @@  int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+struct mcsafe_ret {
+	u64 trapnr;
+	u64 remain;
+};
+
+struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt);
+extern void __mcsafe_copy_end(void);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..fff245462a8c 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,8 @@  EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+EXPORT_SYMBOL_GPL(__mcsafe_copy);
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 16698bba87de..f576acad485e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -177,3 +177,137 @@  ENTRY(memcpy_orig)
 .Lend:
 	retq
 ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+/*
+ * __mcsafe_copy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(__mcsafe_copy)
+	cmpl $8,%edx
+	jb 20f		/* less than 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorq %rax, %rax
+	xorq %rdx, %rdx
+	/* copy successful. return 0 */
+	ret
+
+	.section .fixup,"ax"
+	/*
+	 * machine check handler loaded %rax with trap number
+	 * We just need to make sure %edx has the number of
+	 * bytes remaining
+	 */
+30:
+	add %ecx,%edx
+	ret
+31:
+	shl $6,%ecx
+	add %ecx,%edx
+	ret
+32:
+	shl $6,%ecx
+	lea -8(%ecx,%edx),%edx
+	ret
+33:
+	shl $6,%ecx
+	lea -16(%ecx,%edx),%edx
+	ret
+34:
+	shl $6,%ecx
+	lea -24(%ecx,%edx),%edx
+	ret
+35:
+	shl $6,%ecx
+	lea -32(%ecx,%edx),%edx
+	ret
+36:
+	shl $6,%ecx
+	lea -40(%ecx,%edx),%edx
+	ret
+37:
+	shl $6,%ecx
+	lea -48(%ecx,%edx),%edx
+	ret
+38:
+	shl $6,%ecx
+	lea -56(%ecx,%edx),%edx
+	ret
+39:
+	lea (%rdx,%rcx,8),%rdx
+	ret
+40:
+	mov %ecx,%edx
+	ret
+	.previous
+
+	_ASM_EXTABLE_FAULT(0b,30b)
+	_ASM_EXTABLE_FAULT(1b,31b)
+	_ASM_EXTABLE_FAULT(2b,32b)
+	_ASM_EXTABLE_FAULT(3b,33b)
+	_ASM_EXTABLE_FAULT(4b,34b)
+	_ASM_EXTABLE_FAULT(9b,35b)
+	_ASM_EXTABLE_FAULT(10b,36b)
+	_ASM_EXTABLE_FAULT(11b,37b)
+	_ASM_EXTABLE_FAULT(12b,38b)
+	_ASM_EXTABLE_FAULT(18b,39b)
+	_ASM_EXTABLE_FAULT(21b,40b)
+#endif