
[PATCHV3,3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks

Message ID d560d03663b6fd7a5bbeae9842934f329a7dcbdf.1450283985.git.tony.luck@intel.com (mailing list archive)
State Superseded

Commit Message

Luck, Tony Dec. 16, 2015, 1:30 a.m. UTC
Using __copy_user_nocache() as inspiration, create a memory copy
routine for use by kernel code, with annotations to allow for
recovery from machine checks.

Notes:
1) We align the source address rather than the destination. This
   means we never have to deal with a memory read that spans two
   cache lines ... so we can provide a precise indication of
   where the error occurred without having to re-execute at
   a byte-by-byte level to find the exact spot like the original
   did.
2) We 'or' BIT(63) into the return because this is the first
   in a series of machine check safe functions. Some will copy
   from user addresses, so may need to indicate an invalid user
   address instead of a machine check.
3) This code doesn't play any cache games. Future functions can
   use non-temporal loads/stores to meet needs of different callers.
4) Provide helpful macros to decode the return value.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/mcsafe_copy.h |  11 +++
 arch/x86/kernel/x8664_ksyms_64.c   |   5 ++
 arch/x86/lib/Makefile              |   1 +
 arch/x86/lib/mcsafe_copy.S         | 142 +++++++++++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+)
 create mode 100644 arch/x86/include/asm/mcsafe_copy.h
 create mode 100644 arch/x86/lib/mcsafe_copy.S
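
For orientation, here is a minimal sketch (not part of the patch) of how a
caller might consume the mcsafe_memcpy() return value with the decode macros
from note 4; the function name and the error handling are purely illustrative:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <asm/mcsafe_copy.h>

/* Illustrative caller: 0 on success, -EIO if a machine check was hit. */
static int example_copy(void *dst, const void *src, unsigned len)
{
	u64 ret = mcsafe_memcpy(dst, src, len);

	if (COPY_HAD_MCHECK(ret)) {
		/* COPY_MCHECK_REMAIN(ret) = bytes left uncopied */
		pr_err("mcsafe copy failed, %llu bytes remain\n",
		       COPY_MCHECK_REMAIN(ret));
		return -EIO;
	}
	return 0;	/* the whole buffer was copied */
}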

Comments

Borislav Petkov Dec. 22, 2015, 11:13 a.m. UTC | #1
On Tue, Dec 15, 2015 at 05:30:49PM -0800, Tony Luck wrote:
> Using __copy_user_nocache() as inspiration, create a memory copy
> routine for use by kernel code, with annotations to allow for
> recovery from machine checks.
> 
> Notes:
> 1) We align the source address rather than the destination. This
>    means we never have to deal with a memory read that spans two
>    cache lines ... so we can provide a precise indication of
>    where the error occurred without having to re-execute at
>    a byte-by-byte level to find the exact spot like the original
>    did.
> 2) We 'or' BIT(63) into the return because this is the first
>    in a series of machine check safe functions. Some will copy
>    from user addresses, so may need to indicate an invalid user
>    address instead of a machine check.
> 3) This code doesn't play any cache games. Future functions can
>    use non-temporal loads/stores to meet needs of different callers.
> 4) Provide helpful macros to decode the return value.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/mcsafe_copy.h |  11 +++
>  arch/x86/kernel/x8664_ksyms_64.c   |   5 ++
>  arch/x86/lib/Makefile              |   1 +
>  arch/x86/lib/mcsafe_copy.S         | 142 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 159 insertions(+)
>  create mode 100644 arch/x86/include/asm/mcsafe_copy.h
>  create mode 100644 arch/x86/lib/mcsafe_copy.S
> 
> diff --git a/arch/x86/include/asm/mcsafe_copy.h b/arch/x86/include/asm/mcsafe_copy.h
> new file mode 100644
> index 000000000000..d4dbd5a667a3
> --- /dev/null
> +++ b/arch/x86/include/asm/mcsafe_copy.h
> @@ -0,0 +1,11 @@
> +#ifndef _ASM_X86_MCSAFE_COPY_H
> +#define _ASM_X86_MCSAFE_COPY_H
> +
> +u64 mcsafe_memcpy(void *dst, const void *src, unsigned size);
> +
> +#define	COPY_MCHECK_ERRBIT	BIT(63)

What happened to the landing pads Andy was talking about? They sound
like cleaner design than that bit 63...

> +#define COPY_HAD_MCHECK(ret)	((ret) & COPY_MCHECK_ERRBIT)
> +#define COPY_MCHECK_REMAIN(ret)	((ret) & ~COPY_MCHECK_ERRBIT)
> +
> +#endif /* _ASM_MCSAFE_COPY_H */

This should all be in arch/x86/include/asm/string_64.h I guess. You can
save yourself the #include-ing.

> diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
> index a0695be19864..afab8b25dbc0 100644
> --- a/arch/x86/kernel/x8664_ksyms_64.c
> +++ b/arch/x86/kernel/x8664_ksyms_64.c
> @@ -37,6 +37,11 @@ EXPORT_SYMBOL(__copy_user_nocache);
>  EXPORT_SYMBOL(_copy_from_user);
>  EXPORT_SYMBOL(_copy_to_user);
>  
> +#ifdef CONFIG_MCE_KERNEL_RECOVERY
> +#include <asm/mcsafe_copy.h>
> +EXPORT_SYMBOL(mcsafe_memcpy);
> +#endif
> +
>  EXPORT_SYMBOL(copy_page);
>  EXPORT_SYMBOL(clear_page);
>  
> diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
> index f2587888d987..82bb0bf46b6b 100644
> --- a/arch/x86/lib/Makefile
> +++ b/arch/x86/lib/Makefile
> @@ -21,6 +21,7 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
>  lib-y += memcpy_$(BITS).o
>  lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
>  lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
> +lib-$(CONFIG_MCE_KERNEL_RECOVERY) += mcsafe_copy.o
>  
>  obj-y += msr.o msr-reg.o msr-reg-export.o
>  
> diff --git a/arch/x86/lib/mcsafe_copy.S b/arch/x86/lib/mcsafe_copy.S
> new file mode 100644
> index 000000000000..059b3a9642eb
> --- /dev/null
> +++ b/arch/x86/lib/mcsafe_copy.S

You probably should add that function to arch/x86/lib/memcpy_64.S where
we keep all those memcpy variants instead of a separate file.

> @@ -0,0 +1,142 @@
> +/*
> + * Copyright (C) 2015 Intel Corporation
> + * Author: Tony Luck
> + *
> + * This software may be redistributed and/or modified under the terms of
> + * the GNU General Public License ("GPL") version 2 only as published by the
> + * Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +
> +/*
> + * mcsafe_memcpy - memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + */
> +ENTRY(mcsafe_memcpy)
> +	cmpl $8,%edx
> +	jb 20f		/* less than 8 bytes, go to byte copy loop */
> +
> +	/* check for bad alignment of source */
> +	movl %esi,%ecx
> +	andl $7,%ecx
> +	jz 102f				/* already aligned */
> +	subl $8,%ecx
> +	negl %ecx
> +	subl %ecx,%edx
> +0:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 0b
> +102:
> +	movl %edx,%ecx
> +	andl $63,%edx
> +	shrl $6,%ecx
> +	jz 17f
> +1:	movq (%rsi),%r8
> +2:	movq 1*8(%rsi),%r9
> +3:	movq 2*8(%rsi),%r10
> +4:	movq 3*8(%rsi),%r11
> +	mov %r8,(%rdi)
> +	mov %r9,1*8(%rdi)
> +	mov %r10,2*8(%rdi)
> +	mov %r11,3*8(%rdi)
> +9:	movq 4*8(%rsi),%r8
> +10:	movq 5*8(%rsi),%r9
> +11:	movq 6*8(%rsi),%r10
> +12:	movq 7*8(%rsi),%r11
> +	mov %r8,4*8(%rdi)
> +	mov %r9,5*8(%rdi)
> +	mov %r10,6*8(%rdi)
> +	mov %r11,7*8(%rdi)
> +	leaq 64(%rsi),%rsi
> +	leaq 64(%rdi),%rdi
> +	decl %ecx
> +	jnz 1b
> +17:	movl %edx,%ecx
> +	andl $7,%edx
> +	shrl $3,%ecx
> +	jz 20f
> +18:	movq (%rsi),%r8
> +	mov %r8,(%rdi)
> +	leaq 8(%rsi),%rsi
> +	leaq 8(%rdi),%rdi
> +	decl %ecx
> +	jnz 18b
> +20:	andl %edx,%edx
> +	jz 23f
> +	movl %edx,%ecx
> +21:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 21b
> +23:	xorl %eax,%eax
> +	sfence
> +	ret
> +
> +	.section .fixup,"ax"
> +30:
> +	addl %ecx,%edx
> +	jmp 100f
> +31:
> +	shll $6,%ecx
> +	addl %ecx,%edx
> +	jmp 100f
> +32:
> +	shll $6,%ecx
> +	leal -8(%ecx,%edx),%edx
> +	jmp 100f
> +33:
> +	shll $6,%ecx
> +	leal -16(%ecx,%edx),%edx
> +	jmp 100f
> +34:
> +	shll $6,%ecx
> +	leal -24(%ecx,%edx),%edx
> +	jmp 100f
> +35:
> +	shll $6,%ecx
> +	leal -32(%ecx,%edx),%edx
> +	jmp 100f
> +36:
> +	shll $6,%ecx
> +	leal -40(%ecx,%edx),%edx
> +	jmp 100f
> +37:
> +	shll $6,%ecx
> +	leal -48(%ecx,%edx),%edx
> +	jmp 100f
> +38:
> +	shll $6,%ecx
> +	leal -56(%ecx,%edx),%edx
> +	jmp 100f
> +39:
> +	lea (%rdx,%rcx,8),%rdx
> +	jmp 100f
> +40:
> +	movl %ecx,%edx
> +100:
> +	sfence
> +	movabsq	$0x8000000000000000, %rax
> +	orq %rdx,%rax

I think you want to write this as:

	mov %rdx, %rax
	bts $63, %rax

It cuts down instruction bytes by almost half and it is a bit more
readable:

  5c:   48 b8 00 00 00 00 00    movabs $0x8000000000000000,%rax
  63:   00 00 80
  66:   48 09 d0                or     %rdx,%rax

  5c:   48 89 d0                mov    %rdx,%rax
  5f:   48 0f ba e8 3f          bts    $0x3f,%rax


> +	ret

Also, you can drop the "l" suffix - default operand size is 32-bit in
long mode:

        .section .fixup,"ax"
30:
        add %ecx,%edx
        jmp 100f
31:
        shl $6,%ecx
        add %ecx,%edx
        jmp 100f
32:
        shl $6,%ecx
        lea -8(%ecx,%edx),%edx
        jmp 100f
33:
        shl $6,%ecx
        lea -16(%ecx,%edx),%edx
        jmp 100f
34:
        shl $6,%ecx
        lea -24(%ecx,%edx),%edx
        jmp 100f
35:
        shl $6,%ecx
        lea -32(%ecx,%edx),%edx
        jmp 100f
36:
        shl $6,%ecx
        lea -40(%ecx,%edx),%edx
        jmp 100f
37:
        shl $6,%ecx
        lea -48(%ecx,%edx),%edx
        jmp 100f
38:
        shl $6,%ecx
        lea -56(%ecx,%edx),%edx
        jmp 100f
39:
        lea (%rdx,%rcx,8),%rdx
        jmp 100f
40:
        mov %ecx,%edx
100:
Tony Luck Dec. 22, 2015, 7:38 p.m. UTC | #2
On Tue, Dec 22, 2015 at 3:13 AM, Borislav Petkov <bp@alien8.de> wrote:
>> +#define      COPY_MCHECK_ERRBIT      BIT(63)
>
> What happened to the landing pads Andy was talking about? They sound
> like cleaner design than that bit 63...

I interpreted that comment as "stop playing with %rax in the fault
handler ... just change the IP to point to the .fixup location" ...
the target of the fixup being the "landing pad".

Right now this function has only one set of fault fixups (for machine
checks). When I tackle copy_from_user() it will sprout a second
set for page faults, and then will look a bit more like Andy's dual
landing pad example.

I still need an indicator to the caller which type of fault happened
since their actions will be different. So BIT(63) lives on ... but is
now set in the .fixup section rather than in the machine check
code.

I'll move the function and #defines as you suggest - we don't need
new files for these.  Also will fix the assembly code.
[In my defense that load immediate 0x8000000000000000 and 'or'
was what gcc -O2 generates from a simple bit of C code to set
bit 63 ... perhaps it is faster, or perhaps gcc is on drugs. In this
case code compactness wins over possible speed difference].
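
For reference, and this is only a guess at the fragment Tony means, C along
these lines compiled with gcc -O2 typically produces exactly that movabs + or
pair, because the 64-bit constant cannot be encoded as an immediate operand
of "or":

#include <linux/types.h>

/* illustrative only: tag the remaining byte count with bit 63 */
u64 mark_mcheck(u64 remain)
{
	return remain | (1ULL << 63);
}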

-Tony
Borislav Petkov Dec. 23, 2015, 12:58 p.m. UTC | #3
On Tue, Dec 22, 2015 at 11:38:07AM -0800, Tony Luck wrote:
> I interpreted that comment as "stop playing with %rax in the fault
> handler ... just change the IP to point to the .fixup location" ...
> the target of the fixup being the "landing pad".
> 
> Right now this function has only one set of fault fixups (for machine
> checks). When I tackle copy_from_user() it will sprout a second
> set for page faults, and then will look a bit more like Andy's dual
> landing pad example.
> 
> I still need an indicator to the caller which type of fault happened
> since their actions will be different. So BIT(63) lives on ... but is
> now set in the .fixup section rather than in the machine check
> code.

You mean this previous example of yours:

int copy_from_user(void *to, void *from, unsigned long n)
{
        u64 ret = mcsafe_memcpy(to, from, n);

        if (COPY_HAD_MCHECK(ret)) {
                if (memory_failure(COPY_MCHECK_PADDR(ret) >> PAGE_SHIFT, ...))
                        force_sig(SIGBUS, current);
                return something;
        } else
                return ret;
}

?

So what's wrong with mcsafe_memcpy() returning a proper retval which
says what type of fault happened?

I know, memcpy returns the ptr to @dest like a parrot but your version
mcsafe_memcpy() will be different. It can even be called __mcsafe_memcpy
and have a wrapper around it which fiddles out the proper retvals and
returns @dest after all. It would still be cleaner this way IMHO.
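
A rough sketch of that suggestion follows; the split into a raw
__mcsafe_memcpy() asm core plus a wrapper, and the particular error codes,
are hypothetical here rather than something the posted patch provides:

#include <linux/bitops.h>
#include <linux/errno.h>
#include <asm/mcsafe_copy.h>

u64 __mcsafe_memcpy(void *dst, const void *src, unsigned size); /* asm core */

/* Could simply take over the mcsafe_memcpy() name, per the suggestion. */
static int mcsafe_memcpy_status(void *dst, const void *src, unsigned size)
{
	u64 ret = __mcsafe_memcpy(dst, src, size);

	if (!ret)
		return 0;		/* everything copied */
	if (COPY_HAD_MCHECK(ret))
		return -EIO;		/* machine check on a source read */
	return -EFAULT;			/* (future) page fault on a user source */
}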

> I'll move the function and #defines as you suggest - we don't need
> new files for these.  Also will fix the assembly code.
> [In my defense that load immediate 0x8000000000000000 and 'or'
> was what gcc -O2 generates from a simple bit of C code to set
> bit 63 ... perhaps it is faster, or perhaps gcc is on drugs. In this
> case code compactness wins over possible speed difference].

Well, upon a second thought, the reason why gcc would use that huge
immediate could be because by using BTS, it clobbers the carry flag
in rFLAGS. And I guess we don't want that. Although any Jcc or other
conditional instructions touching rFLAGS following will overwrite that
bit so it won't really matter.

I've asked a gcc person, we'll see what interesting explanation comes
back.
Dan Williams Dec. 23, 2015, 7:31 p.m. UTC | #4
On Wed, Dec 23, 2015 at 4:58 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Dec 22, 2015 at 11:38:07AM -0800, Tony Luck wrote:
>> I interpreted that comment as "stop playing with %rax in the fault
>> handler ... just change the IP to point to the .fixup location" ...
>> the target of the fixup being the "landing pad".
>>
>> Right now this function has only one set of fault fixups (for machine
>> checks). When I tackle copy_from_user() it will sprout a second
>> set for page faults, and then will look a bit more like Andy's dual
>> landing pad example.
>>
>> I still need an indicator to the caller which type of fault happened
>> since their actions will be different. So BIT(63) lives on ... but is
>> now set in the .fixup section rather than in the machine check
>> code.
>
> You mean this previous example of yours:
>
> int copy_from_user(void *to, void *from, unsigned long n)
> {
>         u64 ret = mcsafe_memcpy(to, from, n);
>
>         if (COPY_HAD_MCHECK(ret)) {
>                 if (memory_failure(COPY_MCHECK_PADDR(ret) >> PAGE_SHIFT, ...))
>                         force_sig(SIGBUS, current);
>                 return something;
>         } else
>                 return ret;
> }
>
> ?
>
> So what's wrong with mcsafe_memcpy() returning a proper retval which
> says what type of fault happened?
>
> I know, memcpy returns the ptr to @dest like a parrot but your version
> mcsafe_memcpy() will be different. It can even be called __mcsafe_memcpy
> and have a wrapper around it which fiddles out the proper retvals and
> returns @dest after all. It would still be cleaner this way IMHO.

We might leave this to the consumer.  It's already the case that
mcsafe_memcpy() is arch specific so I'm having to wrap its return
value into a generic value.  My current thinking is make
memcpy_from_pmem() return a pmem_cookie_t, and then have an arch
specific pmem_copy_error(pmem_cookie_t cookie) helper that interprets
the value.  This is similar to the situation we have with
dma_mapping_error().
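
The shape Dan describes might look roughly like this; pmem_cookie_t,
memcpy_from_pmem() and pmem_copy_error() are proposed names rather than
existing interfaces, and routing the x86 side through mcsafe_memcpy() /
COPY_HAD_MCHECK() is only an assumption for illustration:

#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <asm/mcsafe_copy.h>

typedef u64 pmem_cookie_t;			/* opaque, arch-defined contents */

static inline pmem_cookie_t memcpy_from_pmem(void *dst, const void *src, size_t n)
{
	return mcsafe_memcpy(dst, src, n);	/* x86 backend, for illustration */
}

static inline int pmem_copy_error(pmem_cookie_t cookie)
{
	return COPY_HAD_MCHECK(cookie) ? -EIO : 0;
}

/*
 * Usage mirrors dma_mapping_error():
 *
 *	pmem_cookie_t c = memcpy_from_pmem(buf, pmem, len);
 *	if (pmem_copy_error(c))
 *		... handle the poisoned source ...
 */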
Tony Luck Dec. 23, 2015, 8:46 p.m. UTC | #5
> I know, memcpy returns the ptr to @dest like a parrot

Maybe I need to change the name to remove the
"memcpy" substring to avoid this confusion. How
about "mcsafe_copy()"? Perhaps with a "__" prefix
to point out it is a building block that will get various
wrappers around it??

Dan wants a copy_from_nvdimm() that either completes
the copy, or indicates where a machine check occurred.

I'm going to want a copy_from_user() that has two fault
options (user gave a bad address -> -EFAULT, or the
source address had an uncorrected error -> SIGBUS).
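
A sketch of how those two outcomes might be kept apart, assuming a future
__mcsafe_copy() core that still marks the machine-check case with BIT(63) and
reports a plain page fault as a bare count of uncopied bytes (all names here
are hypothetical):

#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/bitops.h>
#include <asm/mcsafe_copy.h>

u64 __mcsafe_copy(void *to, const void __user *from, unsigned long n); /* hypothetical core */

unsigned long mcsafe_copy_from_user(void *to, const void __user *from,
				    unsigned long n)
{
	u64 ret = __mcsafe_copy(to, from, n);

	if (COPY_HAD_MCHECK(ret)) {
		force_sig(SIGBUS, current);	/* uncorrected error in the source */
		return COPY_MCHECK_REMAIN(ret);
	}
	return ret;	/* nonzero means a bad user address; caller maps it to -EFAULT */
}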

-Tony
Borislav Petkov Dec. 24, 2015, 1:37 p.m. UTC | #6
On Wed, Dec 23, 2015 at 12:46:20PM -0800, Tony Luck wrote:
> > I know, memcpy returns the ptr to @dest like a parrot
> 
> Maybe I need to change the name to remove the
> "memcpy" substring to avoid this confusion. How
> about "mcsafe_copy()"? Perhaps with a "__" prefix
> to point out it is a building block that will get various
> wrappers around it??
> 
> Dan wants a copy_from_nvdimm() that either completes
> the copy, or indicates where a machine check occurred.
> 
> I'm going to want a copy_from_user() that has two fault
> options (user gave a bad address -> -EFAULT, or the
> source address had an uncorrected error -> SIGBUS).

Sounds like standard kernel design to me. :)

Patch

diff --git a/arch/x86/include/asm/mcsafe_copy.h b/arch/x86/include/asm/mcsafe_copy.h
new file mode 100644
index 000000000000..d4dbd5a667a3
--- /dev/null
+++ b/arch/x86/include/asm/mcsafe_copy.h
@@ -0,0 +1,11 @@ 
+#ifndef _ASM_X86_MCSAFE_COPY_H
+#define _ASM_X86_MCSAFE_COPY_H
+
+u64 mcsafe_memcpy(void *dst, const void *src, unsigned size);
+
+#define	COPY_MCHECK_ERRBIT	BIT(63)
+#define COPY_HAD_MCHECK(ret)	((ret) & COPY_MCHECK_ERRBIT)
+#define COPY_MCHECK_REMAIN(ret)	((ret) & ~COPY_MCHECK_ERRBIT)
+
+#endif /* _ASM_MCSAFE_COPY_H */
+
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..afab8b25dbc0 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,11 @@  EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+#include <asm/mcsafe_copy.h>
+EXPORT_SYMBOL(mcsafe_memcpy);
+#endif
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2587888d987..82bb0bf46b6b 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -21,6 +21,7 @@  lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-$(CONFIG_MCE_KERNEL_RECOVERY) += mcsafe_copy.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
 
diff --git a/arch/x86/lib/mcsafe_copy.S b/arch/x86/lib/mcsafe_copy.S
new file mode 100644
index 000000000000..059b3a9642eb
--- /dev/null
+++ b/arch/x86/lib/mcsafe_copy.S
@@ -0,0 +1,142 @@ 
+/*
+ * Copyright (C) 2015 Intel Corporation
+ * Author: Tony Luck
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+/*
+ * mcsafe_memcpy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(mcsafe_memcpy)
+	cmpl $8,%edx
+	jb 20f		/* less than 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorl %eax,%eax
+	sfence
+	ret
+
+	.section .fixup,"ax"
+30:
+	addl %ecx,%edx
+	jmp 100f
+31:
+	shll $6,%ecx
+	addl %ecx,%edx
+	jmp 100f
+32:
+	shll $6,%ecx
+	leal -8(%ecx,%edx),%edx
+	jmp 100f
+33:
+	shll $6,%ecx
+	leal -16(%ecx,%edx),%edx
+	jmp 100f
+34:
+	shll $6,%ecx
+	leal -24(%ecx,%edx),%edx
+	jmp 100f
+35:
+	shll $6,%ecx
+	leal -32(%ecx,%edx),%edx
+	jmp 100f
+36:
+	shll $6,%ecx
+	leal -40(%ecx,%edx),%edx
+	jmp 100f
+37:
+	shll $6,%ecx
+	leal -48(%ecx,%edx),%edx
+	jmp 100f
+38:
+	shll $6,%ecx
+	leal -56(%ecx,%edx),%edx
+	jmp 100f
+39:
+	lea (%rdx,%rcx,8),%rdx
+	jmp 100f
+40:
+	movl %ecx,%edx
+100:
+	sfence
+	movabsq	$0x8000000000000000, %rax
+	orq %rdx,%rax
+	ret
+	.previous
+
+	_ASM_MCEXTABLE(0b,30b)
+	_ASM_MCEXTABLE(1b,31b)
+	_ASM_MCEXTABLE(2b,32b)
+	_ASM_MCEXTABLE(3b,33b)
+	_ASM_MCEXTABLE(4b,34b)
+	_ASM_MCEXTABLE(9b,35b)
+	_ASM_MCEXTABLE(10b,36b)
+	_ASM_MCEXTABLE(11b,37b)
+	_ASM_MCEXTABLE(12b,38b)
+	_ASM_MCEXTABLE(18b,39b)
+	_ASM_MCEXTABLE(21b,40b)
+ENDPROC(mcsafe_memcpy)