Message ID | 6b63a88e925bbc821dc87f209909c3c1166b3261.1454618190.git.tony.luck@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Jan 08, 2016 at 01:18:03PM -0800, Tony Luck wrote: > Make use of the EXTABLE_FAULT exception table entries. This routine > returns a structure to indicate the result of the copy: > > struct mcsafe_ret { > u64 trapnr; > u64 remain; > }; > > If the copy is successful, then both 'trapnr' and 'remain' are zero. > > If we faulted during the copy, then 'trapnr' will say which type > of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many > bytes were not copied. > > Note that this is probably the first of several copy functions. > We can make new ones for non-temporal cache handling etc. > > Signed-off-by: Tony Luck <tony.luck@intel.com> > --- > arch/x86/include/asm/string_64.h | 8 +++ > arch/x86/kernel/x8664_ksyms_64.c | 2 + > arch/x86/lib/memcpy_64.S | 134 +++++++++++++++++++++++++++++++++++++++ > 3 files changed, 144 insertions(+) ... > diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S > index 16698bba87de..f576acad485e 100644 > --- a/arch/x86/lib/memcpy_64.S > +++ b/arch/x86/lib/memcpy_64.S > @@ -177,3 +177,137 @@ ENTRY(memcpy_orig) > .Lend: > retq > ENDPROC(memcpy_orig) > + > +#ifndef CONFIG_UML > +/* > + * __mcsafe_copy - memory copy with machine check exception handling > + * Note that we only catch machine checks when reading the source addresses. > + * Writes to target are posted and don't generate machine checks. > + */ > +ENTRY(__mcsafe_copy) > + cmpl $8,%edx > + jb 20f /* less then 8 bytes, go to byte copy loop */ > + > + /* check for bad alignment of source */ > + movl %esi,%ecx You can save yourself this MOV here in what is, I'm assuming, the general likely case where @src is aligned and do: /* check for bad alignment of source */ testl $7, %esi /* already aligned? */ jz 102f movl %esi,%ecx subl $8,%ecx negl %ecx subl %ecx,%edx 0: movb (%rsi),%al movb %al,(%rdi) incq %rsi incq %rdi decl %ecx jnz 0b > + andl $7,%ecx > + jz 102f /* already aligned */ Please move side-comments over the line they're referring to. 
> + subl $8,%ecx > + negl %ecx > + subl %ecx,%edx > +0: movb (%rsi),%al > + movb %al,(%rdi) > + incq %rsi > + incq %rdi > + decl %ecx > + jnz 0b > +102: > + movl %edx,%ecx > + andl $63,%edx > + shrl $6,%ecx > + jz 17f Please add a \n after the JMPs for better readability - those blocks are dense as it is. They could use some comments too. > +1: movq (%rsi),%r8 > +2: movq 1*8(%rsi),%r9 > +3: movq 2*8(%rsi),%r10 > +4: movq 3*8(%rsi),%r11 > + mov %r8,(%rdi) > + mov %r9,1*8(%rdi) > + mov %r10,2*8(%rdi) > + mov %r11,3*8(%rdi) You can say "movq" too here, for consistency. > +9: movq 4*8(%rsi),%r8 > +10: movq 5*8(%rsi),%r9 > +11: movq 6*8(%rsi),%r10 > +12: movq 7*8(%rsi),%r11 Why aren't we pushing %r12-%r15 on the stack after the "jz 17f" above and using them too and thus copying a whole cacheline in one go? We would need to restore them when we're done with the cacheline-wise shuffle, of course. > + mov %r8,4*8(%rdi) > + mov %r9,5*8(%rdi) > + mov %r10,6*8(%rdi) > + mov %r11,7*8(%rdi) > + leaq 64(%rsi),%rsi > + leaq 64(%rdi),%rdi > + decl %ecx > + jnz 1b ...
On Fri, Jan 08, 2016 at 01:18:03PM -0800, Tony Luck wrote: > Make use of the EXTABLE_FAULT exception table entries. This routine > returns a structure to indicate the result of the copy: > > struct mcsafe_ret { > u64 trapnr; > u64 remain; > }; > > If the copy is successful, then both 'trapnr' and 'remain' are zero. > > If we faulted during the copy, then 'trapnr' will say which type > of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many > bytes were not copied. > > Note that this is probably the first of several copy functions. > We can make new ones for non-temporal cache handling etc. > > Signed-off-by: Tony Luck <tony.luck@intel.com> > --- > arch/x86/include/asm/string_64.h | 8 +++ > arch/x86/kernel/x8664_ksyms_64.c | 2 + > arch/x86/lib/memcpy_64.S | 134 +++++++++++++++++++++++++++++++++++++++ > 3 files changed, 144 insertions(+) > > diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h > index ff8b9a17dc4b..5b24039463a4 100644 > --- a/arch/x86/include/asm/string_64.h > +++ b/arch/x86/include/asm/string_64.h > @@ -78,6 +78,14 @@ int strcmp(const char *cs, const char *ct); > #define memset(s, c, n) __memset(s, c, n) > #endif > > +struct mcsafe_ret { > + u64 trapnr; > + u64 remain; > +}; > + > +struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt); > +extern void __mcsafe_copy_end(void); > + > #endif /* __KERNEL__ */ > > #endif /* _ASM_X86_STRING_64_H */ > diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c > index a0695be19864..fff245462a8c 100644 > --- a/arch/x86/kernel/x8664_ksyms_64.c > +++ b/arch/x86/kernel/x8664_ksyms_64.c > @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); > EXPORT_SYMBOL(_copy_from_user); > EXPORT_SYMBOL(_copy_to_user); > > +EXPORT_SYMBOL_GPL(__mcsafe_copy); > + > EXPORT_SYMBOL(copy_page); > EXPORT_SYMBOL(clear_page); > > diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S > index 16698bba87de..f576acad485e 100644 > --- 
a/arch/x86/lib/memcpy_64.S > +++ b/arch/x86/lib/memcpy_64.S > @@ -177,3 +177,137 @@ ENTRY(memcpy_orig) > .Lend: > retq > ENDPROC(memcpy_orig) > + > +#ifndef CONFIG_UML So this is because we get a link failure on UML: arch/x86/um/built-in.o:(__ex_table+0x8): undefined reference to `ex_handler_fault' arch/x86/um/built-in.o:(__ex_table+0x14): undefined reference to `ex_handler_fault' arch/x86/um/built-in.o:(__ex_table+0x20): undefined reference to `ex_handler_fault' arch/x86/um/built-in.o:(__ex_table+0x2c): undefined reference to `ex_handler_fault' arch/x86/um/built-in.o:(__ex_table+0x38): undefined reference to `ex_handler_fault' arch/x86/um/built-in.o:(__ex_table+0x44): more undefined references to `ex_handler_fault' follow collect2: error: ld returned 1 exit status make: *** [vmlinux] Error 1 due to those > + _ASM_EXTABLE_FAULT(0b,30b) > + _ASM_EXTABLE_FAULT(1b,31b) > + _ASM_EXTABLE_FAULT(2b,32b) > + _ASM_EXTABLE_FAULT(3b,33b) > + _ASM_EXTABLE_FAULT(4b,34b) things below and that's because ex_handler_fault() is defined in arch/x86/mm/extable.c and UML doesn't include that file in the build. It takes kernel/extable.c and lib/extable.c only. Richi, what's the usual way to address that in UML? I.e., make an x86-only symbol visible to the UML build too? Define a dummy one, just so that it builds? > + * __mcsafe_copy - memory copy with machine check exception handling > + * Note that we only catch machine checks when reading the source addresses. > + * Writes to target are posted and don't generate machine checks. 
> + */ > +ENTRY(__mcsafe_copy) > + cmpl $8,%edx > + jb 20f /* less then 8 bytes, go to byte copy loop */ > + > + /* check for bad alignment of source */ > + movl %esi,%ecx > + andl $7,%ecx > + jz 102f /* already aligned */ > + subl $8,%ecx > + negl %ecx > + subl %ecx,%edx > +0: movb (%rsi),%al > + movb %al,(%rdi) > + incq %rsi > + incq %rdi > + decl %ecx > + jnz 0b > +102: > + movl %edx,%ecx > + andl $63,%edx > + shrl $6,%ecx > + jz 17f > +1: movq (%rsi),%r8 > +2: movq 1*8(%rsi),%r9 > +3: movq 2*8(%rsi),%r10 > +4: movq 3*8(%rsi),%r11 > + mov %r8,(%rdi) > + mov %r9,1*8(%rdi) > + mov %r10,2*8(%rdi) > + mov %r11,3*8(%rdi) > +9: movq 4*8(%rsi),%r8 > +10: movq 5*8(%rsi),%r9 > +11: movq 6*8(%rsi),%r10 > +12: movq 7*8(%rsi),%r11 > + mov %r8,4*8(%rdi) > + mov %r9,5*8(%rdi) > + mov %r10,6*8(%rdi) > + mov %r11,7*8(%rdi) > + leaq 64(%rsi),%rsi > + leaq 64(%rdi),%rdi > + decl %ecx > + jnz 1b > +17: movl %edx,%ecx > + andl $7,%edx > + shrl $3,%ecx > + jz 20f > +18: movq (%rsi),%r8 > + mov %r8,(%rdi) > + leaq 8(%rsi),%rsi > + leaq 8(%rdi),%rdi > + decl %ecx > + jnz 18b > +20: andl %edx,%edx > + jz 23f > + movl %edx,%ecx > +21: movb (%rsi),%al > + movb %al,(%rdi) > + incq %rsi > + incq %rdi > + decl %ecx > + jnz 21b > +23: xorq %rax, %rax > + xorq %rdx, %rdx > + /* copy successful. 
return 0 */ > + ret > + > + .section .fixup,"ax" > + /* > + * machine check handler loaded %rax with trap number > + * We just need to make sure %edx has the number of > + * bytes remaining > + */ > +30: > + add %ecx,%edx > + ret > +31: > + shl $6,%ecx > + add %ecx,%edx > + ret > +32: > + shl $6,%ecx > + lea -8(%ecx,%edx),%edx > + ret > +33: > + shl $6,%ecx > + lea -16(%ecx,%edx),%edx > + ret > +34: > + shl $6,%ecx > + lea -24(%ecx,%edx),%edx > + ret > +35: > + shl $6,%ecx > + lea -32(%ecx,%edx),%edx > + ret > +36: > + shl $6,%ecx > + lea -40(%ecx,%edx),%edx > + ret > +37: > + shl $6,%ecx > + lea -48(%ecx,%edx),%edx > + ret > +38: > + shl $6,%ecx > + lea -56(%ecx,%edx),%edx > + ret > +39: > + lea (%rdx,%rcx,8),%rdx > + ret > +40: > + mov %ecx,%edx > + ret > + .previous > + > + _ASM_EXTABLE_FAULT(0b,30b) > + _ASM_EXTABLE_FAULT(1b,31b) > + _ASM_EXTABLE_FAULT(2b,32b) > + _ASM_EXTABLE_FAULT(3b,33b) > + _ASM_EXTABLE_FAULT(4b,34b) > + _ASM_EXTABLE_FAULT(9b,35b) > + _ASM_EXTABLE_FAULT(10b,36b) > + _ASM_EXTABLE_FAULT(11b,37b) > + _ASM_EXTABLE_FAULT(12b,38b) > + _ASM_EXTABLE_FAULT(18b,39b) > + _ASM_EXTABLE_FAULT(21b,40b) > +#endif > -- > 2.5.0
Am 07.02.2016 um 17:55 schrieb Borislav Petkov: > due to those > >> + _ASM_EXTABLE_FAULT(0b,30b) >> + _ASM_EXTABLE_FAULT(1b,31b) >> + _ASM_EXTABLE_FAULT(2b,32b) >> + _ASM_EXTABLE_FAULT(3b,33b) >> + _ASM_EXTABLE_FAULT(4b,34b) > > things below and that's because ex_handler_fault() is defined in > arch/x86/mm/extable.c and UML doesn't include that file in the build. It > takes kernel/extable.c and lib/extable.c only. > > Richi, what's the usual way to address that in UML? I.e., make an > x86-only symbol visible to the UML build too? Define a dummy one, just > so that it builds? As discussed on IRC with Boris, UML offers only minimal extable support. To get rid of the said #ifndef, UML would have to provide its own extable.c (mostly copy&paste from arch/x86) and an advanced struct exception_table_entry which includes the trap number. This implies also that UML can no longer use uaccess.h from asm-generic or has to add a new ifdef into uaccess.h to whiteout the minimal struct exception_table_entry from there. So, I'd vote to keep the #ifndef CONFIG_UML in memcpy_64.S. As soon as you need another #ifndef please ping me and I'll happily bite the bullet and implement the advanced extable stuff for UML. Deal? Thanks, //richard
> You can save yourself this MOV here in what is, I'm assuming, the > general likely case where @src is aligned and do: > > /* check for bad alignment of source */ > testl $7, %esi > /* already aligned? */ > jz 102f > > movl %esi,%ecx > subl $8,%ecx > negl %ecx > subl %ecx,%edx > 0: movb (%rsi),%al > movb %al,(%rdi) > incq %rsi > incq %rdi > decl %ecx > jnz 0b The "testl $7, %esi" just checks the low three bits ... it doesn't change %esi. But the code from the "subl $8" on down assumes that %ecx is a number in [1..7] as the count of bytes to copy until we achieve alignment. So your "movl %esi,%ecx" needs to be something that just copies the low three bits and zeroes the high part of %ecx. Is there a cute way to do that in x86 assembler? > Why aren't we pushing %r12-%r15 on the stack after the "jz 17f" above > and using them too and thus copying a whole cacheline in one go? > > We would need to restore them when we're done with the cacheline-wise > shuffle, of course. I copied that loop from arch/x86/lib/copy_user_64.S:__copy_user_nocache() I guess the answer depends on whether you generally copy enough cache lines to save enough time to cover the cost of saving and restoring those registers. -Tony
On Tue, Feb 09, 2016 at 03:15:57PM -0800, Luck, Tony wrote: > > You can save yourself this MOV here in what is, I'm assuming, the > > general likely case where @src is aligned and do: > > > > /* check for bad alignment of source */ > > testl $7, %esi > > /* already aligned? */ > > jz 102f > > > > movl %esi,%ecx > > subl $8,%ecx > > negl %ecx > > subl %ecx,%edx > > 0: movb (%rsi),%al > > movb %al,(%rdi) > > incq %rsi > > incq %rdi > > decl %ecx > > jnz 0b > > The "testl $7, %esi" just checks the low three bits ... it doesn't > change %esi. But the code from the "subl $8" on down assumes that > %ecx is a number in [1..7] as the count of bytes to copy until we > achieve alignment. Grr, sorry about that, I actually missed to copy-paste the AND: /* check for bad alignment of source */ testl $7, %esi jz 102f /* already aligned */ movl %esi,%ecx andl $7,%ecx subl $8,%ecx negl %ecx subl %ecx,%edx 0: movb (%rsi),%al movb %al,(%rdi) incq %rsi incq %rdi decl %ecx jnz 0b I basically am proposing to move the unlikely case out of line and optimize the likely one. > So your "movl %esi,%ecx" needs to be somthing that just copies the > low three bits and zeroes the high part of %ecx. Is there a cute > way to do that in x86 assembler? We could do some funky games with byte-sized moves but those are generally slower anyway so doing the default operand size thing should be ok. > I copied that loop from arch/x86/lib/copy_user_64.S:__copy_user_nocache() > I guess the answer depends on whether you generally copy enough > cache lines to save enough time to cover the cost of saving and > restoring those registers. Well, that function will run on modern hw with a stack engine so I'd assume those 4 pushes and pops would be paid for by the increased registers count for the data shuffling. But one could take out that function do some microbenchmarking with different sizes and once with the current version and once with the pushes and pops of r1[2-5] to see where the breakeven is.
On Wed, Feb 10, 2016 at 11:58:43AM +0100, Borislav Petkov wrote: > But one could take out that function do some microbenchmarking with > different sizes and once with the current version and once with the > pushes and pops of r1[2-5] to see where the breakeven is. On a 4K page copy from a source address that isn't in the cache I see all sorts of answers. On my desktop (i7-3960X) it is ~50 cycles slower to push and pop the four registers. On my latest Xeon - I can't post benchmarks ... but also a bit slower. On an older Xeon it is a few cycles faster (but even though I'm looking at the median of 10,000 runs I see more run-to-run variation than I see difference between register choices). Here's what I tested: push %r12 push %r13 push %r14 push %r15 /* Loop copying whole cache lines */ 1: movq (%rsi),%r8 2: movq 1*8(%rsi),%r9 3: movq 2*8(%rsi),%r10 4: movq 3*8(%rsi),%r11 9: movq 4*8(%rsi),%r12 10: movq 5*8(%rsi),%r13 11: movq 6*8(%rsi),%r14 12: movq 7*8(%rsi),%r15 movq %r8,(%rdi) movq %r9,1*8(%rdi) movq %r10,2*8(%rdi) movq %r11,3*8(%rdi) movq %r12,4*8(%rdi) movq %r13,5*8(%rdi) movq %r14,6*8(%rdi) movq %r15,7*8(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi decl %ecx jnz 1b pop %r15 pop %r14 pop %r13 pop %r12 -Tony
On Wed, Feb 10, 2016 at 11:39:05AM -0800, Luck, Tony wrote: > On Wed, Feb 10, 2016 at 11:58:43AM +0100, Borislav Petkov wrote: > > But one could take out that function do some microbenchmarking with > > different sizes and once with the current version and once with the > > pushes and pops of r1[2-5] to see where the breakeven is. > > On a 4K page copy from a source address that isn't in the > cache I see all sorts of answers. > > On my desktop (i7-3960X) it is ~50 cycles slower to push and pop the four > registers. > > On my latest Xeon - I can't post benchmarks ... but also a bit slower. > > On an older Xeon it is a few cycles faster (but even though I'm > looking at the median of 10,000 runs I see more run-to-run variation > that I see difference between register choices. Hmm, strange. Can you check whether perf doesn't show any significant differences too. Something like: perf stat --repeat 100 --sync --pre 'echo 3 > /proc/sys/vm/drop_caches' -- ./mcsafe_copy_1 and then perf stat --repeat 100 --sync --pre 'echo 3 > /proc/sys/vm/drop_caches' -- ./mcsafe_copy_2 That'll be interesting... Thanks.
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index ff8b9a17dc4b..5b24039463a4 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -78,6 +78,14 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +struct mcsafe_ret { + u64 trapnr; + u64 remain; +}; + +struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt); +extern void __mcsafe_copy_end(void); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..fff245462a8c 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); +EXPORT_SYMBOL_GPL(__mcsafe_copy); + EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bba87de..f576acad485e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -177,3 +177,137 @@ ENTRY(memcpy_orig) .Lend: retq ENDPROC(memcpy_orig) + +#ifndef CONFIG_UML +/* + * __mcsafe_copy - memory copy with machine check exception handling + * Note that we only catch machine checks when reading the source addresses. + * Writes to target are posted and don't generate machine checks. 
+ */ +ENTRY(__mcsafe_copy) + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + + /* check for bad alignment of source */ + movl %esi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +0: movb (%rsi),%al + movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 0b +102: + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 + mov %r8,(%rdi) + mov %r9,1*8(%rdi) + mov %r10,2*8(%rdi) + mov %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 + mov %r8,4*8(%rdi) + mov %r9,5*8(%rdi) + mov %r10,6*8(%rdi) + mov %r11,7*8(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + decl %ecx + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 + mov %r8,(%rdi) + leaq 8(%rsi),%rsi + leaq 8(%rdi),%rdi + decl %ecx + jnz 18b +20: andl %edx,%edx + jz 23f + movl %edx,%ecx +21: movb (%rsi),%al + movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 21b +23: xorq %rax, %rax + xorq %rdx, %rdx + /* copy successful. 
return 0 */ + ret + + .section .fixup,"ax" + /* + * machine check handler loaded %rax with trap number + * We just need to make sure %edx has the number of + * bytes remaining + */ +30: + add %ecx,%edx + ret +31: + shl $6,%ecx + add %ecx,%edx + ret +32: + shl $6,%ecx + lea -8(%ecx,%edx),%edx + ret +33: + shl $6,%ecx + lea -16(%ecx,%edx),%edx + ret +34: + shl $6,%ecx + lea -24(%ecx,%edx),%edx + ret +35: + shl $6,%ecx + lea -32(%ecx,%edx),%edx + ret +36: + shl $6,%ecx + lea -40(%ecx,%edx),%edx + ret +37: + shl $6,%ecx + lea -48(%ecx,%edx),%edx + ret +38: + shl $6,%ecx + lea -56(%ecx,%edx),%edx + ret +39: + lea (%rdx,%rcx,8),%rdx + ret +40: + mov %ecx,%edx + ret + .previous + + _ASM_EXTABLE_FAULT(0b,30b) + _ASM_EXTABLE_FAULT(1b,31b) + _ASM_EXTABLE_FAULT(2b,32b) + _ASM_EXTABLE_FAULT(3b,33b) + _ASM_EXTABLE_FAULT(4b,34b) + _ASM_EXTABLE_FAULT(9b,35b) + _ASM_EXTABLE_FAULT(10b,36b) + _ASM_EXTABLE_FAULT(11b,37b) + _ASM_EXTABLE_FAULT(12b,38b) + _ASM_EXTABLE_FAULT(18b,39b) + _ASM_EXTABLE_FAULT(21b,40b) +#endif
Make use of the EXTABLE_FAULT exception table entries. This routine returns a structure to indicate the result of the copy: struct mcsafe_ret { u64 trapnr; u64 remain; }; If the copy is successful, then both 'trapnr' and 'remain' are zero. If we faulted during the copy, then 'trapnr' will say which type of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many bytes were not copied. Note that this is probably the first of several copy functions. We can make new ones for non-temporal cache handling etc. Signed-off-by: Tony Luck <tony.luck@intel.com> --- arch/x86/include/asm/string_64.h | 8 +++ arch/x86/kernel/x8664_ksyms_64.c | 2 + arch/x86/lib/memcpy_64.S | 134 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+)