Message ID | 20230403052233.1880567-5-ankur.a.arora@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | x86/clear_huge_page: multi-page clearing | expand |
On Sun, Apr 02, 2023 at 10:22:28PM -0700, Ankur Arora wrote: > Change clear_page*() to take a length parameter. > Rename to clear_pages_*(). > > Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com> > diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S > index ecbfb4dd3b01..6069acf6072f 100644 > --- a/arch/x86/lib/clear_page_64.S > +++ b/arch/x86/lib/clear_page_64.S > @@ -2,6 +2,8 @@ > #include <linux/linkage.h> > #include <asm/asm.h> > #include <asm/export.h> > +#include <asm/cpufeatures.h> > +#include <asm/alternative.h> > > /* > * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is > @@ -13,18 +15,30 @@ > /* > * Zero a page. > * %rdi - page > + * %esi - page-length > + * > + * Clobbers: %rax, %rcx > */ > -SYM_FUNC_START(clear_page_rep) > - movl $4096/8,%ecx > +SYM_FUNC_START(clear_pages_rep) > + movl %esi, %ecx > xorl %eax,%eax > + shrl $3,%ecx > rep stosq > RET > -SYM_FUNC_END(clear_page_rep) > -EXPORT_SYMBOL_GPL(clear_page_rep) > +SYM_FUNC_END(clear_pages_rep) > +EXPORT_SYMBOL_GPL(clear_pages_rep) > > -SYM_FUNC_START(clear_page_orig) > +/* > + * Original page zeroing loop. > + * %rdi - page > + * %esi - page-length > + * > + * Clobbers: %rax, %rcx, %rflags > + */ > +SYM_FUNC_START(clear_pages_orig) > + movl %esi, %ecx > xorl %eax,%eax > - movl $4096/64,%ecx > + shrl $6,%ecx > .p2align 4 > .Lloop: > decl %ecx > @@ -41,16 +55,23 @@ SYM_FUNC_START(clear_page_orig) > jnz .Lloop > nop > RET > -SYM_FUNC_END(clear_page_orig) > -EXPORT_SYMBOL_GPL(clear_page_orig) > +SYM_FUNC_END(clear_pages_orig) > +EXPORT_SYMBOL_GPL(clear_pages_orig) > > -SYM_FUNC_START(clear_page_erms) > - movl $4096,%ecx > +/* > + * Zero a page. > + * %rdi - page > + * %esi - page-length > + * > + * Clobbers: %rax, %rcx > + */ > +SYM_FUNC_START(clear_pages_erms) > + movl %esi, %ecx > xorl %eax,%eax > rep stosb > RET > -SYM_FUNC_END(clear_page_erms) > -EXPORT_SYMBOL_GPL(clear_page_erms) > +SYM_FUNC_END(clear_pages_erms) > +EXPORT_SYMBOL_GPL(clear_pages_erms) So it seems to me that clear_user_*() and clear_page_*() are now very similar; is there really no way to de-duplicate all that?
Peter Zijlstra <peterz@infradead.org> writes: > On Sun, Apr 02, 2023 at 10:22:28PM -0700, Ankur Arora wrote: >> Change clear_page*() to take a length parameter. >> Rename to clear_pages_*(). >> >> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com> > >> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S >> index ecbfb4dd3b01..6069acf6072f 100644 >> --- a/arch/x86/lib/clear_page_64.S >> +++ b/arch/x86/lib/clear_page_64.S > So it seems to me that clear_user_*() and clear_page_*() are now very > similar; is there really no way to de-duplicate all that? I'm not sure I see a good way to do that. The clear_page_*() variants are way simpler because they don't do alignment or uacess exception handling. The innermost loop in them could be unified -- but both clear_pages_rep() and clear_pages_erms() are pretty trivial. One place where it might be worth doing is clear_user_original() and clear_pages_orig(). These two have also diverged some (clear_pages_orig unrolls its loop and uses a register mov): inner loop in clear_pages_orig: .Lloop: decl %ecx #define PUT(x) movq %rax,x*8(%rdi) movq %rax,(%rdi) PUT(1) PUT(2) PUT(3) PUT(4) PUT(5) PUT(6) PUT(7) leaq 64(%rdi),%rdi jnz .Lloop nop inner loop in clear_user_original: .Lqwords: movq $0,(%rdi) lea 8(%rdi),%rdi dec %rcx jnz .Lqwords Maybe something like this? --- arch/x86/lib/clear_page_64.S | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 6069acf6072f..795a82214d99 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -28,36 +28,6 @@ SYM_FUNC_START(clear_pages_rep) SYM_FUNC_END(clear_pages_rep) EXPORT_SYMBOL_GPL(clear_pages_rep) -/* - * Original page zeroing loop. - * %rdi - page - * %esi - page-length - * - * Clobbers: %rax, %rcx, %rflags - */ -SYM_FUNC_START(clear_pages_orig) - movl %esi, %ecx - xorl %eax,%eax - shrl $6,%ecx - .p2align 4 -.Lloop: - decl %ecx -#define PUT(x) movq %rax,x*8(%rdi) - movq %rax,(%rdi) - PUT(1) - PUT(2) - PUT(3) - PUT(4) - PUT(5) - PUT(6) - PUT(7) - leaq 64(%rdi),%rdi - jnz .Lloop - nop - RET -SYM_FUNC_END(clear_pages_orig) -EXPORT_SYMBOL_GPL(clear_pages_orig) - /* * Zero a page. * %rdi - page @@ -92,6 +62,9 @@ SYM_FUNC_START(clear_user_original) jz .Lrest_bytes # do the qwords first +SYM_INNER_LABEL(clear_pages_orig, SYM_L_GLOBAL) + movl %esi, %ecx + shr $3,%rcx .p2align 4 .Lqwords: movq $0,(%rdi) @@ -135,6 +108,8 @@ SYM_FUNC_START(clear_user_original) SYM_FUNC_END(clear_user_original) EXPORT_SYMBOL(clear_user_original) +EXPORT_SYMBOL_GPL(clear_pages_orig) + /* * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is * present.
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index cc6b8e087192..7ca3bd2448c1 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,22 +39,24 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -void clear_page_orig(void *page); -void clear_page_rep(void *page); -void clear_page_erms(void *page); +void clear_pages_orig(void *page, unsigned int length); +void clear_pages_rep(void *page, unsigned int length); +void clear_pages_erms(void *page, unsigned int length); static inline void clear_page(void *page) { + unsigned long length = PAGE_SIZE; /* * Clean up KMSAN metadata for the page being cleared. The assembly call * below clobbers @page, so we perform unpoisoning before it. */ - kmsan_unpoison_memory(page, PAGE_SIZE); - alternative_call_2(clear_page_orig, - clear_page_rep, X86_FEATURE_REP_GOOD, - clear_page_erms, X86_FEATURE_ERMS, + kmsan_unpoison_memory(page, length); + + alternative_call_2(clear_pages_orig, + clear_pages_rep, X86_FEATURE_REP_GOOD, + clear_pages_erms, X86_FEATURE_ERMS, "=D" (page), - "0" (page) + "0" (page), "S" (length) : "cc", "memory", "rax", "rcx"); } diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ecbfb4dd3b01..6069acf6072f 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -2,6 +2,8 @@ #include <linux/linkage.h> #include <asm/asm.h> #include <asm/export.h> +#include <asm/cpufeatures.h> +#include <asm/alternative.h> /* * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is @@ -13,18 +15,30 @@ /* * Zero a page. * %rdi - page + * %esi - page-length + * + * Clobbers: %rax, %rcx */ -SYM_FUNC_START(clear_page_rep) - movl $4096/8,%ecx +SYM_FUNC_START(clear_pages_rep) + movl %esi, %ecx xorl %eax,%eax + shrl $3,%ecx rep stosq RET -SYM_FUNC_END(clear_page_rep) -EXPORT_SYMBOL_GPL(clear_page_rep) +SYM_FUNC_END(clear_pages_rep) +EXPORT_SYMBOL_GPL(clear_pages_rep) -SYM_FUNC_START(clear_page_orig) +/* + * Original page zeroing loop. + * %rdi - page + * %esi - page-length + * + * Clobbers: %rax, %rcx, %rflags + */ +SYM_FUNC_START(clear_pages_orig) + movl %esi, %ecx xorl %eax,%eax - movl $4096/64,%ecx + shrl $6,%ecx .p2align 4 .Lloop: decl %ecx @@ -41,16 +55,23 @@ SYM_FUNC_START(clear_page_orig) jnz .Lloop nop RET -SYM_FUNC_END(clear_page_orig) -EXPORT_SYMBOL_GPL(clear_page_orig) +SYM_FUNC_END(clear_pages_orig) +EXPORT_SYMBOL_GPL(clear_pages_orig) -SYM_FUNC_START(clear_page_erms) - movl $4096,%ecx +/* + * Zero a page. + * %rdi - page + * %esi - page-length + * + * Clobbers: %rax, %rcx + */ +SYM_FUNC_START(clear_pages_erms) + movl %esi, %ecx xorl %eax,%eax rep stosb RET -SYM_FUNC_END(clear_page_erms) -EXPORT_SYMBOL_GPL(clear_page_erms) +SYM_FUNC_END(clear_pages_erms) +EXPORT_SYMBOL_GPL(clear_pages_erms) /* * Default clear user-space.
Change clear_page*() to take a length parameter. Rename to clear_pages_*(). Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com> --- clear_page() is now defined in terms of clear_pages(). This means that all clear_page() callsites -- which are more numerous -- would use an additional register now. Opinions on if that's worth optimizing? --- arch/x86/include/asm/page_64.h | 18 ++++++++------ arch/x86/lib/clear_page_64.S | 45 +++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 20 deletions(-)