[v3,1/4] x86/clear_page: extend clear_page*() for multi-page clearing

Message ID: 20250414034607.762653-2-ankur.a.arora@oracle.com
State: New
Series: mm/folio_zero_user: add multi-page clearing

Commit Message

Ankur Arora April 14, 2025, 3:46 a.m. UTC
clear_page*() variants now take a page-aligned length parameter and
clears the whole region.

Rename to clear_pages*().

Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/x86/include/asm/page_64.h | 20 +++++++------
 arch/x86/lib/clear_page_64.S   | 52 +++++++++++++++++++++++++---------
 2 files changed, 49 insertions(+), 23 deletions(-)

Comments

Ingo Molnar April 14, 2025, 6:32 a.m. UTC | #1
* Ankur Arora <ankur.a.arora@oracle.com> wrote:

> clear_page*() variants now take a page-aligned length parameter and
> clears the whole region.

Please read your changelogs and fix typos. ;-)

> +void clear_pages_orig(void *page, unsigned int length);
> +void clear_pages_rep(void *page, unsigned int length);
> +void clear_pages_erms(void *page, unsigned int length);

What unit is 'length' in? If it's bytes, why is this interface 
artificially limiting itself to ~4GB? On x86-64 there's very little (if 
any) performance difference between 32-bit and 64-bit length
iterations.

Even if we end up only exposing a 32-bit length API to the generic MM 
layer, there's no reason to limit the x86-64 assembly code in such a 
fashion.
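
Concretely, the unrestricted prototypes would just take a 64-bit byte
count, e.g. (a sketch, keeping the patch's current names):

	void clear_pages_orig(void *page, unsigned long length);	/* length in bytes */
	void clear_pages_rep(void *page, unsigned long length);
	void clear_pages_erms(void *page, unsigned long length);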

>  static inline void clear_page(void *page)
>  {
> +	unsigned int length = PAGE_SIZE;
>  	/*
> -	 * Clean up KMSAN metadata for the page being cleared. The assembly call
> +	 * Clean up KMSAN metadata for the pages being cleared. The assembly call
>  	 * below clobbers @page, so we perform unpoisoning before it.

>  	 */
> -	kmsan_unpoison_memory(page, PAGE_SIZE);
> -	alternative_call_2(clear_page_orig,
> -			   clear_page_rep, X86_FEATURE_REP_GOOD,
> -			   clear_page_erms, X86_FEATURE_ERMS,
> +	kmsan_unpoison_memory(page, length);
> +
> +	alternative_call_2(clear_pages_orig,
> +			   clear_pages_rep, X86_FEATURE_REP_GOOD,
> +			   clear_pages_erms, X86_FEATURE_ERMS,
>  			   "=D" (page),
> -			   "D" (page),
> +			   ASM_INPUT("D" (page), "S" (length)),
>  			   "cc", "memory", "rax", "rcx");
>  }
>  
> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> index a508e4a8c66a..bce516263b69 100644
> --- a/arch/x86/lib/clear_page_64.S
> +++ b/arch/x86/lib/clear_page_64.S
> @@ -13,20 +13,35 @@
>   */
>  
>  /*
> - * Zero a page.
> - * %rdi	- page
> + * Zero kernel page aligned region.
> + *
> + * Input:
> + * %rdi	- destination
> + * %esi	- length
> + *
> + * Clobbers: %rax, %rcx
>   */
> -SYM_TYPED_FUNC_START(clear_page_rep)
> -	movl $4096/8,%ecx
> +SYM_TYPED_FUNC_START(clear_pages_rep)
> +	movl %esi, %ecx
>  	xorl %eax,%eax
> +	shrl $3,%ecx
>  	rep stosq
>  	RET
> -SYM_FUNC_END(clear_page_rep)
> -EXPORT_SYMBOL_GPL(clear_page_rep)
> +SYM_FUNC_END(clear_pages_rep)
> +EXPORT_SYMBOL_GPL(clear_pages_rep)
>  
> -SYM_TYPED_FUNC_START(clear_page_orig)
> +/*
> + * Original page zeroing loop.
> + * Input:
> + * %rdi	- destination
> + * %esi	- length
> + *
> + * Clobbers: %rax, %rcx, %rflags
> + */
> +SYM_TYPED_FUNC_START(clear_pages_orig)
> +	movl   %esi, %ecx
>  	xorl   %eax,%eax
> -	movl   $4096/64,%ecx
> +	shrl   $6,%ecx

So if the natural input parameter is RCX, why is this function using 
RSI as the input 'length' parameter? Causes unnecessary register 
shuffling.

> +/*
> + * Zero kernel page aligned region.
> + *
> + * Input:
> + * %rdi	- destination
> + * %esi	- length
> + *
> + * Clobbers: %rax, %rcx
> + */
> +SYM_TYPED_FUNC_START(clear_pages_erms)
> +	movl %esi, %ecx
>  	xorl %eax,%eax
>  	rep stosb
>  	RET

Same observation: unnecessary register shuffling.

Also, please rename this (now-) terribly named interface:

> +void clear_pages_orig(void *page, unsigned int length);
> +void clear_pages_rep(void *page, unsigned int length);
> +void clear_pages_erms(void *page, unsigned int length);

Because the 'pages' is now a bit misleading, and why is the starting 
address called a 'page'?

So a more sensible namespace would be to follow memset nomenclature:

	void memzero_page_aligned_*(void *addr, unsigned long len);

... and note the intentional abbreviation to 'len'.

Also, since most of these changes are to x86 architecture code, this is 
a new interface only used by x86, and the MM glue is minimal, I'd like 
to merge this series via the x86 tree, if the glue gets acks from MM 
folks.

Thanks,

	Ingo
Peter Zijlstra April 14, 2025, 11:02 a.m. UTC | #2
On Mon, Apr 14, 2025 at 08:32:29AM +0200, Ingo Molnar wrote:

> >  static inline void clear_page(void *page)
> >  {
> > +	unsigned int length = PAGE_SIZE;
> >  	/*
> > -	 * Clean up KMSAN metadata for the page being cleared. The assembly call
> > +	 * Clean up KMSAN metadata for the pages being cleared. The assembly call
> >  	 * below clobbers @page, so we perform unpoisoning before it.
> 
> >  	 */
> > -	kmsan_unpoison_memory(page, PAGE_SIZE);
> > -	alternative_call_2(clear_page_orig,
> > -			   clear_page_rep, X86_FEATURE_REP_GOOD,
> > -			   clear_page_erms, X86_FEATURE_ERMS,
> > +	kmsan_unpoison_memory(page, length);
> > +
> > +	alternative_call_2(clear_pages_orig,
> > +			   clear_pages_rep, X86_FEATURE_REP_GOOD,
> > +			   clear_pages_erms, X86_FEATURE_ERMS,
> >  			   "=D" (page),
> > -			   "D" (page),
> > +			   ASM_INPUT("D" (page), "S" (length)),
> >  			   "cc", "memory", "rax", "rcx");
> >  }
> >  
> > diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> > index a508e4a8c66a..bce516263b69 100644
> > --- a/arch/x86/lib/clear_page_64.S
> > +++ b/arch/x86/lib/clear_page_64.S
> > @@ -13,20 +13,35 @@
> >   */
> >  
> >  /*
> > - * Zero a page.
> > - * %rdi	- page
> > + * Zero kernel page aligned region.
> > + *
> > + * Input:
> > + * %rdi	- destination
> > + * %esi	- length
> > + *
> > + * Clobbers: %rax, %rcx
> >   */
> > -SYM_TYPED_FUNC_START(clear_page_rep)
> > -	movl $4096/8,%ecx
> > +SYM_TYPED_FUNC_START(clear_pages_rep)
> > +	movl %esi, %ecx
> >  	xorl %eax,%eax
> > +	shrl $3,%ecx
> >  	rep stosq
> >  	RET
> > -SYM_FUNC_END(clear_page_rep)
> > -EXPORT_SYMBOL_GPL(clear_page_rep)
> > +SYM_FUNC_END(clear_pages_rep)
> > +EXPORT_SYMBOL_GPL(clear_pages_rep)
> >  
> > -SYM_TYPED_FUNC_START(clear_page_orig)
> > +/*
> > + * Original page zeroing loop.
> > + * Input:
> > + * %rdi	- destination
> > + * %esi	- length
> > + *
> > + * Clobbers: %rax, %rcx, %rflags
> > + */
> > +SYM_TYPED_FUNC_START(clear_pages_orig)
> > +	movl   %esi, %ecx
> >  	xorl   %eax,%eax
> > -	movl   $4096/64,%ecx
> > +	shrl   $6,%ecx
> 
> So if the natural input parameter is RCX, why is this function using 
> RSI as the input 'length' parameter? Causes unnecessary register 
> shuffling.

This symbol is written as a C function with C calling convention, even
though it is only meant to be called from that clear_page() alternative.

If we want to go change all this, then we should go do the same we do
for __clear_user() and write it thusly:

	asm volatile(ALTERNATIVE("rep stosb",
				 "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
		     : "a" (0));

And forget about all those clear_page_*() thingies.
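
Filled out into a complete function, that fragment becomes something like
the following untested sketch (size and addr made explicit; reusing
rep_stos_alternative(), which today backs __clear_user(), for kernel pages
is the fragment's assumption, and the KMSAN unpoisoning is carried over
from the existing clear_page()):

	static inline void clear_page(void *addr)
	{
		unsigned long size = PAGE_SIZE;

		/* Unpoison before the asm below, which clobbers @addr. */
		kmsan_unpoison_memory(addr, size);

		asm volatile(ALTERNATIVE("rep stosb",
					 "call rep_stos_alternative",
					 ALT_NOT(X86_FEATURE_FSRS))
			     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
			     : "a" (0)
			     : "memory");
	}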
Ingo Molnar April 14, 2025, 11:14 a.m. UTC | #3
* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Apr 14, 2025 at 08:32:29AM +0200, Ingo Molnar wrote:
> 
> > >  static inline void clear_page(void *page)
> > >  {
> > > +	unsigned int length = PAGE_SIZE;
> > >  	/*
> > > -	 * Clean up KMSAN metadata for the page being cleared. The assembly call
> > > +	 * Clean up KMSAN metadata for the pages being cleared. The assembly call
> > >  	 * below clobbers @page, so we perform unpoisoning before it.
> > 
> > >  	 */
> > > -	kmsan_unpoison_memory(page, PAGE_SIZE);
> > > -	alternative_call_2(clear_page_orig,
> > > -			   clear_page_rep, X86_FEATURE_REP_GOOD,
> > > -			   clear_page_erms, X86_FEATURE_ERMS,
> > > +	kmsan_unpoison_memory(page, length);
> > > +
> > > +	alternative_call_2(clear_pages_orig,
> > > +			   clear_pages_rep, X86_FEATURE_REP_GOOD,
> > > +			   clear_pages_erms, X86_FEATURE_ERMS,
> > >  			   "=D" (page),
> > > -			   "D" (page),
> > > +			   ASM_INPUT("D" (page), "S" (length)),
> > >  			   "cc", "memory", "rax", "rcx");
> > >  }
> > >  
> > > diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> > > index a508e4a8c66a..bce516263b69 100644
> > > --- a/arch/x86/lib/clear_page_64.S
> > > +++ b/arch/x86/lib/clear_page_64.S
> > > @@ -13,20 +13,35 @@
> > >   */
> > >  
> > >  /*
> > > - * Zero a page.
> > > - * %rdi	- page
> > > + * Zero kernel page aligned region.
> > > + *
> > > + * Input:
> > > + * %rdi	- destination
> > > + * %esi	- length
> > > + *
> > > + * Clobbers: %rax, %rcx
> > >   */
> > > -SYM_TYPED_FUNC_START(clear_page_rep)
> > > -	movl $4096/8,%ecx
> > > +SYM_TYPED_FUNC_START(clear_pages_rep)
> > > +	movl %esi, %ecx
> > >  	xorl %eax,%eax
> > > +	shrl $3,%ecx
> > >  	rep stosq
> > >  	RET
> > > -SYM_FUNC_END(clear_page_rep)
> > > -EXPORT_SYMBOL_GPL(clear_page_rep)
> > > +SYM_FUNC_END(clear_pages_rep)
> > > +EXPORT_SYMBOL_GPL(clear_pages_rep)
> > >  
> > > -SYM_TYPED_FUNC_START(clear_page_orig)
> > > +/*
> > > + * Original page zeroing loop.
> > > + * Input:
> > > + * %rdi	- destination
> > > + * %esi	- length
> > > + *
> > > + * Clobbers: %rax, %rcx, %rflags
> > > + */
> > > +SYM_TYPED_FUNC_START(clear_pages_orig)
> > > +	movl   %esi, %ecx
> > >  	xorl   %eax,%eax
> > > -	movl   $4096/64,%ecx
> > > +	shrl   $6,%ecx
> > 
> > So if the natural input parameter is RCX, why is this function using 
> > RSI as the input 'length' parameter? Causes unnecessary register 
> > shuffling.
> 
> This symbol is written as a C function with C calling convention, even
> though it is only meant to be called from that clear_page() alternative.
>
> If we want to go change all this, then we should go do the same we do
> for __clear_user() and write it thusly:
> 
> 	asm volatile(ALTERNATIVE("rep stosb",
> 				 "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
> 		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
> 		     : "a" (0));
> 
> And forget about all those clear_page_*() thingies.

Yeah.

Thanks,

	Ingo
Ankur Arora April 14, 2025, 7:46 p.m. UTC | #4
Peter Zijlstra <peterz@infradead.org> writes:

> On Mon, Apr 14, 2025 at 08:32:29AM +0200, Ingo Molnar wrote:

[ ... ]

>> > -SYM_TYPED_FUNC_START(clear_page_orig)
>> > +/*
>> > + * Original page zeroing loop.
>> > + * Input:
>> > + * %rdi	- destination
>> > + * %esi	- length
>> > + *
>> > + * Clobbers: %rax, %rcx, %rflags
>> > + */
>> > +SYM_TYPED_FUNC_START(clear_pages_orig)
>> > +	movl   %esi, %ecx
>> >  	xorl   %eax,%eax
>> > -	movl   $4096/64,%ecx
>> > +	shrl   $6,%ecx
>>
>> So if the natural input parameter is RCX, why is this function using
>> RSI as the input 'length' parameter? Causes unnecessary register
>> shuffling.
>
> This symbol is written as a C function with C calling convention, even
> though it is only meant to be called from that clear_page() alternative.
>
> If we want to go change all this, then we should go do the same we do
> for __clear_user() and write it thusly:
>
> 	asm volatile(ALTERNATIVE("rep stosb",
> 				 "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
> 		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
> 		     : "a" (0));
>
> And forget about all those clear_page_*() thingies.

Yeah, this makes sense. We don't call any of the clear_pages_*() variants
from anywhere else.

clear_pages_rep() and clear_pages_erms() are trivial enough to be
inlined in the ALTERNATIVE as well.

Thanks!

--
ankur
Ankur Arora April 14, 2025, 7:52 p.m. UTC | #5
Ingo Molnar <mingo@kernel.org> writes:

> * Ankur Arora <ankur.a.arora@oracle.com> wrote:
>
>> clear_page*() variants now take a page-aligned length parameter and
>> clears the whole region.
>
> Please read your changelogs and fix typos. ;-)

Ack!

>> +void clear_pages_orig(void *page, unsigned int length);
>> +void clear_pages_rep(void *page, unsigned int length);
>> +void clear_pages_erms(void *page, unsigned int length);
>
> What unit is 'length' in? If it's bytes, why is this interface
> artificially limiting itself to ~4GB? On x86-64 there's very little (if

I was in two minds about the unit. Given that the largest page size is
1GB, I decided to go with a 32-bit length. But, as you say below, there's
no reason to limit the x86-64 interface for MM reasons.  Will fix.

> any) performance difference between 32-bit and 64-bit length
> iterations.
>
> Even if we end up only exposing a 32-bit length API to the generic MM
> layer, there's no reason to limit the x86-64 assembly code in such a
> fashion.
>
>>  static inline void clear_page(void *page)
>>  {
>> +	unsigned int length = PAGE_SIZE;
>>  	/*
>> -	 * Clean up KMSAN metadata for the page being cleared. The assembly call
>> +	 * Clean up KMSAN metadata for the pages being cleared. The assembly call
>>  	 * below clobbers @page, so we perform unpoisoning before it.
>
>>  	 */
>> -	kmsan_unpoison_memory(page, PAGE_SIZE);
>> -	alternative_call_2(clear_page_orig,
>> -			   clear_page_rep, X86_FEATURE_REP_GOOD,
>> -			   clear_page_erms, X86_FEATURE_ERMS,
>> +	kmsan_unpoison_memory(page, length);
>> +
>> +	alternative_call_2(clear_pages_orig,
>> +			   clear_pages_rep, X86_FEATURE_REP_GOOD,
>> +			   clear_pages_erms, X86_FEATURE_ERMS,
>>  			   "=D" (page),
>> -			   "D" (page),
>> +			   ASM_INPUT("D" (page), "S" (length)),
>>  			   "cc", "memory", "rax", "rcx");
>>  }
>>
>> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
>> index a508e4a8c66a..bce516263b69 100644
>> --- a/arch/x86/lib/clear_page_64.S
>> +++ b/arch/x86/lib/clear_page_64.S
>> @@ -13,20 +13,35 @@
>>   */
>>
>>  /*
>> - * Zero a page.
>> - * %rdi	- page
>> + * Zero kernel page aligned region.
>> + *
>> + * Input:
>> + * %rdi	- destination
>> + * %esi	- length
>> + *
>> + * Clobbers: %rax, %rcx
>>   */
>> -SYM_TYPED_FUNC_START(clear_page_rep)
>> -	movl $4096/8,%ecx
>> +SYM_TYPED_FUNC_START(clear_pages_rep)
>> +	movl %esi, %ecx
>>  	xorl %eax,%eax
>> +	shrl $3,%ecx
>>  	rep stosq
>>  	RET
>> -SYM_FUNC_END(clear_page_rep)
>> -EXPORT_SYMBOL_GPL(clear_page_rep)
>> +SYM_FUNC_END(clear_pages_rep)
>> +EXPORT_SYMBOL_GPL(clear_pages_rep)
>>
>> -SYM_TYPED_FUNC_START(clear_page_orig)
>> +/*
>> + * Original page zeroing loop.
>> + * Input:
>> + * %rdi	- destination
>> + * %esi	- length
>> + *
>> + * Clobbers: %rax, %rcx, %rflags
>> + */
>> +SYM_TYPED_FUNC_START(clear_pages_orig)
>> +	movl   %esi, %ecx
>>  	xorl   %eax,%eax
>> -	movl   $4096/64,%ecx
>> +	shrl   $6,%ecx
>
> So if the natural input parameter is RCX, why is this function using
> RSI as the input 'length' parameter? Causes unnecessary register
> shuffling.

True. Will fix via ALTERNATIVE as Peter describes in the other mail.
That should get rid of some of the repetition in clear_pages_rep() and
clear_pages_erms() as well.

Though I guess clear_pages_orig() will need to stick around, just with
a different parameter-passing convention.

>> +/*
>> + * Zero kernel page aligned region.
>> + *
>> + * Input:
>> + * %rdi	- destination
>> + * %esi	- length
>> + *
>> + * Clobbers: %rax, %rcx
>> + */
>> +SYM_TYPED_FUNC_START(clear_pages_erms)
>> +	movl %esi, %ecx
>>  	xorl %eax,%eax
>>  	rep stosb
>>  	RET
>
> Same observation: unnecessary register shuffling.
>
> Also, please rename this (now-) terribly named interface:
>
>> +void clear_pages_orig(void *page, unsigned int length);
>> +void clear_pages_rep(void *page, unsigned int length);
>> +void clear_pages_erms(void *page, unsigned int length);
>
> Because the 'pages' is now a bit misleading, and why is the starting
> address called a 'page'?

> So a more sensible namespace would be to follow memset nomenclature:
>
> 	void memzero_page_aligned_*(void *addr, unsigned long len);
>
> ... and note the intentional abbreviation to 'len'.

Ack.

> Also, since most of these changes are to x86 architecture code, this is
> a new interface only used by x86, and the MM glue is minimal, I'd like
> to merge this series via the x86 tree, if the glue gets acks from MM
> folks.

Works for me!


Thanks for the review.

--
ankur
Matthew Wilcox April 14, 2025, 8:09 p.m. UTC | #6
On Mon, Apr 14, 2025 at 12:52:37PM -0700, Ankur Arora wrote:
> Ingo Molnar <mingo@kernel.org> writes:
> >> +void clear_pages_orig(void *page, unsigned int length);
> >> +void clear_pages_rep(void *page, unsigned int length);
> >> +void clear_pages_erms(void *page, unsigned int length);
> >
> > What unit is 'length' in? If it's bytes, why is this interface
> > artificially limiting itself to ~4GB? On x86-64 there's very little (if
> 
> I was in two minds about the unit. Given that the largest page size is
> 1GB, I decided to go with a 32-bit length. But, as you say below, there's
> no reason to limit the x86-64 interface for MM reasons.  Will fix.

Actually, I think there is (and we went through this with SPARC, if you
remember?)  We _shouldn't_ be calling memset() with a large size (ie
larger than 4GB).  If we have that much memory to clear, we should be
doing something smarter, like using padata to get lots of CPUs clearing
individual portions of the page.
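
Purely as an illustration of that shape (a sketch, not code from this
series -- the helper name and the 1GB cap are invented), the caller could
bound each clearing call and hand out chunks from a loop like this:

	/* Hypothetical caller-side chunking of a big zeroing request. */
	static void zero_large_region(void *addr, unsigned long len)
	{
		while (len) {
			unsigned long n = min(len, (unsigned long)SZ_1G);

			memset(addr, 0, n);	/* or one bounded clear_pages() call */
			addr += n;
			len -= n;
			cond_resched();		/* or submit each chunk via padata */
		}
	}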

I don't know how relevant this is now that you're going to be using
ALTERNATIVES.
Mateusz Guzik April 14, 2025, 10:26 p.m. UTC | #7
On Mon, Apr 14, 2025 at 01:02:59PM +0200, Peter Zijlstra wrote:
> This symbol is written as a C function with C calling convention, even
> though it is only meant to be called from that clear_page() alternative.
> 
> If we want to go change all this, then we should go do the same we do
> for __clear_user() and write it thusly:
> 
> 	asm volatile(ALTERNATIVE("rep stosb",
> 				 "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
> 		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
> 		     : "a" (0));
> 
> And forget about all those clear_page_*() thingies.
> 

I have to disagree.

Next to nobody has FSRS, so for now one would have to expect everyone
would be punting to the routine. Did you mean ERMS as sizes are in fact
not short?

rep_stos_alternative() as implemented right now sucks in its own right
("small" areas sorted out with 8-byte and 1-byte loops, bigger ones with an
unrolled 64-byte-at-a-time loop, no rep stos{b,q} in sight). Someone(tm)
should fix it, and for the sake of argument suppose it happened. That's
still some code executed to figure out how to zero and to align the buf.
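
A rough C paraphrase of that structure (illustrative only -- the real
routine is hand-written assembly in arch/x86/lib/clear_page_64.S):

	/* Illustrative paraphrase, not the actual code: unrolled 64-byte
	 * blocks for larger areas, then an 8-byte loop and a 1-byte tail,
	 * with no rep stos{b,q} anywhere. */
	static void rep_stos_alternative_sketch(unsigned char *to, unsigned long n)
	{
		while (n >= 64) {			/* unrolled 64-byte block */
			((u64 *)to)[0] = 0; ((u64 *)to)[1] = 0;
			((u64 *)to)[2] = 0; ((u64 *)to)[3] = 0;
			((u64 *)to)[4] = 0; ((u64 *)to)[5] = 0;
			((u64 *)to)[6] = 0; ((u64 *)to)[7] = 0;
			to += 64;
			n -= 64;
		}
		while (n >= 8) {			/* 8-byte loop */
			*(u64 *)to = 0;
			to += 8;
			n -= 8;
		}
		while (n--)				/* 1-byte tail */
			*to++ = 0;
	}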

Instead, I think one can start with just retiring clear_page_orig().

With that sucker out of the way, an optional quest is to figure out if
rep stosq vs rep stosb makes any difference for pages -- for all I know
rep stosq is the way. This would require testing on quite a few uarchs
and I'm not going to blame anyone for not being interested.

Let's say nobody bothered OR rep stosb provides a win. In that case this
can trivially ALTERNATIVE between rep stosb and rep stosq based on ERMS,
no func calls necessary.
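
In code, that end state could be as small as the following sketch (given
addr as the page-aligned destination, and assuming REP_GOOD is treated as
universal and clear_pages_orig() is gone, so no call and hence no
ASM_CALL_CONSTRAINT is needed):

	unsigned long size = PAGE_SIZE;

	/* Default to rep stosq; ERMS machines get rep stosb patched in. */
	asm volatile(ALTERNATIVE("shrq $3,%%rcx; rep stosq",
				 "rep stosb", X86_FEATURE_ERMS)
		     : "+c" (size), "+D" (addr)
		     : "a" (0)
		     : "memory");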
Ankur Arora April 15, 2025, 6:14 a.m. UTC | #8
Mateusz Guzik <mjguzik@gmail.com> writes:

> On Mon, Apr 14, 2025 at 01:02:59PM +0200, Peter Zijlstra wrote:
>> This symbol is written as a C function with C calling convention, even
>> though it is only meant to be called from that clear_page() alternative.
>>
>> If we want to go change all this, then we should go do the same we do
>> for __clear_user() and write it thusly:
>>
>> 	asm volatile(ALTERNATIVE("rep stosb",
>> 				 "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
>> 		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
>> 		     : "a" (0));
>>
>> And forget about all those clear_page_*() thingies.
>>
>
> I have to disagree.
>
> Next to nobody has FSRS, so for now one would have to expect everyone
> would be punting to the routine. Did you mean ERMS as sizes are in fact
> not short?
>
> rep_stos_alternative() as implemented right now sucks in its own right
> ("small" areas sorted out with 8-byte and 1-byte loops, bigger ones with an
> unrolled 64-byte-at-a-time loop, no rep stos{b,q} in sight). Someone(tm)
> should fix it, and for the sake of argument suppose it happened. That's
> still some code executed to figure out how to zero and to align the buf.
>
> Instead, I think one can start with just retiring clear_page_orig().
>
> With that sucker out of the way, an optional quest is to figure out if
> rep stosq vs rep stosb makes any difference for pages -- for all I know
> rep stosq is the way. This would require testing on quite a few uarchs
> and I'm not going to blame anyone for not being interested.

IIRC some recent AMD models (Rome?) did expose REP_GOOD but not ERMS.

> Let's say nobody bothered OR rep stosb provides a win. In that case this
> can trivially ALTERNATIVE between rep stosb and rep stosq based on ERMS,
> no func calls necessary.

We shouldn't need any function calls for ERMS and REP_GOOD.

I think something like this untested code should work:

 	asm volatile(ALTERNATIVE_2("call clear_pages_orig",
				   "shrl $3,%%ecx; rep stosq", X86_FEATURE_REP_GOOD,
				   "rep stosb", X86_FEATURE_ERMS)
		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
		     : "a" (0));

--
ankur
Mateusz Guzik April 15, 2025, 8:22 a.m. UTC | #9
On Tue, Apr 15, 2025 at 8:14 AM Ankur Arora <ankur.a.arora@oracle.com> wrote:
>
>
> Mateusz Guzik <mjguzik@gmail.com> writes:
> > With that sucker out of the way, an optional quest is to figure out if
> > rep stosq vs rep stosb makes any difference for pages -- for all I know
> > rep stosq is the way. This would require testing on quite a few uarchs
> > and I'm not going to blame anyone for not being interested.
>
> IIRC some recent AMD models (Rome?) did expose REP_GOOD but not ERMS.
>

The uarch does not have it or the bit magically fails to show up?
Worst case, should rep stosb be faster on that uarch, the kernel can
pretend the bit is set.

> > Let's say nobody bothered OR rep stosb provides a win. In that case this
> > can trivially ALTERNATIVE between rep stosb and rep stosq based on ERMS,
> > no func calls necessary.
>
> We shouldn't need any function calls for ERMS and REP_GOOD.
>
> I think something like this untested code should work:
>
> 	asm volatile(ALTERNATIVE_2("call clear_pages_orig",
> 				   "shrl $3,%%ecx; rep stosq", X86_FEATURE_REP_GOOD,
> 				   "rep stosb", X86_FEATURE_ERMS)
> 		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
> 		     : "a" (0));
>

That's what I'm suggesting, with one difference: whack
clear_pages_orig altogether.
Ankur Arora April 15, 2025, 8:01 p.m. UTC | #10
Mateusz Guzik <mjguzik@gmail.com> writes:

> On Tue, Apr 15, 2025 at 8:14 AM Ankur Arora <ankur.a.arora@oracle.com> wrote:
>>
>>
>> Mateusz Guzik <mjguzik@gmail.com> writes:
>> > With that sucker out of the way, an optional quest is to figure out if
>> > rep stosq vs rep stosb makes any difference for pages -- for all I know
>> > rep stosq is the way. This would require testing on quite a few uarchs
>> > and I'm not going to blame anyone for not being interested.
>>
>> IIRC some recent AMD models (Rome?) did expose REP_GOOD but not ERMS.
>>
>
> The uarch does not have it or the bit magically fails to show up?
> Worst case, should rep stosb be faster on that uarch, the kernel can
> pretend the bit is set.

It's a synthetic bit, so the uarch has both. I think REP STOSB is optimized
post-FSRS (AIUI Zen3):

        if (c->x86 >= 0x10)
                set_cpu_cap(c, X86_FEATURE_REP_GOOD);

        /* AMD FSRM also implies FSRS */
        if (cpu_has(c, X86_FEATURE_FSRM))
                set_cpu_cap(c, X86_FEATURE_FSRS);


>> > Let's say nobody bothered OR rep stosb provides a win. In that case this
>> > can trivially ALTERNATIVE between rep stosb and rep stosq based on ERMS,
>> > no func calls necessary.
>>
>> We shouldn't need any function calls for ERMS and REP_GOOD.
>>
>> I think something like this untested code should work:
>>
>> 	asm volatile(ALTERNATIVE_2("call clear_pages_orig",
>> 				   "shrl $3,%%ecx; rep stosq", X86_FEATURE_REP_GOOD,
>> 				   "rep stosb", X86_FEATURE_ERMS)
>> 		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
>> 		     : "a" (0));
>>
>
> That's what I'm suggesting, with one difference: whack
> clear_pages_orig altogether.

What do we gain by getting rid of it? Maybe there's old hardware with
unoptimized rep; stos*.

--
ankur
Mateusz Guzik April 15, 2025, 8:32 p.m. UTC | #11
On Tue, Apr 15, 2025 at 10:02 PM Ankur Arora <ankur.a.arora@oracle.com> wrote:
>
>
> Mateusz Guzik <mjguzik@gmail.com> writes:
>
> > On Tue, Apr 15, 2025 at 8:14 AM Ankur Arora <ankur.a.arora@oracle.com> wrote:
> >>
> >>
> >> Mateusz Guzik <mjguzik@gmail.com> writes:
> >> > With that sucker out of the way, an optional quest is to figure out if
> >> > rep stosq vs rep stosb makes any difference for pages -- for all I know
> >> > rep stosq is the way. This would require testing on quite a few uarchs
> >> > and I'm not going to blame anyone for not being interested.
> >>
> >> IIRC some recent AMD models (Rome?) did expose REP_GOOD but not ERMS.
> >>
> >
> > The uarch does not have it or the bit magically fails to show up?
> > Worst case, should rep stosb be faster on that uarch, the kernel can
> > pretend the bit is set.
>
> It's a synthetic bit, so the uarch has both. I think REP STOSB is optimized
> post-FSRS (AIUI Zen3):
>
>         if (c->x86 >= 0x10)
>                 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
>
>         /* AMD FSRM also implies FSRS */
>         if (cpu_has(c, X86_FEATURE_FSRM))
>                 set_cpu_cap(c, X86_FEATURE_FSRS);
>
>
> >> > Let's say nobody bothered OR rep stosb provides a win. In that case this
> >> > can trivially ALTERNATIVE between rep stosb and rep stosq based on ERMS,
> >> > no func calls necessary.
> >>
> >> We shouldn't need any function calls for ERMS and REP_GOOD.
> >>
> >> I think something like this untested code should work:
> >>
> >> 	asm volatile(ALTERNATIVE_2("call clear_pages_orig",
> >> 				   "shrl $3,%%ecx; rep stosq", X86_FEATURE_REP_GOOD,
> >> 				   "rep stosb", X86_FEATURE_ERMS)
> >> 		     : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
> >> 		     : "a" (0));
> >>
> >
> > That's what I'm suggesting, with one difference: whack
> > clear_pages_orig altogether.
>
> What do we gain by getting rid of it? Maybe there's old hardware with
> unoptimized rep; stos*.
>

The string routines (memset, memcpy et al.) need a lot of love, and
preferably nobody should have to spend time placating non-rep users
while sorting them out.

According to Wikipedia, AMD CPUs started getting REP_GOOD in 2007, meaning
you would need something even older than that to not have it. Intel is
presumably in a similar boat.

As it happens, gcc spent several years emitting inlined rep stosq and rep
movsq, so either users don't care or there are no users (well, realistically
someone somewhere has a machine like that in the garage, but fringe cases
are not an argument).

rep_movs_alternative has already been punting to rep mov, ignoring the
issue of REP_GOOD, for some time now (admittedly, I removed the non-rep
support :P) and again there have been no pitchforks (that I have seen).

So I think it would be best for everyone in the long run to completely
rip out the REP_GOOD thing. For all I know, the kernel stopped booting
on machines with such uarchs a long time ago for unrelated reasons.

As far as this specific patchset goes, keeping it around is just a waste
of testing to make sure it still works, but I can't *insist* on removing
the routine. I guess it is the x86 maintainers' call whether to whack this.
Ankur Arora April 15, 2025, 9:59 p.m. UTC | #12
Matthew Wilcox <willy@infradead.org> writes:

> On Mon, Apr 14, 2025 at 12:52:37PM -0700, Ankur Arora wrote:
>> Ingo Molnar <mingo@kernel.org> writes:
>> >> +void clear_pages_orig(void *page, unsigned int length);
>> >> +void clear_pages_rep(void *page, unsigned int length);
>> >> +void clear_pages_erms(void *page, unsigned int length);
>> >
>> > What unit is 'length' in? If it's bytes, why is this interface
>> > artificially limiting itself to ~4GB? On x86-64 there's very little (if
>>
>> I was in two minds about the unit. Given that the largest page size is
>> 1GB, I decided to go with a 32-bit length. But, as you say below, there's
>> no reason to limit the x86-64 interface for MM reasons.  Will fix.
>
> Actually, I think there is (and we went through this with SPARC, if you
> remember?)

My google-fu is failing me. I don't think it was this thread:
 https://lore.kernel.org/lkml/1490310113-824438-1-git-send-email-pasha.tatashin@oracle.com/

> We _shouldn't_ be calling memset() with a large size (ie
> larger than 4GB).  If we have that much memory to clear, we should be
> doing something smarter, like using padata to get lots of CPUs clearing
> individual portions of the page.

Agreed. Or even offloading to an accelerator so as not to waste CPU time.

That said, whether to invoke clear_pages() on regions larger than 4GB seems
like an MM policy question. Not sure it makes sense to limit the low-level
interface.

> I don't know how relevant this is now that you're going to be using
> ALTERNATIVES.


--
ankur

Patch

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index d3aab6f4e59a..45db74db9021 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,22 +40,24 @@  extern unsigned long __phys_addr_symbol(unsigned long);
 
 #define __phys_reloc_hide(x)	(x)
 
-void clear_page_orig(void *page);
-void clear_page_rep(void *page);
-void clear_page_erms(void *page);
+void clear_pages_orig(void *page, unsigned int length);
+void clear_pages_rep(void *page, unsigned int length);
+void clear_pages_erms(void *page, unsigned int length);
 
 static inline void clear_page(void *page)
 {
+	unsigned int length = PAGE_SIZE;
 	/*
-	 * Clean up KMSAN metadata for the page being cleared. The assembly call
+	 * Clean up KMSAN metadata for the pages being cleared. The assembly call
 	 * below clobbers @page, so we perform unpoisoning before it.
 	 */
-	kmsan_unpoison_memory(page, PAGE_SIZE);
-	alternative_call_2(clear_page_orig,
-			   clear_page_rep, X86_FEATURE_REP_GOOD,
-			   clear_page_erms, X86_FEATURE_ERMS,
+	kmsan_unpoison_memory(page, length);
+
+	alternative_call_2(clear_pages_orig,
+			   clear_pages_rep, X86_FEATURE_REP_GOOD,
+			   clear_pages_erms, X86_FEATURE_ERMS,
 			   "=D" (page),
-			   "D" (page),
+			   ASM_INPUT("D" (page), "S" (length)),
 			   "cc", "memory", "rax", "rcx");
 }
 
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index a508e4a8c66a..bce516263b69 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -13,20 +13,35 @@ 
  */
 
 /*
- * Zero a page.
- * %rdi	- page
+ * Zero kernel page aligned region.
+ *
+ * Input:
+ * %rdi	- destination
+ * %esi	- length
+ *
+ * Clobbers: %rax, %rcx
  */
-SYM_TYPED_FUNC_START(clear_page_rep)
-	movl $4096/8,%ecx
+SYM_TYPED_FUNC_START(clear_pages_rep)
+	movl %esi, %ecx
 	xorl %eax,%eax
+	shrl $3,%ecx
 	rep stosq
 	RET
-SYM_FUNC_END(clear_page_rep)
-EXPORT_SYMBOL_GPL(clear_page_rep)
+SYM_FUNC_END(clear_pages_rep)
+EXPORT_SYMBOL_GPL(clear_pages_rep)
 
-SYM_TYPED_FUNC_START(clear_page_orig)
+/*
+ * Original page zeroing loop.
+ * Input:
+ * %rdi	- destination
+ * %esi	- length
+ *
+ * Clobbers: %rax, %rcx, %rflags
+ */
+SYM_TYPED_FUNC_START(clear_pages_orig)
+	movl   %esi, %ecx
 	xorl   %eax,%eax
-	movl   $4096/64,%ecx
+	shrl   $6,%ecx
 	.p2align 4
 .Lloop:
 	decl	%ecx
@@ -43,16 +58,25 @@  SYM_TYPED_FUNC_START(clear_page_orig)
 	jnz	.Lloop
 	nop
 	RET
-SYM_FUNC_END(clear_page_orig)
-EXPORT_SYMBOL_GPL(clear_page_orig)
+SYM_FUNC_END(clear_pages_orig)
+EXPORT_SYMBOL_GPL(clear_pages_orig)
 
-SYM_TYPED_FUNC_START(clear_page_erms)
-	movl $4096,%ecx
+/*
+ * Zero kernel page aligned region.
+ *
+ * Input:
+ * %rdi	- destination
+ * %esi	- length
+ *
+ * Clobbers: %rax, %rcx
+ */
+SYM_TYPED_FUNC_START(clear_pages_erms)
+	movl %esi, %ecx
 	xorl %eax,%eax
 	rep stosb
 	RET
-SYM_FUNC_END(clear_page_erms)
-EXPORT_SYMBOL_GPL(clear_page_erms)
+SYM_FUNC_END(clear_pages_erms)
+EXPORT_SYMBOL_GPL(clear_pages_erms)
 
 /*
  * Default clear user-space.