diff mbox series

[v2,07/12] x86: add new features for paravirt patching

Message ID 20201120114630.13552-8-jgross@suse.com (mailing list archive)
State Superseded
Headers show
Series x86: major paravirt cleanup | expand

Commit Message

Jürgen Groß Nov. 20, 2020, 11:46 a.m. UTC
In order to be able to switch paravirt patching from special-cased custom
code sequences to ALTERNATIVE handling, some new X86_FEATURE_* flags are
needed. These make it possible to have the standard indirect pv call as
the default code and to patch that with the non-Xen custom code sequence
via ALTERNATIVE patching later.

Make sure paravirt patching is performed before alternative patching.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/include/asm/cpufeatures.h |  3 +++
 arch/x86/kernel/alternative.c      | 28 ++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

Comments

Borislav Petkov Dec. 8, 2020, 6:43 p.m. UTC | #1
On Fri, Nov 20, 2020 at 12:46:25PM +0100, Juergen Gross wrote:
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index dad350d42ecf..ffa23c655412 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -237,6 +237,9 @@
>  #define X86_FEATURE_VMCALL		( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
>  #define X86_FEATURE_VMW_VMMCALL		( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
>  #define X86_FEATURE_SEV_ES		( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
> +#define X86_FEATURE_NOT_XENPV		( 8*32+21) /* "" Inverse of X86_FEATURE_XENPV */
> +#define X86_FEATURE_NO_PVUNLOCK		( 8*32+22) /* "" No PV unlock function */
> +#define X86_FEATURE_NO_VCPUPREEMPT	( 8*32+23) /* "" No PV vcpu_is_preempted function */

Ew, negative features. ;-\

/me goes forward and looks at usage sites:

+	ALTERNATIVE_2 "jmp *paravirt_iret(%rip);",			\
+		      "jmp native_iret;", X86_FEATURE_NOT_XENPV,	\
+		      "jmp xen_iret;", X86_FEATURE_XENPV

Can we make that:

	ALTERNATIVE_TERNARY "jmp *paravirt_iret(%rip);",
		      "jmp xen_iret;", X86_FEATURE_XENPV,
		      "jmp native_iret;", X86_FEATURE_XENPV,

where the last two lines are supposed to mean

			    X86_FEATURE_XENPV ? "jmp xen_iret;" : "jmp native_iret;"

Now, in order to convey that logic to apply_alternatives(), you can do:

struct alt_instr {
        s32 instr_offset;       /* original instruction */
        s32 repl_offset;        /* offset to replacement instruction */
        u16 cpuid;              /* cpuid bit set for replacement */
        u8  instrlen;           /* length of original instruction */
        u8  replacementlen;     /* length of new instruction */
        u8  padlen;             /* length of build-time padding */
	u8  flags;		/* patching flags */ 			<--- THIS
} __packed;

and yes, we have had the flags thing in a lot of WIP diffs over the
years but we've never come to actually needing it.

Anyway, then, apply_alternatives() will do:

	if (flags & ALT_NOT_FEATURE)

or something like that - I'm bad at naming stuff - then it should patch
only when the feature is NOT set and vice versa.

There in that

	if (!boot_cpu_has(a->cpuid)) {

branch.

Hmm?

>  /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
>  #define X86_FEATURE_FSGSBASE		( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 2400ad62f330..f8f9700719cf 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -593,6 +593,18 @@ int alternatives_text_reserved(void *start, void *end)
>  #endif /* CONFIG_SMP */
>  
>  #ifdef CONFIG_PARAVIRT
> +static void __init paravirt_set_cap(void)
> +{
> +	if (!boot_cpu_has(X86_FEATURE_XENPV))
> +		setup_force_cpu_cap(X86_FEATURE_NOT_XENPV);
> +
> +	if (pv_is_native_spin_unlock())
> +		setup_force_cpu_cap(X86_FEATURE_NO_PVUNLOCK);
> +
> +	if (pv_is_native_vcpu_is_preempted())
> +		setup_force_cpu_cap(X86_FEATURE_NO_VCPUPREEMPT);
> +}
> +
>  void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
>  				     struct paravirt_patch_site *end)
>  {
> @@ -616,6 +628,8 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
>  }
>  extern struct paravirt_patch_site __start_parainstructions[],
>  	__stop_parainstructions[];
> +#else
> +static void __init paravirt_set_cap(void) { }
>  #endif	/* CONFIG_PARAVIRT */
>  
>  /*
> @@ -723,6 +737,18 @@ void __init alternative_instructions(void)
>  	 * patching.
>  	 */
>  
> +	paravirt_set_cap();

Can that be called from somewhere in the Xen init path and not from
here? Somewhere before check_bugs() gets called.

> +	/*
> +	 * First patch paravirt functions, such that we overwrite the indirect
> +	 * call with the direct call.
> +	 */
> +	apply_paravirt(__parainstructions, __parainstructions_end);
> +
> +	/*
> +	 * Then patch alternatives, such that those paravirt calls that are in
> +	 * alternatives can be overwritten by their immediate fragments.
> +	 */
>  	apply_alternatives(__alt_instructions, __alt_instructions_end);

Can you give an example here pls why the paravirt patching needs to run
first?

Thx.
Jürgen Groß Dec. 9, 2020, 7:30 a.m. UTC | #2
On 08.12.20 19:43, Borislav Petkov wrote:
> On Fri, Nov 20, 2020 at 12:46:25PM +0100, Juergen Gross wrote:
>> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
>> index dad350d42ecf..ffa23c655412 100644
>> --- a/arch/x86/include/asm/cpufeatures.h
>> +++ b/arch/x86/include/asm/cpufeatures.h
>> @@ -237,6 +237,9 @@
>>   #define X86_FEATURE_VMCALL		( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
>>   #define X86_FEATURE_VMW_VMMCALL		( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
>>   #define X86_FEATURE_SEV_ES		( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
>> +#define X86_FEATURE_NOT_XENPV		( 8*32+21) /* "" Inverse of X86_FEATURE_XENPV */
>> +#define X86_FEATURE_NO_PVUNLOCK		( 8*32+22) /* "" No PV unlock function */
>> +#define X86_FEATURE_NO_VCPUPREEMPT	( 8*32+23) /* "" No PV vcpu_is_preempted function */
> 
> Ew, negative features. ;-\

Hey, I already suggested to use ~FEATURE for that purpose (see
https://lore.kernel.org/lkml/f105a63d-6b51-3afb-83e0-e899ea40813e@suse.com/ 
).

> 
> /me goes forward and looks at usage sites:
> 
> +	ALTERNATIVE_2 "jmp *paravirt_iret(%rip);",			\
> +		      "jmp native_iret;", X86_FEATURE_NOT_XENPV,	\
> +		      "jmp xen_iret;", X86_FEATURE_XENPV
> 
> Can we make that:
> 
> 	ALTERNATIVE_TERNARY "jmp *paravirt_iret(%rip);",
> 		      "jmp xen_iret;", X86_FEATURE_XENPV,
> 		      "jmp native_iret;", X86_FEATURE_XENPV,

Would we really want to specify the feature twice?

I'd rather make the syntax:

ALTERNATIVE_TERNARY <initial-code> <feature> <code-for-feature-set>
                                              <code-for-feature-unset>

as this ...

> 
> where the last two lines are supposed to mean
> 
> 			    X86_FEATURE_XENPV ? "jmp xen_iret;" : "jmp native_iret;"

... would match perfectly to this interpretation.

> 
> Now, in order to convey that logic to apply_alternatives(), you can do:
> 
> struct alt_instr {
>          s32 instr_offset;       /* original instruction */
>          s32 repl_offset;        /* offset to replacement instruction */
>          u16 cpuid;              /* cpuid bit set for replacement */
>          u8  instrlen;           /* length of original instruction */
>          u8  replacementlen;     /* length of new instruction */
>          u8  padlen;             /* length of build-time padding */
> 	u8  flags;		/* patching flags */ 			<--- THIS
> } __packed;

Hmm, using flags is an alternative (pun intended :-) ).

> 
> and yes, we have had the flags thing in a lot of WIP diffs over the
> years but we've never come to actually needing it.
> 
> Anyway, then, apply_alternatives() will do:
> 
> 	if (flags & ALT_NOT_FEATURE)
> 
> or something like that - I'm bad at naming stuff - then it should patch
> only when the feature is NOT set and vice versa.
> 
> There in that
> 
> 	if (!boot_cpu_has(a->cpuid)) {
> 
> branch.
> 
> Hmm?

Fine with me (I'd prefer my ALTERNATIVE_TERNARY syntax, though).

> 
>>   /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
>>   #define X86_FEATURE_FSGSBASE		( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>> index 2400ad62f330..f8f9700719cf 100644
>> --- a/arch/x86/kernel/alternative.c
>> +++ b/arch/x86/kernel/alternative.c
>> @@ -593,6 +593,18 @@ int alternatives_text_reserved(void *start, void *end)
>>   #endif /* CONFIG_SMP */
>>   
>>   #ifdef CONFIG_PARAVIRT
>> +static void __init paravirt_set_cap(void)
>> +{
>> +	if (!boot_cpu_has(X86_FEATURE_XENPV))
>> +		setup_force_cpu_cap(X86_FEATURE_NOT_XENPV);
>> +
>> +	if (pv_is_native_spin_unlock())
>> +		setup_force_cpu_cap(X86_FEATURE_NO_PVUNLOCK);
>> +
>> +	if (pv_is_native_vcpu_is_preempted())
>> +		setup_force_cpu_cap(X86_FEATURE_NO_VCPUPREEMPT);
>> +}
>> +
>>   void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
>>   				     struct paravirt_patch_site *end)
>>   {
>> @@ -616,6 +628,8 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
>>   }
>>   extern struct paravirt_patch_site __start_parainstructions[],
>>   	__stop_parainstructions[];
>> +#else
>> +static void __init paravirt_set_cap(void) { }
>>   #endif	/* CONFIG_PARAVIRT */
>>   
>>   /*
>> @@ -723,6 +737,18 @@ void __init alternative_instructions(void)
>>   	 * patching.
>>   	 */
>>   
>> +	paravirt_set_cap();
> 
> Can that be called from somewhere in the Xen init path and not from
> here? Somewhere before check_bugs() gets called.

No, this is needed for non-Xen cases, too (especially pv spinlocks).

> 
>> +	/*
>> +	 * First patch paravirt functions, such that we overwrite the indirect
>> +	 * call with the direct call.
>> +	 */
>> +	apply_paravirt(__parainstructions, __parainstructions_end);
>> +
>> +	/*
>> +	 * Then patch alternatives, such that those paravirt calls that are in
>> +	 * alternatives can be overwritten by their immediate fragments.
>> +	 */
>>   	apply_alternatives(__alt_instructions, __alt_instructions_end);
> 
> Can you give an example here pls why the paravirt patching needs to run
> first?

Okay.


Juergen
Borislav Petkov Dec. 9, 2020, 12:03 p.m. UTC | #3
On Wed, Dec 09, 2020 at 08:30:53AM +0100, Jürgen Groß wrote:
> Hey, I already suggested to use ~FEATURE for that purpose (see
> https://lore.kernel.org/lkml/f105a63d-6b51-3afb-83e0-e899ea40813e@suse.com/

Great minds think alike!

:-P

> I'd rather make the syntax:
> 
> ALTERNATIVE_TERNARY <initial-code> <feature> <code-for-feature-set>
>                                              <code-for-feature-unset>
> 
> as this ...

Sure, that is ok too.

> ... would match perfectly to this interpretation.

Yap.

> Hmm, using flags is an alternative (pun intended :-) ).

LOL!

Btw, pls do check how much the vmlinux size of an allyesconfig grows
with this as we will be adding a byte per patch site. Not that it would
matter too much - the flags are a long way a comin'. :-)

> No, this is needed for non-Xen cases, too (especially pv spinlocks).

I see.

> > Can you give an example here pls why the paravirt patching needs to run
> > first?
> 
> Okay.

I meant an example for me to have a look at. :)

If possible pls.

Thx.
Jürgen Groß Dec. 9, 2020, 12:22 p.m. UTC | #4
On 09.12.20 13:03, Borislav Petkov wrote:
> On Wed, Dec 09, 2020 at 08:30:53AM +0100, Jürgen Groß wrote:
>> Hey, I already suggested to use ~FEATURE for that purpose (see
>> https://lore.kernel.org/lkml/f105a63d-6b51-3afb-83e0-e899ea40813e@suse.com/
> 
> Great minds think alike!
> 
> :-P
> 
>> I'd rather make the syntax:
>>
>> ALTERNATIVE_TERNARY <initial-code> <feature> <code-for-feature-set>
>>                                               <code-for-feature-unset>
>>
>> as this ...
> 
> Sure, that is ok too.
> 
>> ... would match perfectly to this interpretation.
> 
> Yap.
> 
>> Hmm, using flags is an alternative (pun intended :-) ).
> 
> LOL!
> 
> Btw, pls do check how much the vmlinux size of an allyesconfig grows
> with this as we will be adding a byte per patch site. Not that it would
> matter too much - the flags are a long way a comin'. :-)
> 
>> No, this is needed for non-Xen cases, too (especially pv spinlocks).
> 
> I see.
> 
>>> Can you give an example here pls why the paravirt patching needs to run
>>> first?
>>
>> Okay.
> 
> I meant an example for me to have a look at. :)
> 
> If possible pls.

Ah, okay.

Lets take the spin_unlock() case. With patch 11 of the series this is

PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock,
                   "movb $0, (%%" _ASM_ARG1 ");",
                   X86_FEATURE_NO_PVUNLOCK);

which boils down to ALTERNATIVE "call *lock.queued_spin_unlock"
                                 "movb $0,(%rdi)" X86_FEATURE_NO_PVUNLOCK

The initial (paravirt) code is an indirect call in order to allow
spin_unlock() before paravirt/alternative patching takes place.

Paravirt patching will then replace the indirect call with a direct call
to the correct unlock function. Then alternative patching might replace
the direct call to the bare metal unlock with a plain "movb $0,(%rdi)"
in case pvlocks are not enabled.

In case alternative patching would occur first, the indirect call might
be replaced with the "movb ...", and then paravirt patching would
clobber that with the direct call, resulting in the bare metal
optimization being removed again.


Juergen
Borislav Petkov Dec. 10, 2020, 5:58 p.m. UTC | #5
On Wed, Dec 09, 2020 at 01:22:24PM +0100, Jürgen Groß wrote:
> Lets take the spin_unlock() case. With patch 11 of the series this is
> 
> PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock,
>                   "movb $0, (%%" _ASM_ARG1 ");",
>                   X86_FEATURE_NO_PVUNLOCK);
> 
> which boils down to ALTERNATIVE "call *lock.queued_spin_unlock"
>                                 "movb $0,(%rdi)" X86_FEATURE_NO_PVUNLOCK
> 
> The initial (paravirt) code is an indirect call in order to allow
> spin_unlock() before paravirt/alternative patching takes place.
> 
> Paravirt patching will then replace the indirect call with a direct call
> to the correct unlock function. Then alternative patching might replace
> the direct call to the bare metal unlock with a plain "movb $0,(%rdi)"
> in case pvlocks are not enabled.

Aha, that zeros the locking var on unlock, I see.

> In case alternative patching would occur first, the indirect call might
> be replaced with the "movb ...", and then paravirt patching would
> clobber that with the direct call, resulting in the bare metal
> optimization being removed again.

Yeah, that explains the whole situation much better - thanks - and
considering how complex the whole patching is, I wouldn't mind the gist
of it as text in alternative_instructions() or in a comment above it so
that we don't have to swap everything back in, months and years from
now, when we optimize it yet again. :-}

Thx.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index dad350d42ecf..ffa23c655412 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -237,6 +237,9 @@ 
 #define X86_FEATURE_VMCALL		( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
 #define X86_FEATURE_VMW_VMMCALL		( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
 #define X86_FEATURE_SEV_ES		( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_NOT_XENPV		( 8*32+21) /* "" Inverse of X86_FEATURE_XENPV */
+#define X86_FEATURE_NO_PVUNLOCK		( 8*32+22) /* "" No PV unlock function */
+#define X86_FEATURE_NO_VCPUPREEMPT	( 8*32+23) /* "" No PV vcpu_is_preempted function */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
 #define X86_FEATURE_FSGSBASE		( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2400ad62f330..f8f9700719cf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -593,6 +593,18 @@  int alternatives_text_reserved(void *start, void *end)
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
+static void __init paravirt_set_cap(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_XENPV))
+		setup_force_cpu_cap(X86_FEATURE_NOT_XENPV);
+
+	if (pv_is_native_spin_unlock())
+		setup_force_cpu_cap(X86_FEATURE_NO_PVUNLOCK);
+
+	if (pv_is_native_vcpu_is_preempted())
+		setup_force_cpu_cap(X86_FEATURE_NO_VCPUPREEMPT);
+}
+
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
 				     struct paravirt_patch_site *end)
 {
@@ -616,6 +628,8 @@  void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
 }
 extern struct paravirt_patch_site __start_parainstructions[],
 	__stop_parainstructions[];
+#else
+static void __init paravirt_set_cap(void) { }
 #endif	/* CONFIG_PARAVIRT */
 
 /*
@@ -723,6 +737,18 @@  void __init alternative_instructions(void)
 	 * patching.
 	 */
 
+	paravirt_set_cap();
+
+	/*
+	 * First patch paravirt functions, such that we overwrite the indirect
+	 * call with the direct call.
+	 */
+	apply_paravirt(__parainstructions, __parainstructions_end);
+
+	/*
+	 * Then patch alternatives, such that those paravirt calls that are in
+	 * alternatives can be overwritten by their immediate fragments.
+	 */
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
 #ifdef CONFIG_SMP
@@ -741,8 +767,6 @@  void __init alternative_instructions(void)
 	}
 #endif
 
-	apply_paravirt(__parainstructions, __parainstructions_end);
-
 	restart_nmi();
 	alternatives_patched = 1;
 }