diff mbox series

[6/6] x86/boot: Expose MSR_ARCH_CAPS data in guest max policies

Message ID 20230515144259.1009245-7-andrew.cooper3@citrix.com (mailing list archive)
State Superseded
Headers show
Series x86: Introduce MSR_ARCH_CAPS into featuresets | expand

Commit Message

Andrew Cooper May 15, 2023, 2:42 p.m. UTC
We already have common and default feature adjustment helpers.  Introduce one
for max featuresets too.

Offer MSR_ARCH_CAPS unconditionally in the max policy, and stop clobbering the
data inherited from the Host policy.  This will be necessary level a VM safely
for migration.  Note: ARCH_CAPS is still max-only for now, so will not be
inhereted by the default policies.

With this done, the special case for dom0 can be shrunk to just resampling the
Host policy (as ARCH_CAPS isn't visible by default yet).

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
---
 xen/arch/x86/cpu-policy.c | 42 ++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 18 deletions(-)

Comments

Jan Beulich May 16, 2023, 1:06 p.m. UTC | #1
On 15.05.2023 16:42, Andrew Cooper wrote:
> --- a/xen/arch/x86/cpu-policy.c
> +++ b/xen/arch/x86/cpu-policy.c
> @@ -408,6 +408,25 @@ static void __init calculate_host_policy(void)
>      p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
>  }
>  
> +static void __init guest_common_max_feature_adjustments(uint32_t *fs)
> +{
> +    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
> +    {
> +        /*
> +         * MSR_ARCH_CAPS is just feature data, and we can offer it to guests
> +         * unconditionally, although limit it to Intel systems as it is highly
> +         * uarch-specific.
> +         *
> +         * In particular, the RSBA and RRSBA bits mean "you might migrate to a
> +         * system where RSB underflow uses alternative predictors (a.k.a
> +         * Retpoline not safe)", so these need to be visible to a guest in all
> +         * cases, even when it's only some other server in the pool which
> +         * suffers the identified behaviour.
> +         */
> +        __set_bit(X86_FEATURE_ARCH_CAPS, fs);
> +    }
> +}

The comment reads as if it wasn't applying to "max" only, but rather to
"default". Reading this I'm therefore now (and perhaps even more so in
the future, when coming across it) wondering whether it's misplaced, or
and hence whether the commented code is also misplaced and/or wrong.

Further is even just non-default exposure of all the various bits okay
to other than Dom0? IOW is there indeed no further adjustment necessary
to guest_rdmsr()?

> @@ -828,7 +845,10 @@ void __init init_dom0_cpuid_policy(struct domain *d)
>       * domain policy logic gains a better understanding of MSRs.
>       */
>      if ( is_hardware_domain(d) && cpu_has_arch_caps )
> +    {
>          p->feat.arch_caps = true;
> +        p->arch_caps.raw = host_cpu_policy.arch_caps.raw;
> +    }

Doesn't this expose all the bits, irrespective of their exposure
annotations in the public header? I.e. even more than just the two
bits that become 'A' in patch 4, but weren't ...

> @@ -858,20 +878,6 @@ void __init init_dom0_cpuid_policy(struct domain *d)
>          p->platform_info.cpuid_faulting = false;
>  
>      recalculate_cpuid_policy(d);
> -
> -    if ( is_hardware_domain(d) && cpu_has_arch_caps )
> -    {
> -        uint64_t val;
> -
> -        rdmsrl(MSR_ARCH_CAPABILITIES, val);
> -
> -        p->arch_caps.raw = val &
> -            (ARCH_CAPS_RDCL_NO | ARCH_CAPS_IBRS_ALL | ARCH_CAPS_RSBA |
> -             ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO |
> -             ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO |
> -             ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA |
> -             ARCH_CAPS_BHI_NO | ARCH_CAPS_PBRSB_NO);
> -    }

... included here?

Jan
Andrew Cooper May 16, 2023, 1:51 p.m. UTC | #2
On 16/05/2023 2:06 pm, Jan Beulich wrote:
> On 15.05.2023 16:42, Andrew Cooper wrote:
>> --- a/xen/arch/x86/cpu-policy.c
>> +++ b/xen/arch/x86/cpu-policy.c
>> @@ -408,6 +408,25 @@ static void __init calculate_host_policy(void)
>>      p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
>>  }
>>  
>> +static void __init guest_common_max_feature_adjustments(uint32_t *fs)
>> +{
>> +    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
>> +    {
>> +        /*
>> +         * MSR_ARCH_CAPS is just feature data, and we can offer it to guests
>> +         * unconditionally, although limit it to Intel systems as it is highly
>> +         * uarch-specific.
>> +         *
>> +         * In particular, the RSBA and RRSBA bits mean "you might migrate to a
>> +         * system where RSB underflow uses alternative predictors (a.k.a
>> +         * Retpoline not safe)", so these need to be visible to a guest in all
>> +         * cases, even when it's only some other server in the pool which
>> +         * suffers the identified behaviour.
>> +         */
>> +        __set_bit(X86_FEATURE_ARCH_CAPS, fs);
>> +    }
>> +}
> The comment reads as if it wasn't applying to "max" only, but rather to
> "default". Reading this I'm therefore now (and perhaps even more so in
> the future, when coming across it) wondering whether it's misplaced, or
> and hence whether the commented code is also misplaced and/or wrong.

On migrate-in, we (well - toolstacks that understand multiple hosts)
check the cpu policy the VM saw against the appropriate PV/HVM max
policy to determine whether it can safely run.

So this is very intentionally for the max policy.  We need (I think -
still pending an clarification from Intel because there's pending work
still not published) to set RSBA unconditionally, and RRSBA conditional
on EIBRS being available, in max even on pre-Skylake hardware such that
we can migrate-in a VM which previously ran on Skylake or later hardware.

Activating this by default for VMs is just a case of swapping the CPUID
ARCH_CAPS bit from 'a' to 'A', without any adjustment to this logic.

> Further is even just non-default exposure of all the various bits okay
> to other than Dom0? IOW is there indeed no further adjustment necessary
> to guest_rdmsr()?
>
>> @@ -828,7 +845,10 @@ void __init init_dom0_cpuid_policy(struct domain *d)
>>       * domain policy logic gains a better understanding of MSRs.
>>       */
>>      if ( is_hardware_domain(d) && cpu_has_arch_caps )
>> +    {
>>          p->feat.arch_caps = true;
>> +        p->arch_caps.raw = host_cpu_policy.arch_caps.raw;
>> +    }
> Doesn't this expose all the bits, irrespective of their exposure
> annotations in the public header?

No, because of ...

>  I.e. even more than just the two
> bits that become 'A' in patch 4, but weren't ...
>
>> @@ -858,20 +878,6 @@ void __init init_dom0_cpuid_policy(struct domain *d)
>>          p->platform_info.cpuid_faulting = false;
>>  
>>      recalculate_cpuid_policy(d);

... this recalculate_cpuid_policy() (which was moved in patch 1), which
applies the appropriate pv/hvm max mask over the inherited bits.


More generally, this is how *all* opting-into-non-default features needs
to work when it's more than just turning on a single feature bit.  It's
also why doing full-policy levelling in the toolstack is much harder
than it appears on paper.

All domains get the default policy, so zero out all non-default
information.  It has to be recovered from somewhere.  Generally that
would be the appropriate max policy, but the host policy here is fine
because there's nothing to do other than applying the appropriate max mask.

When arch-caps becomes default, the full block feeding arch caps back
into dom0 will be dropped, but there's still a lot of work to do first.

~Andrew
Jan Beulich May 16, 2023, 2:06 p.m. UTC | #3
On 16.05.2023 15:51, Andrew Cooper wrote:
> On 16/05/2023 2:06 pm, Jan Beulich wrote:
>> On 15.05.2023 16:42, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/cpu-policy.c
>>> +++ b/xen/arch/x86/cpu-policy.c
>>> @@ -408,6 +408,25 @@ static void __init calculate_host_policy(void)
>>>      p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
>>>  }
>>>  
>>> +static void __init guest_common_max_feature_adjustments(uint32_t *fs)
>>> +{
>>> +    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
>>> +    {
>>> +        /*
>>> +         * MSR_ARCH_CAPS is just feature data, and we can offer it to guests
>>> +         * unconditionally, although limit it to Intel systems as it is highly
>>> +         * uarch-specific.
>>> +         *
>>> +         * In particular, the RSBA and RRSBA bits mean "you might migrate to a
>>> +         * system where RSB underflow uses alternative predictors (a.k.a
>>> +         * Retpoline not safe)", so these need to be visible to a guest in all
>>> +         * cases, even when it's only some other server in the pool which
>>> +         * suffers the identified behaviour.
>>> +         */
>>> +        __set_bit(X86_FEATURE_ARCH_CAPS, fs);
>>> +    }
>>> +}
>> The comment reads as if it wasn't applying to "max" only, but rather to
>> "default". Reading this I'm therefore now (and perhaps even more so in
>> the future, when coming across it) wondering whether it's misplaced, or
>> and hence whether the commented code is also misplaced and/or wrong.
> 
> On migrate-in, we (well - toolstacks that understand multiple hosts)
> check the cpu policy the VM saw against the appropriate PV/HVM max
> policy to determine whether it can safely run.
> 
> So this is very intentionally for the max policy.  We need (I think -
> still pending an clarification from Intel because there's pending work
> still not published) to set RSBA unconditionally, and RRSBA conditional
> on EIBRS being available, in max even on pre-Skylake hardware such that
> we can migrate-in a VM which previously ran on Skylake or later hardware.
> 
> Activating this by default for VMs is just a case of swapping the CPUID
> ARCH_CAPS bit from 'a' to 'A', without any adjustment to this logic.

Hmm, I see. Not very intuitive, but I think I follow.

>> Further is even just non-default exposure of all the various bits okay
>> to other than Dom0? IOW is there indeed no further adjustment necessary
>> to guest_rdmsr()?

With your reply further down also sufficiently clarifying things for
me (in particular pointing the one oversight of mine), the question
above is the sole part remaining before I'd be okay giving my R-b here.

Jan

>>> @@ -828,7 +845,10 @@ void __init init_dom0_cpuid_policy(struct domain *d)
>>>       * domain policy logic gains a better understanding of MSRs.
>>>       */
>>>      if ( is_hardware_domain(d) && cpu_has_arch_caps )
>>> +    {
>>>          p->feat.arch_caps = true;
>>> +        p->arch_caps.raw = host_cpu_policy.arch_caps.raw;
>>> +    }
>> Doesn't this expose all the bits, irrespective of their exposure
>> annotations in the public header?
> 
> No, because of ...
> 
>>  I.e. even more than just the two
>> bits that become 'A' in patch 4, but weren't ...
>>
>>> @@ -858,20 +878,6 @@ void __init init_dom0_cpuid_policy(struct domain *d)
>>>          p->platform_info.cpuid_faulting = false;
>>>  
>>>      recalculate_cpuid_policy(d);
> 
> ... this recalculate_cpuid_policy() (which was moved in patch 1), which
> applies the appropriate pv/hvm max mask over the inherited bits.
> 
> 
> More generally, this is how *all* opting-into-non-default features needs
> to work when it's more than just turning on a single feature bit.  It's
> also why doing full-policy levelling in the toolstack is much harder
> than it appears on paper.
> 
> All domains get the default policy, so zero out all non-default
> information.  It has to be recovered from somewhere.  Generally that
> would be the appropriate max policy, but the host policy here is fine
> because there's nothing to do other than applying the appropriate max mask.
> 
> When arch-caps becomes default, the full block feeding arch caps back
> into dom0 will be dropped, but there's still a lot of work to do first.
> 
> ~Andrew
Andrew Cooper May 16, 2023, 2:16 p.m. UTC | #4
On 16/05/2023 3:06 pm, Jan Beulich wrote:
> On 16.05.2023 15:51, Andrew Cooper wrote:
>> On 16/05/2023 2:06 pm, Jan Beulich wrote:
>>> On 15.05.2023 16:42, Andrew Cooper wrote:
>>>> --- a/xen/arch/x86/cpu-policy.c
>>>> +++ b/xen/arch/x86/cpu-policy.c
>>>> @@ -408,6 +408,25 @@ static void __init calculate_host_policy(void)
>>>>      p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
>>>>  }
>>>>  
>>>> +static void __init guest_common_max_feature_adjustments(uint32_t *fs)
>>>> +{
>>>> +    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
>>>> +    {
>>>> +        /*
>>>> +         * MSR_ARCH_CAPS is just feature data, and we can offer it to guests
>>>> +         * unconditionally, although limit it to Intel systems as it is highly
>>>> +         * uarch-specific.
>>>> +         *
>>>> +         * In particular, the RSBA and RRSBA bits mean "you might migrate to a
>>>> +         * system where RSB underflow uses alternative predictors (a.k.a
>>>> +         * Retpoline not safe)", so these need to be visible to a guest in all
>>>> +         * cases, even when it's only some other server in the pool which
>>>> +         * suffers the identified behaviour.
>>>> +         */
>>>> +        __set_bit(X86_FEATURE_ARCH_CAPS, fs);
>>>> +    }
>>>> +}
>>> The comment reads as if it wasn't applying to "max" only, but rather to
>>> "default". Reading this I'm therefore now (and perhaps even more so in
>>> the future, when coming across it) wondering whether it's misplaced, or
>>> and hence whether the commented code is also misplaced and/or wrong.
>> On migrate-in, we (well - toolstacks that understand multiple hosts)
>> check the cpu policy the VM saw against the appropriate PV/HVM max
>> policy to determine whether it can safely run.
>>
>> So this is very intentionally for the max policy.  We need (I think -
>> still pending an clarification from Intel because there's pending work
>> still not published) to set RSBA unconditionally, and RRSBA conditional
>> on EIBRS being available, in max even on pre-Skylake hardware such that
>> we can migrate-in a VM which previously ran on Skylake or later hardware.
>>
>> Activating this by default for VMs is just a case of swapping the CPUID
>> ARCH_CAPS bit from 'a' to 'A', without any adjustment to this logic.
> Hmm, I see. Not very intuitive, but I think I follow.
>
>>> Further is even just non-default exposure of all the various bits okay
>>> to other than Dom0? IOW is there indeed no further adjustment necessary
>>> to guest_rdmsr()?
> With your reply further down also sufficiently clarifying things for
> me (in particular pointing the one oversight of mine), the question
> above is the sole part remaining before I'd be okay giving my R-b here.

Oh sorry.  Yes, it is sufficient.  Because VMs (other than dom0) don't
get the ARCH_CAPS CPUID bit, reads of MSR_ARCH_CAPS will #GP.

Right now, you can set cpuid = "host:arch-caps" in an xl.cfg file.  If
you do this, you get to keep both pieces, as you'll end up advertising
the MSR but with a value of 0 because of the note in patch 4.  libxl
still only understand the xend CPUID format and can't express any MSR
data at all.

~Andrew
Jan Beulich May 16, 2023, 2:53 p.m. UTC | #5
On 16.05.2023 16:16, Andrew Cooper wrote:
> On 16/05/2023 3:06 pm, Jan Beulich wrote:
>> On 16.05.2023 15:51, Andrew Cooper wrote:
>>> On 16/05/2023 2:06 pm, Jan Beulich wrote:
>>>> On 15.05.2023 16:42, Andrew Cooper wrote:
>>>> Further is even just non-default exposure of all the various bits okay
>>>> to other than Dom0? IOW is there indeed no further adjustment necessary
>>>> to guest_rdmsr()?
>> With your reply further down also sufficiently clarifying things for
>> me (in particular pointing the one oversight of mine), the question
>> above is the sole part remaining before I'd be okay giving my R-b here.
> 
> Oh sorry.  Yes, it is sufficient.  Because VMs (other than dom0) don't
> get the ARCH_CAPS CPUID bit, reads of MSR_ARCH_CAPS will #GP.
> 
> Right now, you can set cpuid = "host:arch-caps" in an xl.cfg file.  If
> you do this, you get to keep both pieces, as you'll end up advertising
> the MSR but with a value of 0 because of the note in patch 4.  libxl
> still only understand the xend CPUID format and can't express any MSR
> data at all.

Hmm, so the CPUID bit being max only results in all the ARCH_CAPS bits
getting turned off in the default policy. That is, to enable anything
you need to not only enable the CPUID bit, but also the ARCH_CAPS bits
you want enabled (with no presents means to do so). I guess that's no
different from other max-only features with dependents, but I wonder
whether that's good behavior. Wouldn't it make more sense for the
individual bits' exposure qualifiers to become meaningful one to
qualifying feature is enabled? I.e. here this would then mean that
some ARCH_CAPS bits may become available, while others may require
explicit turning on (assuming they weren't all 'A').

But irrespective of that (which is kind of orthogonal) my question was
rather with already considering the point in time when the CPUID bit
would become 'A'. IOW I was wondering whether at that point having all
the individual bits be 'A' is actually going to be correct.

Jan
Jan Beulich May 16, 2023, 2:58 p.m. UTC | #6
On 15.05.2023 16:42, Andrew Cooper wrote:
> --- a/xen/arch/x86/cpu-policy.c
> +++ b/xen/arch/x86/cpu-policy.c
> @@ -408,6 +408,25 @@ static void __init calculate_host_policy(void)
>      p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
>  }
>  
> +static void __init guest_common_max_feature_adjustments(uint32_t *fs)
> +{
> +    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
> +    {
> +        /*
> +         * MSR_ARCH_CAPS is just feature data, and we can offer it to guests
> +         * unconditionally, although limit it to Intel systems as it is highly
> +         * uarch-specific.
> +         *
> +         * In particular, the RSBA and RRSBA bits mean "you might migrate to a
> +         * system where RSB underflow uses alternative predictors (a.k.a
> +         * Retpoline not safe)", so these need to be visible to a guest in all
> +         * cases, even when it's only some other server in the pool which
> +         * suffers the identified behaviour.
> +         */
> +        __set_bit(X86_FEATURE_ARCH_CAPS, fs);
> +    }
> +}

Wouldn't this better be accompanied by marking the bit !a in the public header?

Jan
Andrew Cooper May 16, 2023, 7:31 p.m. UTC | #7
On 16/05/2023 3:53 pm, Jan Beulich wrote:
> On 16.05.2023 16:16, Andrew Cooper wrote:
>> On 16/05/2023 3:06 pm, Jan Beulich wrote:
>>> On 16.05.2023 15:51, Andrew Cooper wrote:
>>>> On 16/05/2023 2:06 pm, Jan Beulich wrote:
>>>>> On 15.05.2023 16:42, Andrew Cooper wrote:
>>>>> Further is even just non-default exposure of all the various bits okay
>>>>> to other than Dom0? IOW is there indeed no further adjustment necessary
>>>>> to guest_rdmsr()?
>>> With your reply further down also sufficiently clarifying things for
>>> me (in particular pointing the one oversight of mine), the question
>>> above is the sole part remaining before I'd be okay giving my R-b here.
>> Oh sorry.  Yes, it is sufficient.  Because VMs (other than dom0) don't
>> get the ARCH_CAPS CPUID bit, reads of MSR_ARCH_CAPS will #GP.
>>
>> Right now, you can set cpuid = "host:arch-caps" in an xl.cfg file.  If
>> you do this, you get to keep both pieces, as you'll end up advertising
>> the MSR but with a value of 0 because of the note in patch 4.  libxl
>> still only understand the xend CPUID format and can't express any MSR
>> data at all.
> Hmm, so the CPUID bit being max only results in all the ARCH_CAPS bits
> getting turned off in the default policy. That is, to enable anything
> you need to not only enable the CPUID bit, but also the ARCH_CAPS bits
> you want enabled (with no presents means to do so).

Correct.

> I guess that's no
> different from other max-only features with dependents, but I wonder
> whether that's good behavior.

It's not really something you get a choice over.

Default is always less than max, so however you choose to express these
concepts, when you're opting-in you're always having to put information
back in which was previously stripped out.

> Wouldn't it make more sense for the
> individual bits' exposure qualifiers to become meaningful one to
> qualifying feature is enabled? I.e. here this would then mean that
> some ARCH_CAPS bits may become available, while others may require
> explicit turning on (assuming they weren't all 'A').

I'm afraid I don't follow.  You could make some bits of MSR_ARCH_CAPS
itself 'a' vs 'A', and that would have the expected effect (for any VM
where arch_caps was visible).

The thing which is 99% of the complexity with MSR_ARCH_CAPS is getting
RSBA/RRSBA right.  The moment we advertise MSR_ARCH_CAPS to guests,
RSBA/RRSBA must be set appropriately for migrate or we're creating a
security vulnerability in the guest.

If you're wondering about the block disable, that's because MSRs and
CPUID are different.  With CPUID, we have
x86_cpu_policy_clear_out_of_range_leaves() which uses the various
max_leaf.  e.g. a feat.max_leaf=0 is what causes all of subleaf 1 and 2
to be zeroed in a policy.


> But irrespective of that (which is kind of orthogonal) my question was
> rather with already considering the point in time when the CPUID bit
> would become 'A'. IOW I was wondering whether at that point having all
> the individual bits be 'A' is actually going to be correct.

I've chosen all 'A' for these bits because that is what I expect to be
correct in due course.  They're all the simple "you're not vulnerable to
$X" bits, plus eIBRS which in practice is just a qualifying statement on
IBRS (already fully supported in guests).

The rest of MSR_ARCH_CAPS is pretty much a dumping ground for all of the
controls we can't give to guests under any circumstance.  (FB_CLEAR_CTRL
might be an exception - allegedly we might want to give it to guests
which have passthrough and trust their userspace, but I'm unconvinced of
this argument and going to insist on concrete numbers from anyone
wanting to try and implement this usecase.)

But there certainly could be a feature in there in the future where we
leave it at 'a' for a while...  It's just feature bitmap data in a
non-CPUID form factor.

~Andrew
Jan Beulich May 17, 2023, 9:20 a.m. UTC | #8
On 16.05.2023 21:31, Andrew Cooper wrote:
> On 16/05/2023 3:53 pm, Jan Beulich wrote:
>> On 16.05.2023 16:16, Andrew Cooper wrote:
>>> On 16/05/2023 3:06 pm, Jan Beulich wrote:
>>>> On 16.05.2023 15:51, Andrew Cooper wrote:
>>>>> On 16/05/2023 2:06 pm, Jan Beulich wrote:
>>>>>> On 15.05.2023 16:42, Andrew Cooper wrote:
>>>>>> Further is even just non-default exposure of all the various bits okay
>>>>>> to other than Dom0? IOW is there indeed no further adjustment necessary
>>>>>> to guest_rdmsr()?
>>>> With your reply further down also sufficiently clarifying things for
>>>> me (in particular pointing the one oversight of mine), the question
>>>> above is the sole part remaining before I'd be okay giving my R-b here.
>>> Oh sorry.  Yes, it is sufficient.  Because VMs (other than dom0) don't
>>> get the ARCH_CAPS CPUID bit, reads of MSR_ARCH_CAPS will #GP.
>>>
>>> Right now, you can set cpuid = "host:arch-caps" in an xl.cfg file.  If
>>> you do this, you get to keep both pieces, as you'll end up advertising
>>> the MSR but with a value of 0 because of the note in patch 4.  libxl
>>> still only understand the xend CPUID format and can't express any MSR
>>> data at all.
>> Hmm, so the CPUID bit being max only results in all the ARCH_CAPS bits
>> getting turned off in the default policy. That is, to enable anything
>> you need to not only enable the CPUID bit, but also the ARCH_CAPS bits
>> you want enabled (with no presents means to do so).
> 
> Correct.
> 
>> I guess that's no
>> different from other max-only features with dependents, but I wonder
>> whether that's good behavior.
> 
> It's not really something you get a choice over.
> 
> Default is always less than max, so however you choose to express these
> concepts, when you're opting-in you're always having to put information
> back in which was previously stripped out.

But my point is towards the amount of data you need to specify manually.
I would find it quite helpful if default-on sub-features became available
automatically once the top-level feature was turned on. I guess so far
we don't have many such cases, but here you add a whole bunch.

>> Wouldn't it make more sense for the
>> individual bits' exposure qualifiers to become meaningful one to
>> qualifying feature is enabled? I.e. here this would then mean that
>> some ARCH_CAPS bits may become available, while others may require
>> explicit turning on (assuming they weren't all 'A').
> 
> I'm afraid I don't follow.  You could make some bits of MSR_ARCH_CAPS
> itself 'a' vs 'A', and that would have the expected effect (for any VM
> where arch_caps was visible).

Visible by default, you mean. Whereas I'm considering the case where
the CPUID bit is default-off, and turning it on for a guest doesn't at
the same time turn on all the 'A' bits in ARCH_CAPS (which hardware
offers, or which we synthesize).

Something similar could be seen / utilized for AMX, where in my
pending series I set all the bits to 'a', requiring every individual
bit to be turned on along with turning on AMX-TILE. Yet it would be
more user friendly if only the top-level bit needed enabling manually,
with available sub-features then becoming available "automatically".

> The thing which is 99% of the complexity with MSR_ARCH_CAPS is getting
> RSBA/RRSBA right.  The moment we advertise MSR_ARCH_CAPS to guests,
> RSBA/RRSBA must be set appropriately for migrate or we're creating a
> security vulnerability in the guest.
> 
> If you're wondering about the block disable, that's because MSRs and
> CPUID are different.  With CPUID, we have
> x86_cpu_policy_clear_out_of_range_leaves() which uses the various
> max_leaf.  e.g. a feat.max_leaf=0 is what causes all of subleaf 1 and 2
> to be zeroed in a policy.
> 
> 
>> But irrespective of that (which is kind of orthogonal) my question was
>> rather with already considering the point in time when the CPUID bit
>> would become 'A'. IOW I was wondering whether at that point having all
>> the individual bits be 'A' is actually going to be correct.
> 
> I've chosen all 'A' for these bits because that is what I expect to be
> correct in due course.  They're all the simple "you're not vulnerable to
> $X" bits, plus eIBRS which in practice is just a qualifying statement on
> IBRS (already fully supported in guests).

Right, upon checking again I agree.

Jan

> The rest of MSR_ARCH_CAPS is pretty much a dumping ground for all of the
> controls we can't give to guests under any circumstance.  (FB_CLEAR_CTRL
> might be an exception - allegedly we might want to give it to guests
> which have passthrough and trust their userspace, but I'm unconvinced of
> this argument and going to insist on concrete numbers from anyone
> wanting to try and implement this usecase.)
> 
> But there certainly could be a feature in there in the future where we
> leave it at 'a' for a while...  It's just feature bitmap data in a
> non-CPUID form factor.
> 
> ~Andrew
Andrew Cooper May 19, 2023, 3:52 p.m. UTC | #9
On 17/05/2023 10:20 am, Jan Beulich wrote:
> On 16.05.2023 21:31, Andrew Cooper wrote:
>> On 16/05/2023 3:53 pm, Jan Beulich wrote:
>>> I guess that's no
>>> different from other max-only features with dependents, but I wonder
>>> whether that's good behavior.
>> It's not really something you get a choice over.
>>
>> Default is always less than max, so however you choose to express these
>> concepts, when you're opting-in you're always having to put information
>> back in which was previously stripped out.
> But my point is towards the amount of data you need to specify manually.
> I would find it quite helpful if default-on sub-features became available
> automatically once the top-level feature was turned on. I guess so far
> we don't have many such cases, but here you add a whole bunch.

I'm not suggesting specifying it manually.  That would be a dumb UX move.

But you absolutely cannot have "user turns on EIBRS" meaning "turn on
ARCH_CAPS too", because a) that requires creating the reverse feature
map which is massively larger than the forward feature map, and b) it
creates a vulnerability in the guest, and c) it's ambiguous - e.g. what
does turning on a sub-mode of AVX-512 mean for sibling sub-modes?

Whichever algorithm you want, you still need *something* to know that
ARCH_CAPS is special and how to arrange the defaults given a non-default
overarching setting.

When the toolstack infrastructure grows the ability to say no to the
user, it will be able to identify explicit user settings which cannot be
fulfilled.  (And with a bit more complicated logic, why.)

>>> Wouldn't it make more sense for the
>>> individual bits' exposure qualifiers to become meaningful one to
>>> qualifying feature is enabled? I.e. here this would then mean that
>>> some ARCH_CAPS bits may become available, while others may require
>>> explicit turning on (assuming they weren't all 'A').
>> I'm afraid I don't follow.  You could make some bits of MSR_ARCH_CAPS
>> itself 'a' vs 'A', and that would have the expected effect (for any VM
>> where arch_caps was visible).
> Visible by default, you mean. Whereas I'm considering the case where
> the CPUID bit is default-off, and turning it on for a guest doesn't at
> the same time turn on all the 'A' bits in ARCH_CAPS (which hardware
> offers, or which we synthesize).
>
> Something similar could be seen / utilized for AMX, where in my
> pending series I set all the bits to 'a', requiring every individual
> bit to be turned on along with turning on AMX-TILE. Yet it would be
> more user friendly if only the top-level bit needed enabling manually,
> with available sub-features then becoming available "automatically".

I think I've covered all of this in the reply above?

~Andrew
Andrew Cooper May 19, 2023, 3:52 p.m. UTC | #10
On 16/05/2023 3:58 pm, Jan Beulich wrote:
> On 15.05.2023 16:42, Andrew Cooper wrote:
>> --- a/xen/arch/x86/cpu-policy.c
>> +++ b/xen/arch/x86/cpu-policy.c
>> @@ -408,6 +408,25 @@ static void __init calculate_host_policy(void)
>>      p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
>>  }
>>  
>> +static void __init guest_common_max_feature_adjustments(uint32_t *fs)
>> +{
>> +    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
>> +    {
>> +        /*
>> +         * MSR_ARCH_CAPS is just feature data, and we can offer it to guests
>> +         * unconditionally, although limit it to Intel systems as it is highly
>> +         * uarch-specific.
>> +         *
>> +         * In particular, the RSBA and RRSBA bits mean "you might migrate to a
>> +         * system where RSB underflow uses alternative predictors (a.k.a
>> +         * Retpoline not safe)", so these need to be visible to a guest in all
>> +         * cases, even when it's only some other server in the pool which
>> +         * suffers the identified behaviour.
>> +         */
>> +        __set_bit(X86_FEATURE_ARCH_CAPS, fs);
>> +    }
>> +}
> Wouldn't this better be accompanied by marking the bit !a in the public header?

Yes, probably.

~Andrew
Jan Beulich May 22, 2023, 7:31 a.m. UTC | #11
On 19.05.2023 17:52, Andrew Cooper wrote:
> On 17/05/2023 10:20 am, Jan Beulich wrote:
>> On 16.05.2023 21:31, Andrew Cooper wrote:
>>> On 16/05/2023 3:53 pm, Jan Beulich wrote:
>>>> I guess that's no
>>>> different from other max-only features with dependents, but I wonder
>>>> whether that's good behavior.
>>> It's not really something you get a choice over.
>>>
>>> Default is always less than max, so however you choose to express these
>>> concepts, when you're opting-in you're always having to put information
>>> back in which was previously stripped out.
>> But my point is towards the amount of data you need to specify manually.
>> I would find it quite helpful if default-on sub-features became available
>> automatically once the top-level feature was turned on. I guess so far
>> we don't have many such cases, but here you add a whole bunch.
> 
> I'm not suggesting specifying it manually.  That would be a dumb UX move.
> 
> But you absolutely cannot have "user turns on EIBRS" meaning "turn on
> ARCH_CAPS too", because a) that requires creating the reverse feature
> map which is massively larger than the forward feature map, and b) it
> creates a vulnerability in the guest, and c) it's ambiguous - e.g. what
> does turning on a sub-mode of AVX-512 mean for sibling sub-modes?

Feels like a misunderstanding at this point, because what you're describing
above is not what I was referring to. Instead I was specifically referring
to "cpuid=...,arch-caps" not having any effect beyond the turning on of the
MSR itself (with all-zero content). This is even worse with libxl not even
having a way right now to express something like "arch-caps=..." to enable
some of the sub-features for guests.

To explicitly answer the AVX512 part of your reply: Turning on a sub-mode
alone could be useful as well (with the effect of also turning on the
main feature, and with or without [policy question] also turning on other
default-on subfeatures of AVX512). But again - that direction of possible
"implications" isn't what my earlier reply was about. As you say, reverse
maps would then also be needed, whereas for what I'm suggesting only the
deep-deps we already have are necessary: We'd grab the main feature's
dependencies and AND that with the default mask before ORing into the
domain's policy.

> Whichever algorithm you want, you still need *something* to know that
> ARCH_CAPS is special and how to arrange the defaults given a non-default
> overarching setting.

Afaict right now that would be achieved by the same 'a', 'A', '!a", and
"!A" annotations, suitably placed on every of the sub-features.

Jan

> When the toolstack infrastructure grows the ability to say no to the
> user, it will be able to identify explicit user settings which cannot be
> fulfilled.  (And with a bit more complicated logic, why.)
> 
>>>> Wouldn't it make more sense for the
>>>> individual bits' exposure qualifiers to become meaningful one to
>>>> qualifying feature is enabled? I.e. here this would then mean that
>>>> some ARCH_CAPS bits may become available, while others may require
>>>> explicit turning on (assuming they weren't all 'A').
>>> I'm afraid I don't follow.  You could make some bits of MSR_ARCH_CAPS
>>> itself 'a' vs 'A', and that would have the expected effect (for any VM
>>> where arch_caps was visible).
>> Visible by default, you mean. Whereas I'm considering the case where
>> the CPUID bit is default-off, and turning it on for a guest doesn't at
>> the same time turn on all the 'A' bits in ARCH_CAPS (which hardware
>> offers, or which we synthesize).
>>
>> Something similar could be seen / utilized for AMX, where in my
>> pending series I set all the bits to 'a', requiring every individual
>> bit to be turned on along with turning on AMX-TILE. Yet it would be
>> more user friendly if only the top-level bit needed enabling manually,
>> with available sub-features then becoming available "automatically".
> 
> I think I've covered all of this in the reply above?
> 
> ~Andrew
Andrew Cooper May 22, 2023, 2:02 p.m. UTC | #12
On 22/05/2023 8:31 am, Jan Beulich wrote:
> On 19.05.2023 17:52, Andrew Cooper wrote:
>> On 17/05/2023 10:20 am, Jan Beulich wrote:
>>> On 16.05.2023 21:31, Andrew Cooper wrote:
>>>> On 16/05/2023 3:53 pm, Jan Beulich wrote:
>>>>> I guess that's no
>>>>> different from other max-only features with dependents, but I wonder
>>>>> whether that's good behavior.
>>>> It's not really something you get a choice over.
>>>>
>>>> Default is always less than max, so however you choose to express these
>>>> concepts, when you're opting-in you're always having to put information
>>>> back in which was previously stripped out.
>>> But my point is towards the amount of data you need to specify manually.
>>> I would find it quite helpful if default-on sub-features became available
>>> automatically once the top-level feature was turned on. I guess so far
>>> we don't have many such cases, but here you add a whole bunch.
>> I'm not suggesting specifying it manually.  That would be a dumb UX move.
>>
>> But you absolutely cannot have "user turns on EIBRS" meaning "turn on
>> ARCH_CAPS too", because a) that requires creating the reverse feature
>> map which is massively larger than the forward feature map, and b) it
>> creates a vulnerability in the guest, and c) it's ambiguous - e.g. what
>> does turning on a sub-mode of AVX-512 mean for sibling sub-modes?
> Feels like a misunderstanding at this point, because what you're describing
> above is not what I was referring to. Instead I was specifically referring
> to "cpuid=...,arch-caps" not having any effect beyond the turning on of the
> MSR itself (with all-zero content). This is even worse with libxl not even
> having a way right now to express something like "arch-caps=..." to enable
> some of the sub-features for guests.

We discussed this on the x86 call.  In summary:

Right now, arch-caps is off-by-default because it doesn't work safely or
nicely.  I'm working on safely first, and nicely will come second.

The end result of my work is going to have arch-caps active by default
and supported.  End users aren't going to in a position of needing to
specify cpuid="...,arch-caps" in their config files.

Fixing libxl's ability to specify the contents is something needing to
be done before we can say arch-caps is fully supported, because right
now it's the only way users of xl/libxl have for levelling a VM for migrate.

> To explicitly answer the AVX512 part of your reply: Turning on a sub-mode
> alone could be useful as well (with the effect of also turning on the
> main feature, and with or without [policy question] also turning on other
> default-on subfeatures of AVX512). But again - that direction of possible
> "implications" isn't what my earlier reply was about. As you say, reverse
> maps would then also be needed, whereas for what I'm suggesting only the
> deep-deps we already have are necessary: We'd grab the main feature's
> dependencies and AND that with the default mask before ORing into the
> domain's policy.

Having discussed this, I agree and specifically have an idea for how to
use it for AVX512.  But it is also going to take a fair amount of
rearranging in libxl/libxc to make work.

I.e., yes, but not immediately right now.

~Andrew
diff mbox series

Patch

diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
index dfd9abd8564c..74266d30b551 100644
--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -408,6 +408,25 @@  static void __init calculate_host_policy(void)
     p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
 }
 
+static void __init guest_common_max_feature_adjustments(uint32_t *fs)
+{
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+    {
+        /*
+         * MSR_ARCH_CAPS is just feature data, and we can offer it to guests
+         * unconditionally, although limit it to Intel systems as it is highly
+         * uarch-specific.
+         *
+         * In particular, the RSBA and RRSBA bits mean "you might migrate to a
+         * system where RSB underflow uses alternative predictors (a.k.a
+         * Retpoline not safe)", so these need to be visible to a guest in all
+         * cases, even when it's only some other server in the pool which
+         * suffers the identified behaviour.
+         */
+        __set_bit(X86_FEATURE_ARCH_CAPS, fs);
+    }
+}
+
 static void __init guest_common_default_feature_adjustments(uint32_t *fs)
 {
     /*
@@ -483,6 +502,7 @@  static void __init calculate_pv_max_policy(void)
         __clear_bit(X86_FEATURE_IBRS, fs);
     }
 
+    guest_common_max_feature_adjustments(fs);
     guest_common_feature_adjustments(fs);
 
     sanitise_featureset(fs);
@@ -490,8 +510,6 @@  static void __init calculate_pv_max_policy(void)
     recalculate_xstate(p);
 
     p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
-
-    p->arch_caps.raw = 0; /* Not supported yet. */
 }
 
 static void __init calculate_pv_def_policy(void)
@@ -598,6 +616,7 @@  static void __init calculate_hvm_max_policy(void)
     if ( !cpu_has_vmx )
         __clear_bit(X86_FEATURE_PKS, fs);
 
+    guest_common_max_feature_adjustments(fs);
     guest_common_feature_adjustments(fs);
 
     sanitise_featureset(fs);
@@ -606,8 +625,6 @@  static void __init calculate_hvm_max_policy(void)
 
     /* It's always possible to emulate CPUID faulting for HVM guests */
     p->platform_info.cpuid_faulting = true;
-
-    p->arch_caps.raw = 0; /* Not supported yet. */
 }
 
 static void __init calculate_hvm_def_policy(void)
@@ -828,7 +845,10 @@  void __init init_dom0_cpuid_policy(struct domain *d)
      * domain policy logic gains a better understanding of MSRs.
      */
     if ( is_hardware_domain(d) && cpu_has_arch_caps )
+    {
         p->feat.arch_caps = true;
+        p->arch_caps.raw = host_cpu_policy.arch_caps.raw;
+    }
 
     /* Apply dom0-cpuid= command line settings, if provided. */
     if ( dom0_cpuid_cmdline )
@@ -858,20 +878,6 @@  void __init init_dom0_cpuid_policy(struct domain *d)
         p->platform_info.cpuid_faulting = false;
 
     recalculate_cpuid_policy(d);
-
-    if ( is_hardware_domain(d) && cpu_has_arch_caps )
-    {
-        uint64_t val;
-
-        rdmsrl(MSR_ARCH_CAPABILITIES, val);
-
-        p->arch_caps.raw = val &
-            (ARCH_CAPS_RDCL_NO | ARCH_CAPS_IBRS_ALL | ARCH_CAPS_RSBA |
-             ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO |
-             ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO |
-             ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA |
-             ARCH_CAPS_BHI_NO | ARCH_CAPS_PBRSB_NO);
-    }
 }
 
 static void __init __maybe_unused build_assertions(void)