diff mbox series

[v4] VMX: use a single, global APIC access page

Message ID 1c489e77-6e65-6121-6c28-3c4bd377223c@suse.com (mailing list archive)
State New, archived
Headers show
Series [v4] VMX: use a single, global APIC access page | expand

Commit Message

Jan Beulich April 12, 2021, 10:40 a.m. UTC
The address of this page is used by the CPU only to recognize when to
access the virtual APIC page instead. No accesses would ever go to this
page. It only needs to be present in the (CPU) page tables so that
address translation will produce its address as result for respective
accesses.

By making this page global, we also eliminate the need to refcount it,
or to assign it to any domain in the first place.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
v4: Set PGC_extra on the page. Make shadow mode work.
v3: Split p2m insertion change to a separate patch.
v2: Avoid insertion when !has_vlapic(). Split off change to
    p2m_get_iommu_flags().
---
I did further consider not allocating any real page at all, but just
using the address of some unpopulated space (which would require
announcing this page as reserved to Dom0, so it wouldn't put any PCI
MMIO BARs there). But I thought this would be too controversial, because
of the possible risks associated with this.

Comments

Roger Pau Monné April 12, 2021, 3:31 p.m. UTC | #1
On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote:
> The address of this page is used by the CPU only to recognize when to
> access the virtual APIC page instead. No accesses would ever go to this
> page. It only needs to be present in the (CPU) page tables so that
> address translation will produce its address as result for respective
> accesses.
> 
> By making this page global, we also eliminate the need to refcount it,
> or to assign it to any domain in the first place.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> ---
> v4: Set PGC_extra on the page. Make shadow mode work.
> v3: Split p2m insertion change to a separate patch.
> v2: Avoid insertion when !has_vlapic(). Split off change to
>     p2m_get_iommu_flags().
> ---
> I did further consider not allocating any real page at all, but just
> using the address of some unpopulated space (which would require
> announcing this page as reserved to Dom0, so it wouldn't put any PCI
> MMIO BARs there). But I thought this would be too controversial, because
> of the possible risks associated with this.

Really seems more trouble than reward. Also there are systems with
MMIO regions in holes on the memory map, like the issue I had with the
Intel pinctrl stuff that had an MMIO region in a hole on the memory
map [0], so I'm not sure Xen would be in a position to select a
suitable unpopulated page anyway.

[0] https://lore.kernel.org/xen-devel/YFx80wYt%2FKcHanC7@smile.fi.intel.com/

> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -66,8 +66,7 @@ boolean_param("force-ept", opt_force_ept
>  static void vmx_ctxt_switch_from(struct vcpu *v);
>  static void vmx_ctxt_switch_to(struct vcpu *v);
>  
> -static int  vmx_alloc_vlapic_mapping(struct domain *d);
> -static void vmx_free_vlapic_mapping(struct domain *d);
> +static int alloc_vlapic_mapping(void);
>  static void vmx_install_vlapic_mapping(struct vcpu *v);
>  static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
>                                  unsigned int flags);
> @@ -78,6 +77,8 @@ static int vmx_msr_read_intercept(unsign
>  static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
>  static void vmx_invlpg(struct vcpu *v, unsigned long linear);
>  
> +static mfn_t __read_mostly apic_access_mfn;
> +
>  /* Values for domain's ->arch.hvm_domain.pi_ops.flags. */
>  #define PI_CSW_FROM (1u << 0)
>  #define PI_CSW_TO   (1u << 1)
> @@ -401,7 +402,6 @@ static int vmx_domain_initialise(struct
>          .to   = vmx_ctxt_switch_to,
>          .tail = vmx_do_resume,
>      };
> -    int rc;
>  
>      d->arch.ctxt_switch = &csw;
>  
> @@ -411,28 +411,22 @@ static int vmx_domain_initialise(struct
>       */
>      d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp;
>  
> -    if ( !has_vlapic(d) )
> -        return 0;
> -
> -    if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
> -        return rc;
> -
>      return 0;
>  }
>  
> -static void vmx_domain_relinquish_resources(struct domain *d)
> +static void domain_creation_finished(struct domain *d)
>  {
> -    if ( !has_vlapic(d) )
> +    gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE);

Worth making it const static?

> +    uint8_t ipat;
> +
> +    if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, _mfn(0)) )

It would be better to use INVALID_MFN here, and init apic_access_mfn
to that value.

>          return;
>  
> -    vmx_free_vlapic_mapping(d);
> -}
> +    ASSERT(epte_get_entry_emt(d, gfn_x(gfn), apic_access_mfn, 0, &ipat,
> +                              true) == MTRR_TYPE_WRBACK);
> +    ASSERT(ipat);
>  
> -static void domain_creation_finished(struct domain *d)
> -{
> -    if ( has_vlapic(d) && !mfn_eq(d->arch.hvm.vmx.apic_access_mfn, _mfn(0)) &&
> -         set_mmio_p2m_entry(d, gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE),
> -                            d->arch.hvm.vmx.apic_access_mfn, PAGE_ORDER_4K) )
> +    if ( set_mmio_p2m_entry(d, gfn, apic_access_mfn, PAGE_ORDER_4K) )
>          domain_crash(d);
>  }
>  
> @@ -2415,7 +2409,6 @@ static struct hvm_function_table __initd
>      .cpu_up_prepare       = vmx_cpu_up_prepare,
>      .cpu_dead             = vmx_cpu_dead,
>      .domain_initialise    = vmx_domain_initialise,
> -    .domain_relinquish_resources = vmx_domain_relinquish_resources,
>      .domain_creation_finished = domain_creation_finished,
>      .vcpu_initialise      = vmx_vcpu_initialise,
>      .vcpu_destroy         = vmx_vcpu_destroy,
> @@ -2662,7 +2655,7 @@ const struct hvm_function_table * __init
>  {
>      set_in_cr4(X86_CR4_VMXE);
>  
> -    if ( vmx_vmcs_init() )
> +    if ( vmx_vmcs_init() || alloc_vlapic_mapping() )
>      {
>          printk("VMX: failed to initialise.\n");
>          return NULL;
> @@ -3224,7 +3217,7 @@ gp_fault:
>      return X86EMUL_EXCEPTION;
>  }
>  
> -static int vmx_alloc_vlapic_mapping(struct domain *d)
> +static int __init alloc_vlapic_mapping(void)
>  {
>      struct page_info *pg;
>      mfn_t mfn;
> @@ -3232,52 +3225,31 @@ static int vmx_alloc_vlapic_mapping(stru
>      if ( !cpu_has_vmx_virtualize_apic_accesses )
>          return 0;
>  
> -    pg = alloc_domheap_page(d, MEMF_no_refcount);
> +    pg = alloc_domheap_page(NULL, 0);
>      if ( !pg )
>          return -ENOMEM;
>  
> -    if ( !get_page_and_type(pg, d, PGT_writable_page) )
> -    {
> -        /*
> -         * The domain can't possibly know about this page yet, so failure
> -         * here is a clear indication of something fishy going on.
> -         */
> -        domain_crash(d);
> -        return -ENODATA;
> -    }
> +    /* Arrange for epte_get_entry_emt() to recognize this page as "special". */
> +    pg->count_info |= PGC_extra;
>  
>      mfn = page_to_mfn(pg);
>      clear_domain_page(mfn);
> -    d->arch.hvm.vmx.apic_access_mfn = mfn;
> +    apic_access_mfn = mfn;
>  
>      return 0;
>  }
>  
> -static void vmx_free_vlapic_mapping(struct domain *d)
> -{
> -    mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn;
> -
> -    d->arch.hvm.vmx.apic_access_mfn = _mfn(0);
> -    if ( !mfn_eq(mfn, _mfn(0)) )
> -    {
> -        struct page_info *pg = mfn_to_page(mfn);
> -
> -        put_page_alloc_ref(pg);
> -        put_page_and_type(pg);
> -    }
> -}
> -
>  static void vmx_install_vlapic_mapping(struct vcpu *v)
>  {
>      paddr_t virt_page_ma, apic_page_ma;
>  
> -    if ( mfn_eq(v->domain->arch.hvm.vmx.apic_access_mfn, _mfn(0)) )
> +    if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, _mfn(0)) )
>          return;
>  
>      ASSERT(cpu_has_vmx_virtualize_apic_accesses);
>  
>      virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
> -    apic_page_ma = mfn_to_maddr(v->domain->arch.hvm.vmx.apic_access_mfn);
> +    apic_page_ma = mfn_to_maddr(apic_access_mfn);
>  
>      vmx_vmcs_enter(v);
>      __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
> --- a/xen/arch/x86/mm/shadow/set.c
> +++ b/xen/arch/x86/mm/shadow/set.c
> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
>      ASSERT(!sh_l1e_is_magic(sl1e));
>      ASSERT(shadow_mode_refcounts(d));
>  
> +    /*
> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
> +     * get accessed, and hence there's no need to refcount it (and refcounting
> +     * would fail, due to the page having no owner).
> +     */
> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )

I find this assignment inside the parameter list quite ugly, I would
rather split it on it's own line.

> +    {
> +        const struct page_info *pg = mfn_to_page(mfn);
> +
> +        if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) )
> +        {
> +            ASSERT(type == p2m_mmio_direct);
> +            return 0;

Are there any other pages that could pass this check? I don't think
so, but wanted to assert.

Thanks, Roger.
Jan Beulich April 13, 2021, 9:24 a.m. UTC | #2
On 12.04.2021 17:31, Roger Pau Monné wrote:
> On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote:
>> The address of this page is used by the CPU only to recognize when to
>> access the virtual APIC page instead. No accesses would ever go to this
>> page. It only needs to be present in the (CPU) page tables so that
>> address translation will produce its address as result for respective
>> accesses.
>>
>> By making this page global, we also eliminate the need to refcount it,
>> or to assign it to any domain in the first place.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
>> ---
>> v4: Set PGC_extra on the page. Make shadow mode work.
>> v3: Split p2m insertion change to a separate patch.
>> v2: Avoid insertion when !has_vlapic(). Split off change to
>>     p2m_get_iommu_flags().
>> ---
>> I did further consider not allocating any real page at all, but just
>> using the address of some unpopulated space (which would require
>> announcing this page as reserved to Dom0, so it wouldn't put any PCI
>> MMIO BARs there). But I thought this would be too controversial, because
>> of the possible risks associated with this.
> 
> Really seems more trouble than reward. Also there are systems with
> MMIO regions in holes on the memory map, like the issue I had with the
> Intel pinctrl stuff that had an MMIO region in a hole on the memory
> map [0], so I'm not sure Xen would be in a position to select a
> suitable unpopulated page anyway.
> 
> [0] https://lore.kernel.org/xen-devel/YFx80wYt%2FKcHanC7@smile.fi.intel.com/

Yeah, I had seen that. What I'm having trouble to understand is how the
OS will know to avoid that range for e.g. placing BARs.

>> @@ -411,28 +411,22 @@ static int vmx_domain_initialise(struct
>>       */
>>      d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp;
>>  
>> -    if ( !has_vlapic(d) )
>> -        return 0;
>> -
>> -    if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
>> -        return rc;
>> -
>>      return 0;
>>  }
>>  
>> -static void vmx_domain_relinquish_resources(struct domain *d)
>> +static void domain_creation_finished(struct domain *d)
>>  {
>> -    if ( !has_vlapic(d) )
>> +    gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE);
> 
> Worth making it const static?

The compiler ought to be able to fold this into a suitable constant
at the use site. Definitely not static imo, and I see little point
in making a local variable const, unless one really wants to
document something very special.

>> +    uint8_t ipat;
>> +
>> +    if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, _mfn(0)) )
> 
> It would be better to use INVALID_MFN here, and init apic_access_mfn
> to that value.

Oh, yes, that's easier possible now that the variable is static.

>> --- a/xen/arch/x86/mm/shadow/set.c
>> +++ b/xen/arch/x86/mm/shadow/set.c
>> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
>>      ASSERT(!sh_l1e_is_magic(sl1e));
>>      ASSERT(shadow_mode_refcounts(d));
>>  
>> +    /*
>> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
>> +     * get accessed, and hence there's no need to refcount it (and refcounting
>> +     * would fail, due to the page having no owner).
>> +     */
>> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )
> 
> I find this assignment inside the parameter list quite ugly, I would
> rather split it on it's own line.

Well, okay. To be honest I'm not even sure why I did it this way, as
I could have expected a respective comment.

>> +    {
>> +        const struct page_info *pg = mfn_to_page(mfn);
>> +
>> +        if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) )
>> +        {
>> +            ASSERT(type == p2m_mmio_direct);
>> +            return 0;
> 
> Are there any other pages that could pass this check? I don't think
> so, but wanted to assert.

"Normal" extra pages have an owner, so no, there aren't any others.
If and when any appear, this may need further customizing, albeit
generally I'd hope further pages matching this pattern would also
want similar treatment.

Jan
Roger Pau Monné April 13, 2021, 10:18 a.m. UTC | #3
On Tue, Apr 13, 2021 at 11:24:09AM +0200, Jan Beulich wrote:
> On 12.04.2021 17:31, Roger Pau Monné wrote:
> > On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote:
> >> The address of this page is used by the CPU only to recognize when to
> >> access the virtual APIC page instead. No accesses would ever go to this
> >> page. It only needs to be present in the (CPU) page tables so that
> >> address translation will produce its address as result for respective
> >> accesses.
> >>
> >> By making this page global, we also eliminate the need to refcount it,
> >> or to assign it to any domain in the first place.
> >>
> >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> >> ---
> >> v4: Set PGC_extra on the page. Make shadow mode work.
> >> v3: Split p2m insertion change to a separate patch.
> >> v2: Avoid insertion when !has_vlapic(). Split off change to
> >>     p2m_get_iommu_flags().
> >> ---
> >> I did further consider not allocating any real page at all, but just
> >> using the address of some unpopulated space (which would require
> >> announcing this page as reserved to Dom0, so it wouldn't put any PCI
> >> MMIO BARs there). But I thought this would be too controversial, because
> >> of the possible risks associated with this.
> > 
> > Really seems more trouble than reward. Also there are systems with
> > MMIO regions in holes on the memory map, like the issue I had with the
> > Intel pinctrl stuff that had an MMIO region in a hole on the memory
> > map [0], so I'm not sure Xen would be in a position to select a
> > suitable unpopulated page anyway.
> > 
> > [0] https://lore.kernel.org/xen-devel/YFx80wYt%2FKcHanC7@smile.fi.intel.com/
> 
> Yeah, I had seen that. What I'm having trouble to understand is how the
> OS will know to avoid that range for e.g. placing BARs.

No idea really, I assume they expect that you parse all possible ACPI
devices from the dynamic tables before deciding on any BAR placements?

I still consider a bug that the pinctrl MMIO region is not marked as
reserved in the memory map.

> >> +    {
> >> +        const struct page_info *pg = mfn_to_page(mfn);
> >> +
> >> +        if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) )
> >> +        {
> >> +            ASSERT(type == p2m_mmio_direct);
> >> +            return 0;
> > 
> > Are there any other pages that could pass this check? I don't think
> > so, but wanted to assert.
> 
> "Normal" extra pages have an owner, so no, there aren't any others.
> If and when any appear, this may need further customizing, albeit
> generally I'd hope further pages matching this pattern would also
> want similar treatment.

I wonder whether we want to add an assert here to make sure only the
APIC access page receives this special handling by the shadow code,
but maybe that's a bit too much?

Thanks, Roger.
Jan Beulich April 13, 2021, 12:03 p.m. UTC | #4
On 13.04.2021 12:18, Roger Pau Monné wrote:
> On Tue, Apr 13, 2021 at 11:24:09AM +0200, Jan Beulich wrote:
>> On 12.04.2021 17:31, Roger Pau Monné wrote:
>>> On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote:
>>>> +    {
>>>> +        const struct page_info *pg = mfn_to_page(mfn);
>>>> +
>>>> +        if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) )
>>>> +        {
>>>> +            ASSERT(type == p2m_mmio_direct);
>>>> +            return 0;
>>>
>>> Are there any other pages that could pass this check? I don't think
>>> so, but wanted to assert.
>>
>> "Normal" extra pages have an owner, so no, there aren't any others.
>> If and when any appear, this may need further customizing, albeit
>> generally I'd hope further pages matching this pattern would also
>> want similar treatment.
> 
> I wonder whether we want to add an assert here to make sure only the
> APIC access page receives this special handling by the shadow code,
> but maybe that's a bit too much?

I think so, yes: It would require either a separate function or
making the variable global. Both feel like a layering violation.

Jan
Roger Pau Monné April 13, 2021, 1:03 p.m. UTC | #5
On Tue, Apr 13, 2021 at 02:03:52PM +0200, Jan Beulich wrote:
> On 13.04.2021 12:18, Roger Pau Monné wrote:
> > On Tue, Apr 13, 2021 at 11:24:09AM +0200, Jan Beulich wrote:
> >> On 12.04.2021 17:31, Roger Pau Monné wrote:
> >>> On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote:
> >>>> +    {
> >>>> +        const struct page_info *pg = mfn_to_page(mfn);
> >>>> +
> >>>> +        if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) )
> >>>> +        {
> >>>> +            ASSERT(type == p2m_mmio_direct);
> >>>> +            return 0;
> >>>
> >>> Are there any other pages that could pass this check? I don't think
> >>> so, but wanted to assert.
> >>
> >> "Normal" extra pages have an owner, so no, there aren't any others.
> >> If and when any appear, this may need further customizing, albeit
> >> generally I'd hope further pages matching this pattern would also
> >> want similar treatment.
> > 
> > I wonder whether we want to add an assert here to make sure only the
> > APIC access page receives this special handling by the shadow code,
> > but maybe that's a bit too much?
> 
> I think so, yes: It would require either a separate function or
> making the variable global. Both feel like a layering violation.

Right, with the INVALID_MFN change and the shadow mfn_valid style
adjustment:

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.
Tim Deegan April 17, 2021, 7:24 p.m. UTC | #6
Hi,

Apologies for not sending comments before, and thanks for the ping.

At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote:
> The address of this page is used by the CPU only to recognize when to
> access the virtual APIC page instead. No accesses would ever go to this
> page. It only needs to be present in the (CPU) page tables so that
> address translation will produce its address as result for respective
> accesses.
> 
> By making this page global, we also eliminate the need to refcount it,
> or to assign it to any domain in the first place.

What is the aim here?  To save 4k per domain?  It seems to come out
about even for adding and removing code. 

> --- a/xen/arch/x86/mm/shadow/set.c
> +++ b/xen/arch/x86/mm/shadow/set.c
> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
>      ASSERT(!sh_l1e_is_magic(sl1e));
>      ASSERT(shadow_mode_refcounts(d));
>  
> +    /*
> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
> +     * get accessed, and hence there's no need to refcount it (and refcounting
> +     * would fail, due to the page having no owner).
> +     */
> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )

Would it be better to check specifically for mfn == apic_access_mfn
(and apic_access_mfn != 0, I guess)?  If we want this behaviour for
for all un-owned PGC_extra MFNs it would be good to explain that in the
comments.

Cheers,

Tim.
Jan Beulich April 19, 2021, 11:25 a.m. UTC | #7
On 17.04.2021 21:24, Tim Deegan wrote:
> At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote:
>> The address of this page is used by the CPU only to recognize when to
>> access the virtual APIC page instead. No accesses would ever go to this
>> page. It only needs to be present in the (CPU) page tables so that
>> address translation will produce its address as result for respective
>> accesses.
>>
>> By making this page global, we also eliminate the need to refcount it,
>> or to assign it to any domain in the first place.
> 
> What is the aim here?  To save 4k per domain?  It seems to come out
> about even for adding and removing code. 

True, but still it looks wrong to me to use a page per guest when one
per host suffices. Think about many tiny, short-lived VMs (as in
Tamas'es VM forking).

>> --- a/xen/arch/x86/mm/shadow/set.c
>> +++ b/xen/arch/x86/mm/shadow/set.c
>> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
>>      ASSERT(!sh_l1e_is_magic(sl1e));
>>      ASSERT(shadow_mode_refcounts(d));
>>  
>> +    /*
>> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
>> +     * get accessed, and hence there's no need to refcount it (and refcounting
>> +     * would fail, due to the page having no owner).
>> +     */
>> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )
> 
> Would it be better to check specifically for mfn == apic_access_mfn
> (and apic_access_mfn != 0, I guess)?

Roger did ask about the same - I neither want to expose apic_access_mfn
outside its CU, nor do I want to introduce an accessor function. Both
feel like layering violations to me.

>  If we want this behaviour for
> for all un-owned PGC_extra MFNs it would be good to explain that in the
> comments.

This is hard to tell without knowing which (or even if) further such
PGC_extra pages will appear. Hence any comment to that effect would be
guesswork at best. Of course I can add e.g. "Other pages with the same
properties would be treated the same", if that's what you're after?

Jan
Tim Deegan April 22, 2021, 7:42 a.m. UTC | #8
At 13:25 +0200 on 19 Apr (1618838726), Jan Beulich wrote:
> On 17.04.2021 21:24, Tim Deegan wrote:
> > At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote:
> >> By making this page global, we also eliminate the need to refcount it,
> >> or to assign it to any domain in the first place.
> > 
> > What is the aim here?  To save 4k per domain?  It seems to come out
> > about even for adding and removing code. 
> 
> True, but still it looks wrong to me to use a page per guest when one
> per host suffices. Think about many tiny, short-lived VMs (as in
> Tamas'es VM forking).

OK, fair enough.

> >> --- a/xen/arch/x86/mm/shadow/set.c
> >> +++ b/xen/arch/x86/mm/shadow/set.c
> >> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
> >>      ASSERT(!sh_l1e_is_magic(sl1e));
> >>      ASSERT(shadow_mode_refcounts(d));
> >>  
> >> +    /*
> >> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
> >> +     * get accessed, and hence there's no need to refcount it (and refcounting
> >> +     * would fail, due to the page having no owner).
> >> +     */
> >> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )
> > 
> > Would it be better to check specifically for mfn == apic_access_mfn
> > (and apic_access_mfn != 0, I guess)?
> 
> Roger did ask about the same - I neither want to expose apic_access_mfn
> outside its CU, nor do I want to introduce an accessor function. Both
> feel like layering violations to me.

I think that this is even more of a layering violation: what we
actually want is to allow un-refcounted mappings of the
apic_access_mfn, but to do it we're relying on an internal
implementation detail (that it happens to be un-owned and PGC_extra)
rather than giving ourselves an API.

And so we're tangled up talking about how to write comments to warn
our future selves about the possible side-effects.

> >  If we want this behaviour for
> > for all un-owned PGC_extra MFNs it would be good to explain that in the
> > comments.
> 
> This is hard to tell without knowing which (or even if) further such
> PGC_extra pages will appear. Hence any comment to that effect would be
> guesswork at best. Of course I can add e.g. "Other pages with the same
> properties would be treated the same", if that's what you're after?

If you want to go this way there should be a comment here saying that
we're allowing this for all PGC_extra pages because we need it for
apic_access_mfn, and a comment at PGC_extra saying that it has this
effect.

Cheers,

Tim
Jan Beulich April 22, 2021, 9:38 a.m. UTC | #9
On 22.04.2021 09:42, Tim Deegan wrote:
> At 13:25 +0200 on 19 Apr (1618838726), Jan Beulich wrote:
>> On 17.04.2021 21:24, Tim Deegan wrote:
>>> At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote:
>>>> --- a/xen/arch/x86/mm/shadow/set.c
>>>> +++ b/xen/arch/x86/mm/shadow/set.c
>>>> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
>>>>      ASSERT(!sh_l1e_is_magic(sl1e));
>>>>      ASSERT(shadow_mode_refcounts(d));
>>>>  
>>>> +    /*
>>>> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
>>>> +     * get accessed, and hence there's no need to refcount it (and refcounting
>>>> +     * would fail, due to the page having no owner).
>>>> +     */
>>>> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )
>>>
>>> Would it be better to check specifically for mfn == apic_access_mfn
>>> (and apic_access_mfn != 0, I guess)?
>>
>> Roger did ask about the same - I neither want to expose apic_access_mfn
>> outside its CU, nor do I want to introduce an accessor function. Both
>> feel like layering violations to me.
> 
> I think that this is even more of a layering violation: what we
> actually want is to allow un-refcounted mappings of the
> apic_access_mfn, but to do it we're relying on an internal
> implementation detail (that it happens to be un-owned and PGC_extra)
> rather than giving ourselves an API.
> 
> And so we're tangled up talking about how to write comments to warn
> our future selves about the possible side-effects.
> 
>>>  If we want this behaviour for
>>> for all un-owned PGC_extra MFNs it would be good to explain that in the
>>> comments.
>>
>> This is hard to tell without knowing which (or even if) further such
>> PGC_extra pages will appear. Hence any comment to that effect would be
>> guesswork at best. Of course I can add e.g. "Other pages with the same
>> properties would be treated the same", if that's what you're after?
> 
> If you want to go this way there should be a comment here saying that
> we're allowing this for all PGC_extra pages because we need it for
> apic_access_mfn, and a comment at PGC_extra saying that it has this
> effect.

So (along with a comment to this effect) how about I make
page_suppress_refcounting() and page_refcounting_suppressed() helpers?
The former would set PGC_extra on the page and assert the page has no
owner, while the latter would subsume the checks done here. The only
question then is what to do with the ASSERT(type == p2m_mmio_direct):
That's still a property of the APIC access MFN which may or may not
hold for future such pages. (It can't be part of the new helper anyway
as "put" doesn't have the type available.)

Jan
Tim Deegan April 22, 2021, 3:05 p.m. UTC | #10
At 11:38 +0200 on 22 Apr (1619091522), Jan Beulich wrote:
> On 22.04.2021 09:42, Tim Deegan wrote:
> > At 13:25 +0200 on 19 Apr (1618838726), Jan Beulich wrote:
> >> On 17.04.2021 21:24, Tim Deegan wrote:
> >>> At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote:
> >>>> --- a/xen/arch/x86/mm/shadow/set.c
> >>>> +++ b/xen/arch/x86/mm/shadow/set.c
> >>>> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
> >>>>      ASSERT(!sh_l1e_is_magic(sl1e));
> >>>>      ASSERT(shadow_mode_refcounts(d));
> >>>>  
> >>>> +    /*
> >>>> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
> >>>> +     * get accessed, and hence there's no need to refcount it (and refcounting
> >>>> +     * would fail, due to the page having no owner).
> >>>> +     */
> >>>> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )
> >>>
> >>> Would it be better to check specifically for mfn == apic_access_mfn
> >>> (and apic_access_mfn != 0, I guess)?
> >>
> >> Roger did ask about the same - I neither want to expose apic_access_mfn
> >> outside its CU, nor do I want to introduce an accessor function. Both
> >> feel like layering violations to me.
> > 
> > I think that this is even more of a layering violation: what we
> > actually want is to allow un-refcounted mappings of the
> > apic_access_mfn, but to do it we're relying on an internal
> > implementation detail (that it happens to be un-owned and PGC_extra)
> > rather than giving ourselves an API.
> > 
> > And so we're tangled up talking about how to write comments to warn
> > our future selves about the possible side-effects.
> > 
> >>>  If we want this behaviour for
> >>> for all un-owned PGC_extra MFNs it would be good to explain that in the
> >>> comments.
> >>
> >> This is hard to tell without knowing which (or even if) further such
> >> PGC_extra pages will appear. Hence any comment to that effect would be
> >> guesswork at best. Of course I can add e.g. "Other pages with the same
> >> properties would be treated the same", if that's what you're after?
> > 
> > If you want to go this way there should be a comment here saying that
> > we're allowing this for all PGC_extra pages because we need it for
> > apic_access_mfn, and a comment at PGC_extra saying that it has this
> > effect.
> 
> So (along with a comment to this effect) how about I make
> page_suppress_refcounting() and page_refcounting_suppressed() helpers?
> The former would set PGC_extra on the page and assert the page has no
> owner, while the latter would subsume the checks done here.

That sounds good to me.

> The only
> question then is what to do with the ASSERT(type == p2m_mmio_direct):
> That's still a property of the APIC access MFN which may or may not
> hold for future such pages. (It can't be part of the new helper anyway
> as "put" doesn't have the type available.)

I think we might drop that assertion, since the new mechanism would be
more general.

Cheers,

Tim.
diff mbox series

Patch

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -66,8 +66,7 @@  boolean_param("force-ept", opt_force_ept
 static void vmx_ctxt_switch_from(struct vcpu *v);
 static void vmx_ctxt_switch_to(struct vcpu *v);
 
-static int  vmx_alloc_vlapic_mapping(struct domain *d);
-static void vmx_free_vlapic_mapping(struct domain *d);
+static int alloc_vlapic_mapping(void);
 static void vmx_install_vlapic_mapping(struct vcpu *v);
 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
                                 unsigned int flags);
@@ -78,6 +77,8 @@  static int vmx_msr_read_intercept(unsign
 static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
 static void vmx_invlpg(struct vcpu *v, unsigned long linear);
 
+static mfn_t __read_mostly apic_access_mfn;
+
 /* Values for domain's ->arch.hvm_domain.pi_ops.flags. */
 #define PI_CSW_FROM (1u << 0)
 #define PI_CSW_TO   (1u << 1)
@@ -401,7 +402,6 @@  static int vmx_domain_initialise(struct
         .to   = vmx_ctxt_switch_to,
         .tail = vmx_do_resume,
     };
-    int rc;
 
     d->arch.ctxt_switch = &csw;
 
@@ -411,28 +411,22 @@  static int vmx_domain_initialise(struct
      */
     d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp;
 
-    if ( !has_vlapic(d) )
-        return 0;
-
-    if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
-        return rc;
-
     return 0;
 }
 
-static void vmx_domain_relinquish_resources(struct domain *d)
+static void domain_creation_finished(struct domain *d)
 {
-    if ( !has_vlapic(d) )
+    gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE);
+    uint8_t ipat;
+
+    if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, _mfn(0)) )
         return;
 
-    vmx_free_vlapic_mapping(d);
-}
+    ASSERT(epte_get_entry_emt(d, gfn_x(gfn), apic_access_mfn, 0, &ipat,
+                              true) == MTRR_TYPE_WRBACK);
+    ASSERT(ipat);
 
-static void domain_creation_finished(struct domain *d)
-{
-    if ( has_vlapic(d) && !mfn_eq(d->arch.hvm.vmx.apic_access_mfn, _mfn(0)) &&
-         set_mmio_p2m_entry(d, gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE),
-                            d->arch.hvm.vmx.apic_access_mfn, PAGE_ORDER_4K) )
+    if ( set_mmio_p2m_entry(d, gfn, apic_access_mfn, PAGE_ORDER_4K) )
         domain_crash(d);
 }
 
@@ -2415,7 +2409,6 @@  static struct hvm_function_table __initd
     .cpu_up_prepare       = vmx_cpu_up_prepare,
     .cpu_dead             = vmx_cpu_dead,
     .domain_initialise    = vmx_domain_initialise,
-    .domain_relinquish_resources = vmx_domain_relinquish_resources,
     .domain_creation_finished = domain_creation_finished,
     .vcpu_initialise      = vmx_vcpu_initialise,
     .vcpu_destroy         = vmx_vcpu_destroy,
@@ -2662,7 +2655,7 @@  const struct hvm_function_table * __init
 {
     set_in_cr4(X86_CR4_VMXE);
 
-    if ( vmx_vmcs_init() )
+    if ( vmx_vmcs_init() || alloc_vlapic_mapping() )
     {
         printk("VMX: failed to initialise.\n");
         return NULL;
@@ -3224,7 +3217,7 @@  gp_fault:
     return X86EMUL_EXCEPTION;
 }
 
-static int vmx_alloc_vlapic_mapping(struct domain *d)
+static int __init alloc_vlapic_mapping(void)
 {
     struct page_info *pg;
     mfn_t mfn;
@@ -3232,52 +3225,31 @@  static int vmx_alloc_vlapic_mapping(stru
     if ( !cpu_has_vmx_virtualize_apic_accesses )
         return 0;
 
-    pg = alloc_domheap_page(d, MEMF_no_refcount);
+    pg = alloc_domheap_page(NULL, 0);
     if ( !pg )
         return -ENOMEM;
 
-    if ( !get_page_and_type(pg, d, PGT_writable_page) )
-    {
-        /*
-         * The domain can't possibly know about this page yet, so failure
-         * here is a clear indication of something fishy going on.
-         */
-        domain_crash(d);
-        return -ENODATA;
-    }
+    /* Arrange for epte_get_entry_emt() to recognize this page as "special". */
+    pg->count_info |= PGC_extra;
 
     mfn = page_to_mfn(pg);
     clear_domain_page(mfn);
-    d->arch.hvm.vmx.apic_access_mfn = mfn;
+    apic_access_mfn = mfn;
 
     return 0;
 }
 
-static void vmx_free_vlapic_mapping(struct domain *d)
-{
-    mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn;
-
-    d->arch.hvm.vmx.apic_access_mfn = _mfn(0);
-    if ( !mfn_eq(mfn, _mfn(0)) )
-    {
-        struct page_info *pg = mfn_to_page(mfn);
-
-        put_page_alloc_ref(pg);
-        put_page_and_type(pg);
-    }
-}
-
 static void vmx_install_vlapic_mapping(struct vcpu *v)
 {
     paddr_t virt_page_ma, apic_page_ma;
 
-    if ( mfn_eq(v->domain->arch.hvm.vmx.apic_access_mfn, _mfn(0)) )
+    if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, _mfn(0)) )
         return;
 
     ASSERT(cpu_has_vmx_virtualize_apic_accesses);
 
     virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
-    apic_page_ma = mfn_to_maddr(v->domain->arch.hvm.vmx.apic_access_mfn);
+    apic_page_ma = mfn_to_maddr(apic_access_mfn);
 
     vmx_vmcs_enter(v);
     __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
--- a/xen/arch/x86/mm/shadow/set.c
+++ b/xen/arch/x86/mm/shadow/set.c
@@ -94,6 +94,22 @@  shadow_get_page_from_l1e(shadow_l1e_t sl
     ASSERT(!sh_l1e_is_magic(sl1e));
     ASSERT(shadow_mode_refcounts(d));
 
+    /*
+     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
+     * get accessed, and hence there's no need to refcount it (and refcounting
+     * would fail, due to the page having no owner).
+     */
+    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )
+    {
+        const struct page_info *pg = mfn_to_page(mfn);
+
+        if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) )
+        {
+            ASSERT(type == p2m_mmio_direct);
+            return 0;
+        }
+    }
+
     res = get_page_from_l1e(sl1e, d, d);
 
     /*
--- a/xen/arch/x86/mm/shadow/types.h
+++ b/xen/arch/x86/mm/shadow/types.h
@@ -276,9 +276,20 @@  int shadow_set_l4e(struct domain *d, sha
 static void inline
 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
 {
+    mfn_t mfn;
+
     if ( !shadow_mode_refcounts(d) )
         return;
 
+    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) )
+    {
+        const struct page_info *pg = mfn_to_page(mfn);
+
+        /* See the respective comment in shadow_get_page_from_l1e(). */
+        if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) )
+            return;
+    }
+
     put_page_from_l1e(sl1e, d);
 }
 
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -58,7 +58,6 @@  struct ept_data {
 #define _VMX_DOMAIN_PML_ENABLED    0
 #define VMX_DOMAIN_PML_ENABLED     (1ul << _VMX_DOMAIN_PML_ENABLED)
 struct vmx_domain {
-    mfn_t apic_access_mfn;
     /* VMX_DOMAIN_* */
     unsigned int status;