Message ID | 1c489e77-6e65-6121-6c28-3c4bd377223c@suse.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v4] VMX: use a single, global APIC access page | expand |
On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote: > The address of this page is used by the CPU only to recognize when to > access the virtual APIC page instead. No accesses would ever go to this > page. It only needs to be present in the (CPU) page tables so that > address translation will produce its address as result for respective > accesses. > > By making this page global, we also eliminate the need to refcount it, > or to assign it to any domain in the first place. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > Reviewed-by: Kevin Tian <kevin.tian@intel.com> > --- > v4: Set PGC_extra on the page. Make shadow mode work. > v3: Split p2m insertion change to a separate patch. > v2: Avoid insertion when !has_vlapic(). Split off change to > p2m_get_iommu_flags(). > --- > I did further consider not allocating any real page at all, but just > using the address of some unpopulated space (which would require > announcing this page as reserved to Dom0, so it wouldn't put any PCI > MMIO BARs there). But I thought this would be too controversial, because > of the possible risks associated with this. Really seems more trouble than reward. Also there are systems with MMIO regions in holes on the memory map, like the issue I had with the Intel pinctrl stuff that had an MMIO region in a hole on the memory map [0], so I'm not sure Xen would be in a position to select a suitable unpopulated page anyway. 
[0] https://lore.kernel.org/xen-devel/YFx80wYt%2FKcHanC7@smile.fi.intel.com/ > --- a/xen/arch/x86/hvm/vmx/vmx.c > +++ b/xen/arch/x86/hvm/vmx/vmx.c > @@ -66,8 +66,7 @@ boolean_param("force-ept", opt_force_ept > static void vmx_ctxt_switch_from(struct vcpu *v); > static void vmx_ctxt_switch_to(struct vcpu *v); > > -static int vmx_alloc_vlapic_mapping(struct domain *d); > -static void vmx_free_vlapic_mapping(struct domain *d); > +static int alloc_vlapic_mapping(void); > static void vmx_install_vlapic_mapping(struct vcpu *v); > static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr, > unsigned int flags); > @@ -78,6 +77,8 @@ static int vmx_msr_read_intercept(unsign > static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content); > static void vmx_invlpg(struct vcpu *v, unsigned long linear); > > +static mfn_t __read_mostly apic_access_mfn; > + > /* Values for domain's ->arch.hvm_domain.pi_ops.flags. */ > #define PI_CSW_FROM (1u << 0) > #define PI_CSW_TO (1u << 1) > @@ -401,7 +402,6 @@ static int vmx_domain_initialise(struct > .to = vmx_ctxt_switch_to, > .tail = vmx_do_resume, > }; > - int rc; > > d->arch.ctxt_switch = &csw; > > @@ -411,28 +411,22 @@ static int vmx_domain_initialise(struct > */ > d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp; > > - if ( !has_vlapic(d) ) > - return 0; > - > - if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) > - return rc; > - > return 0; > } > > -static void vmx_domain_relinquish_resources(struct domain *d) > +static void domain_creation_finished(struct domain *d) > { > - if ( !has_vlapic(d) ) > + gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE); Worth making it const static? > + uint8_t ipat; > + > + if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, _mfn(0)) ) It would be better to use INVALID_MFN here, and init apic_access_mfn to that value. 
> return; > > - vmx_free_vlapic_mapping(d); > -} > + ASSERT(epte_get_entry_emt(d, gfn_x(gfn), apic_access_mfn, 0, &ipat, > + true) == MTRR_TYPE_WRBACK); > + ASSERT(ipat); > > -static void domain_creation_finished(struct domain *d) > -{ > - if ( has_vlapic(d) && !mfn_eq(d->arch.hvm.vmx.apic_access_mfn, _mfn(0)) && > - set_mmio_p2m_entry(d, gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE), > - d->arch.hvm.vmx.apic_access_mfn, PAGE_ORDER_4K) ) > + if ( set_mmio_p2m_entry(d, gfn, apic_access_mfn, PAGE_ORDER_4K) ) > domain_crash(d); > } > > @@ -2415,7 +2409,6 @@ static struct hvm_function_table __initd > .cpu_up_prepare = vmx_cpu_up_prepare, > .cpu_dead = vmx_cpu_dead, > .domain_initialise = vmx_domain_initialise, > - .domain_relinquish_resources = vmx_domain_relinquish_resources, > .domain_creation_finished = domain_creation_finished, > .vcpu_initialise = vmx_vcpu_initialise, > .vcpu_destroy = vmx_vcpu_destroy, > @@ -2662,7 +2655,7 @@ const struct hvm_function_table * __init > { > set_in_cr4(X86_CR4_VMXE); > > - if ( vmx_vmcs_init() ) > + if ( vmx_vmcs_init() || alloc_vlapic_mapping() ) > { > printk("VMX: failed to initialise.\n"); > return NULL; > @@ -3224,7 +3217,7 @@ gp_fault: > return X86EMUL_EXCEPTION; > } > > -static int vmx_alloc_vlapic_mapping(struct domain *d) > +static int __init alloc_vlapic_mapping(void) > { > struct page_info *pg; > mfn_t mfn; > @@ -3232,52 +3225,31 @@ static int vmx_alloc_vlapic_mapping(stru > if ( !cpu_has_vmx_virtualize_apic_accesses ) > return 0; > > - pg = alloc_domheap_page(d, MEMF_no_refcount); > + pg = alloc_domheap_page(NULL, 0); > if ( !pg ) > return -ENOMEM; > > - if ( !get_page_and_type(pg, d, PGT_writable_page) ) > - { > - /* > - * The domain can't possibly know about this page yet, so failure > - * here is a clear indication of something fishy going on. > - */ > - domain_crash(d); > - return -ENODATA; > - } > + /* Arrange for epte_get_entry_emt() to recognize this page as "special". 
*/ > + pg->count_info |= PGC_extra; > > mfn = page_to_mfn(pg); > clear_domain_page(mfn); > - d->arch.hvm.vmx.apic_access_mfn = mfn; > + apic_access_mfn = mfn; > > return 0; > } > > -static void vmx_free_vlapic_mapping(struct domain *d) > -{ > - mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn; > - > - d->arch.hvm.vmx.apic_access_mfn = _mfn(0); > - if ( !mfn_eq(mfn, _mfn(0)) ) > - { > - struct page_info *pg = mfn_to_page(mfn); > - > - put_page_alloc_ref(pg); > - put_page_and_type(pg); > - } > -} > - > static void vmx_install_vlapic_mapping(struct vcpu *v) > { > paddr_t virt_page_ma, apic_page_ma; > > - if ( mfn_eq(v->domain->arch.hvm.vmx.apic_access_mfn, _mfn(0)) ) > + if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, _mfn(0)) ) > return; > > ASSERT(cpu_has_vmx_virtualize_apic_accesses); > > virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page); > - apic_page_ma = mfn_to_maddr(v->domain->arch.hvm.vmx.apic_access_mfn); > + apic_page_ma = mfn_to_maddr(apic_access_mfn); > > vmx_vmcs_enter(v); > __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma); > --- a/xen/arch/x86/mm/shadow/set.c > +++ b/xen/arch/x86/mm/shadow/set.c > @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl > ASSERT(!sh_l1e_is_magic(sl1e)); > ASSERT(shadow_mode_refcounts(d)); > > + /* > + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually > + * get accessed, and hence there's no need to refcount it (and refcounting > + * would fail, due to the page having no owner). > + */ > + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) I find this assignment inside the parameter list quite ugly, I would rather split it on it's own line. > + { > + const struct page_info *pg = mfn_to_page(mfn); > + > + if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) ) > + { > + ASSERT(type == p2m_mmio_direct); > + return 0; Are there any other pages that could pass this check? I don't think so, but wanted to assert. Thanks, Roger.
On 12.04.2021 17:31, Roger Pau Monné wrote: > On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote: >> The address of this page is used by the CPU only to recognize when to >> access the virtual APIC page instead. No accesses would ever go to this >> page. It only needs to be present in the (CPU) page tables so that >> address translation will produce its address as result for respective >> accesses. >> >> By making this page global, we also eliminate the need to refcount it, >> or to assign it to any domain in the first place. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> Reviewed-by: Kevin Tian <kevin.tian@intel.com> >> --- >> v4: Set PGC_extra on the page. Make shadow mode work. >> v3: Split p2m insertion change to a separate patch. >> v2: Avoid insertion when !has_vlapic(). Split off change to >> p2m_get_iommu_flags(). >> --- >> I did further consider not allocating any real page at all, but just >> using the address of some unpopulated space (which would require >> announcing this page as reserved to Dom0, so it wouldn't put any PCI >> MMIO BARs there). But I thought this would be too controversial, because >> of the possible risks associated with this. > > Really seems more trouble than reward. Also there are systems with > MMIO regions in holes on the memory map, like the issue I had with the > Intel pinctrl stuff that had an MMIO region in a hole on the memory > map [0], so I'm not sure Xen would be in a position to select a > suitable unpopulated page anyway. > > [0] https://lore.kernel.org/xen-devel/YFx80wYt%2FKcHanC7@smile.fi.intel.com/ Yeah, I had seen that. What I'm having trouble to understand is how the OS will know to avoid that range for e.g. placing BARs. 
>> @@ -411,28 +411,22 @@ static int vmx_domain_initialise(struct >> */ >> d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp; >> >> - if ( !has_vlapic(d) ) >> - return 0; >> - >> - if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) >> - return rc; >> - >> return 0; >> } >> >> -static void vmx_domain_relinquish_resources(struct domain *d) >> +static void domain_creation_finished(struct domain *d) >> { >> - if ( !has_vlapic(d) ) >> + gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE); > > Worth making it const static? The compiler ought to be able to fold this into a suitable constant at the use site. Definitely not static imo, and I see little point in making a local variable const, unless one really wants to document something very special. >> + uint8_t ipat; >> + >> + if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, _mfn(0)) ) > > It would be better to use INVALID_MFN here, and init apic_access_mfn > to that value. Oh, yes, that's easier possible now that the variable is static. >> --- a/xen/arch/x86/mm/shadow/set.c >> +++ b/xen/arch/x86/mm/shadow/set.c >> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl >> ASSERT(!sh_l1e_is_magic(sl1e)); >> ASSERT(shadow_mode_refcounts(d)); >> >> + /* >> + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually >> + * get accessed, and hence there's no need to refcount it (and refcounting >> + * would fail, due to the page having no owner). >> + */ >> + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) > > I find this assignment inside the parameter list quite ugly, I would > rather split it on it's own line. Well, okay. To be honest I'm not even sure why I did it this way, as I could have expected a respective comment. >> + { >> + const struct page_info *pg = mfn_to_page(mfn); >> + >> + if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) ) >> + { >> + ASSERT(type == p2m_mmio_direct); >> + return 0; > > Are there any other pages that could pass this check? 
I don't think > so, but wanted to assert. "Normal" extra pages have an owner, so no, there aren't any others. If and when any appear, this may need further customizing, albeit generally I'd hope further pages matching this pattern would also want similar treatment. Jan
On Tue, Apr 13, 2021 at 11:24:09AM +0200, Jan Beulich wrote: > On 12.04.2021 17:31, Roger Pau Monné wrote: > > On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote: > >> The address of this page is used by the CPU only to recognize when to > >> access the virtual APIC page instead. No accesses would ever go to this > >> page. It only needs to be present in the (CPU) page tables so that > >> address translation will produce its address as result for respective > >> accesses. > >> > >> By making this page global, we also eliminate the need to refcount it, > >> or to assign it to any domain in the first place. > >> > >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > >> Reviewed-by: Kevin Tian <kevin.tian@intel.com> > >> --- > >> v4: Set PGC_extra on the page. Make shadow mode work. > >> v3: Split p2m insertion change to a separate patch. > >> v2: Avoid insertion when !has_vlapic(). Split off change to > >> p2m_get_iommu_flags(). > >> --- > >> I did further consider not allocating any real page at all, but just > >> using the address of some unpopulated space (which would require > >> announcing this page as reserved to Dom0, so it wouldn't put any PCI > >> MMIO BARs there). But I thought this would be too controversial, because > >> of the possible risks associated with this. > > > > Really seems more trouble than reward. Also there are systems with > > MMIO regions in holes on the memory map, like the issue I had with the > > Intel pinctrl stuff that had an MMIO region in a hole on the memory > > map [0], so I'm not sure Xen would be in a position to select a > > suitable unpopulated page anyway. > > > > [0] https://lore.kernel.org/xen-devel/YFx80wYt%2FKcHanC7@smile.fi.intel.com/ > > Yeah, I had seen that. What I'm having trouble to understand is how the > OS will know to avoid that range for e.g. placing BARs. No idea really, I assume they expect that you parse all possible ACPI devices from the dynamic tables before deciding on any BAR placements? 
I still consider it a bug that the pinctrl MMIO region is not marked as reserved in the memory map. > >> + { > >> + const struct page_info *pg = mfn_to_page(mfn); > >> + > >> + if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) ) > >> + { > >> + ASSERT(type == p2m_mmio_direct); > >> + return 0; > > > > Are there any other pages that could pass this check? I don't think > > so, but wanted to assert. > > "Normal" extra pages have an owner, so no, there aren't any others. > If and when any appear, this may need further customizing, albeit > generally I'd hope further pages matching this pattern would also > want similar treatment. I wonder whether we want to add an assert here to make sure only the APIC access page receives this special handling by the shadow code, but maybe that's a bit too much? Thanks, Roger.
On 13.04.2021 12:18, Roger Pau Monné wrote: > On Tue, Apr 13, 2021 at 11:24:09AM +0200, Jan Beulich wrote: >> On 12.04.2021 17:31, Roger Pau Monné wrote: >>> On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote: >>>> + { >>>> + const struct page_info *pg = mfn_to_page(mfn); >>>> + >>>> + if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) ) >>>> + { >>>> + ASSERT(type == p2m_mmio_direct); >>>> + return 0; >>> >>> Are there any other pages that could pass this check? I don't think >>> so, but wanted to assert. >> >> "Normal" extra pages have an owner, so no, there aren't any others. >> If and when any appear, this may need further customizing, albeit >> generally I'd hope further pages matching this pattern would also >> want similar treatment. > > I wonder whether we want to add an assert here to make sure only the > APIC access page receives this special handling by the shadow code, > but maybe that's a bit too much? I think so, yes: It would require either a separate function or making the variable global. Both feel like a layering violation. Jan
On Tue, Apr 13, 2021 at 02:03:52PM +0200, Jan Beulich wrote: > On 13.04.2021 12:18, Roger Pau Monné wrote: > > On Tue, Apr 13, 2021 at 11:24:09AM +0200, Jan Beulich wrote: > >> On 12.04.2021 17:31, Roger Pau Monné wrote: > >>> On Mon, Apr 12, 2021 at 12:40:48PM +0200, Jan Beulich wrote: > >>>> + { > >>>> + const struct page_info *pg = mfn_to_page(mfn); > >>>> + > >>>> + if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) ) > >>>> + { > >>>> + ASSERT(type == p2m_mmio_direct); > >>>> + return 0; > >>> > >>> Are there any other pages that could pass this check? I don't think > >>> so, but wanted to assert. > >> > >> "Normal" extra pages have an owner, so no, there aren't any others. > >> If and when any appear, this may need further customizing, albeit > >> generally I'd hope further pages matching this pattern would also > >> want similar treatment. > > > > I wonder whether we want to add an assert here to make sure only the > > APIC access page receives this special handling by the shadow code, > > but maybe that's a bit too much? > > I think so, yes: It would require either a separate function or > making the variable global. Both feel like a layering violation. Right, with the INVALID_MFN change and the shadow mfn_valid style adjustment: Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> Thanks, Roger.
Hi, Apologies for not sending comments before, and thanks for the ping. At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote: > The address of this page is used by the CPU only to recognize when to > access the virtual APIC page instead. No accesses would ever go to this > page. It only needs to be present in the (CPU) page tables so that > address translation will produce its address as result for respective > accesses. > > By making this page global, we also eliminate the need to refcount it, > or to assign it to any domain in the first place. What is the aim here? To save 4k per domain? It seems to come out about even for adding and removing code. > --- a/xen/arch/x86/mm/shadow/set.c > +++ b/xen/arch/x86/mm/shadow/set.c > @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl > ASSERT(!sh_l1e_is_magic(sl1e)); > ASSERT(shadow_mode_refcounts(d)); > > + /* > + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually > + * get accessed, and hence there's no need to refcount it (and refcounting > + * would fail, due to the page having no owner). > + */ > + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) Would it be better to check specifically for mfn == apic_access_mfn (and apic_access_mfn != 0, I guess)? If we want this behaviour for for all un-owned PGC_extra MFNs it would be good to explain that in the comments. Cheers, Tim.
On 17.04.2021 21:24, Tim Deegan wrote: > At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote: >> The address of this page is used by the CPU only to recognize when to >> access the virtual APIC page instead. No accesses would ever go to this >> page. It only needs to be present in the (CPU) page tables so that >> address translation will produce its address as result for respective >> accesses. >> >> By making this page global, we also eliminate the need to refcount it, >> or to assign it to any domain in the first place. > > What is the aim here? To save 4k per domain? It seems to come out > about even for adding and removing code. True, but still it looks wrong to me to use a page per guest when one per host suffices. Think about many tiny, short-lived VMs (as in Tamas'es VM forking). >> --- a/xen/arch/x86/mm/shadow/set.c >> +++ b/xen/arch/x86/mm/shadow/set.c >> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl >> ASSERT(!sh_l1e_is_magic(sl1e)); >> ASSERT(shadow_mode_refcounts(d)); >> >> + /* >> + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually >> + * get accessed, and hence there's no need to refcount it (and refcounting >> + * would fail, due to the page having no owner). >> + */ >> + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) > > Would it be better to check specifically for mfn == apic_access_mfn > (and apic_access_mfn != 0, I guess)? Roger did ask about the same - I neither want to expose apic_access_mfn outside its CU, nor do I want to introduce an accessor function. Both feel like layering violations to me. > If we want this behaviour for > for all un-owned PGC_extra MFNs it would be good to explain that in the > comments. This is hard to tell without knowing which (or even if) further such PGC_extra pages will appear. Hence any comment to that effect would be guesswork at best. Of course I can add e.g. "Other pages with the same properties would be treated the same", if that's what you're after? Jan
At 13:25 +0200 on 19 Apr (1618838726), Jan Beulich wrote: > On 17.04.2021 21:24, Tim Deegan wrote: > > At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote: > >> By making this page global, we also eliminate the need to refcount it, > >> or to assign it to any domain in the first place. > > > > What is the aim here? To save 4k per domain? It seems to come out > > about even for adding and removing code. > > True, but still it looks wrong to me to use a page per guest when one > her host suffices. Think about many tiny, short-lived VMs (as in > Tamas'es VM forking). OK, fair enough. > >> --- a/xen/arch/x86/mm/shadow/set.c > >> +++ b/xen/arch/x86/mm/shadow/set.c > >> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl > >> ASSERT(!sh_l1e_is_magic(sl1e)); > >> ASSERT(shadow_mode_refcounts(d)); > >> > >> + /* > >> + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually > >> + * get accessed, and hence there's no need to refcount it (and refcounting > >> + * would fail, due to the page having no owner). > >> + */ > >> + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) > > > > Would it be better to check specifically for mfn == apic_access_mfn > > (and apic_access_mfn != 0, I guess)? > > Roger did ask about the same - I neither want to expose apic_access_mfn > outside its CU, nor do I want to introduce an accessor function. Both > feel like layering violations to me. I think that this is even more of a layering violation: what we actually want is to allow un-refcounted mappings of the apic_access_mfn, but to do it we're relying on an internal implementation detail (that it happens to be un-owned and PGC_extra) rather than giving ourselves an API. And so we're tangled up talking about how to write comments to warn our future selves about the possible side-effects. > > If we want this behaviour for > > for all un-owned PGC_extra MFNs it would be good to explain that in the > > comments. 
> > This is hard to tell without knowing which (or even if) further such > PGC_extra pages will appear. Hence any comment to that effect would be > guesswork at best. Of course I can add e.g. "Other pages with the same > properties would be treated the same", if that's what you're after? If you want to go this way there should be a comment here saying that we're allowing this for all PGC_extra pages because we need it for apic_access_mfn, and a comment at PGC_extra saying that it has this effect. Cheers, Tim
On 22.04.2021 09:42, Tim Deegan wrote: > At 13:25 +0200 on 19 Apr (1618838726), Jan Beulich wrote: >> On 17.04.2021 21:24, Tim Deegan wrote: >>> At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote: >>>> --- a/xen/arch/x86/mm/shadow/set.c >>>> +++ b/xen/arch/x86/mm/shadow/set.c >>>> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl >>>> ASSERT(!sh_l1e_is_magic(sl1e)); >>>> ASSERT(shadow_mode_refcounts(d)); >>>> >>>> + /* >>>> + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually >>>> + * get accessed, and hence there's no need to refcount it (and refcounting >>>> + * would fail, due to the page having no owner). >>>> + */ >>>> + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) >>> >>> Would it be better to check specifically for mfn == apic_access_mfn >>> (and apic_access_mfn != 0, I guess)? >> >> Roger did ask about the same - I neither want to expose apic_access_mfn >> outside its CU, nor do I want to introduce an accessor function. Both >> feel like layering violations to me. > > I think that this is even more of a layering violation: what we > actually want is to allow un-refcounted mappings of the > apic_access_mfn, but to do it we're relying on an internal > implementation detail (that it happens to be un-owned and PGC_extra) > rather than giving ourselves an API. > > And so we're tangled up talking about how to write comments to warn > our future selves about the possible side-effects. > >>> If we want this behaviour for >>> for all un-owned PGC_extra MFNs it would be good to explain that in the >>> comments. >> >> This is hard to tell without knowing which (or even if) further such >> PGC_extra pages will appear. Hence any comment to that effect would be >> guesswork at best. Of course I can add e.g. "Other pages with the same >> properties would be treated the same", if that's what you're after? 
> > If you want to go this way there should be a comment here saying that > we're allowing this for all PGC_extra pages because we need it for > apic_access_mfn, and a comment at PGC_extra saying that it has this > effect. So (along with a comment to this effect) how about I make page_suppress_refcounting() and page_refcounting_suppressed() helpers? The former would set PGC_extra on the page and assert the page has no owner, while the latter would subsume the checks done here. The only question then is what to do with the ASSERT(type == p2m_mmio_direct): That's still a property of the APIC access MFN which may or may not hold for future such pages. (It can't be part of the new helper anyway as "put" doesn't have the type available.) Jan
At 11:38 +0200 on 22 Apr (1619091522), Jan Beulich wrote: > On 22.04.2021 09:42, Tim Deegan wrote: > > At 13:25 +0200 on 19 Apr (1618838726), Jan Beulich wrote: > >> On 17.04.2021 21:24, Tim Deegan wrote: > >>> At 12:40 +0200 on 12 Apr (1618231248), Jan Beulich wrote: > >>>> --- a/xen/arch/x86/mm/shadow/set.c > >>>> +++ b/xen/arch/x86/mm/shadow/set.c > >>>> @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl > >>>> ASSERT(!sh_l1e_is_magic(sl1e)); > >>>> ASSERT(shadow_mode_refcounts(d)); > >>>> > >>>> + /* > >>>> + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually > >>>> + * get accessed, and hence there's no need to refcount it (and refcounting > >>>> + * would fail, due to the page having no owner). > >>>> + */ > >>>> + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) > >>> > >>> Would it be better to check specifically for mfn == apic_access_mfn > >>> (and apic_access_mfn != 0, I guess)? > >> > >> Roger did ask about the same - I neither want to expose apic_access_mfn > >> outside its CU, nor do I want to introduce an accessor function. Both > >> feel like layering violations to me. > > > > I think that this is even more of a layering violation: what we > > actually want is to allow un-refcounted mappings of the > > apic_access_mfn, but to do it we're relying on an internal > > implementation detail (that it happens to be un-owned and PGC_extra) > > rather than giving ourselves an API. > > > > And so we're tangled up talking about how to write comments to warn > > our future selves about the possible side-effects. > > > >>> If we want this behaviour for > >>> for all un-owned PGC_extra MFNs it would be good to explain that in the > >>> comments. > >> > >> This is hard to tell without knowing which (or even if) further such > >> PGC_extra pages will appear. Hence any comment to that effect would be > >> guesswork at best. Of course I can add e.g. 
"Other pages with the same > >> properties would be treated the same", if that's what you're after? > > > > If you want to go this way there should be a comment here saying that > > we're allowing this for all PGC_extra pages because we need it for > > apic_access_mfn, and a comment at PGC_extra saying that it has this > > effect. > > So (along with a comment to this effect) how about I make > page_suppress_refcounting() and page_refcounting_suppressed() helpers? > The former would set PGC_extra on the page and assert the page has no > owner, while the latter would subsume the checks done here. That sounds good to me. > The only > question then is what to do with the ASSERT(type == p2m_mmio_direct): > That's still a property of the APIC access MFN which may or may not > hold for future such pages. (It can't be part of the new helper anyway > as "put" doesn't have the type available.) I think we might drop that assertion, since the new mehanism would be more general. Cheers, Tim.
--- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -66,8 +66,7 @@ boolean_param("force-ept", opt_force_ept static void vmx_ctxt_switch_from(struct vcpu *v); static void vmx_ctxt_switch_to(struct vcpu *v); -static int vmx_alloc_vlapic_mapping(struct domain *d); -static void vmx_free_vlapic_mapping(struct domain *d); +static int alloc_vlapic_mapping(void); static void vmx_install_vlapic_mapping(struct vcpu *v); static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr, unsigned int flags); @@ -78,6 +77,8 @@ static int vmx_msr_read_intercept(unsign static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content); static void vmx_invlpg(struct vcpu *v, unsigned long linear); +static mfn_t __read_mostly apic_access_mfn; + /* Values for domain's ->arch.hvm_domain.pi_ops.flags. */ #define PI_CSW_FROM (1u << 0) #define PI_CSW_TO (1u << 1) @@ -401,7 +402,6 @@ static int vmx_domain_initialise(struct .to = vmx_ctxt_switch_to, .tail = vmx_do_resume, }; - int rc; d->arch.ctxt_switch = &csw; @@ -411,28 +411,22 @@ static int vmx_domain_initialise(struct */ d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp; - if ( !has_vlapic(d) ) - return 0; - - if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) - return rc; - return 0; } -static void vmx_domain_relinquish_resources(struct domain *d) +static void domain_creation_finished(struct domain *d) { - if ( !has_vlapic(d) ) + gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE); + uint8_t ipat; + + if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, _mfn(0)) ) return; - vmx_free_vlapic_mapping(d); -} + ASSERT(epte_get_entry_emt(d, gfn_x(gfn), apic_access_mfn, 0, &ipat, + true) == MTRR_TYPE_WRBACK); + ASSERT(ipat); -static void domain_creation_finished(struct domain *d) -{ - if ( has_vlapic(d) && !mfn_eq(d->arch.hvm.vmx.apic_access_mfn, _mfn(0)) && - set_mmio_p2m_entry(d, gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE), - d->arch.hvm.vmx.apic_access_mfn, PAGE_ORDER_4K) ) + if ( set_mmio_p2m_entry(d, gfn, 
apic_access_mfn, PAGE_ORDER_4K) ) domain_crash(d); } @@ -2415,7 +2409,6 @@ static struct hvm_function_table __initd .cpu_up_prepare = vmx_cpu_up_prepare, .cpu_dead = vmx_cpu_dead, .domain_initialise = vmx_domain_initialise, - .domain_relinquish_resources = vmx_domain_relinquish_resources, .domain_creation_finished = domain_creation_finished, .vcpu_initialise = vmx_vcpu_initialise, .vcpu_destroy = vmx_vcpu_destroy, @@ -2662,7 +2655,7 @@ const struct hvm_function_table * __init { set_in_cr4(X86_CR4_VMXE); - if ( vmx_vmcs_init() ) + if ( vmx_vmcs_init() || alloc_vlapic_mapping() ) { printk("VMX: failed to initialise.\n"); return NULL; @@ -3224,7 +3217,7 @@ gp_fault: return X86EMUL_EXCEPTION; } -static int vmx_alloc_vlapic_mapping(struct domain *d) +static int __init alloc_vlapic_mapping(void) { struct page_info *pg; mfn_t mfn; @@ -3232,52 +3225,31 @@ static int vmx_alloc_vlapic_mapping(stru if ( !cpu_has_vmx_virtualize_apic_accesses ) return 0; - pg = alloc_domheap_page(d, MEMF_no_refcount); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; - if ( !get_page_and_type(pg, d, PGT_writable_page) ) - { - /* - * The domain can't possibly know about this page yet, so failure - * here is a clear indication of something fishy going on. - */ - domain_crash(d); - return -ENODATA; - } + /* Arrange for epte_get_entry_emt() to recognize this page as "special". 
*/ + pg->count_info |= PGC_extra; mfn = page_to_mfn(pg); clear_domain_page(mfn); - d->arch.hvm.vmx.apic_access_mfn = mfn; + apic_access_mfn = mfn; return 0; } -static void vmx_free_vlapic_mapping(struct domain *d) -{ - mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn; - - d->arch.hvm.vmx.apic_access_mfn = _mfn(0); - if ( !mfn_eq(mfn, _mfn(0)) ) - { - struct page_info *pg = mfn_to_page(mfn); - - put_page_alloc_ref(pg); - put_page_and_type(pg); - } -} - static void vmx_install_vlapic_mapping(struct vcpu *v) { paddr_t virt_page_ma, apic_page_ma; - if ( mfn_eq(v->domain->arch.hvm.vmx.apic_access_mfn, _mfn(0)) ) + if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, _mfn(0)) ) return; ASSERT(cpu_has_vmx_virtualize_apic_accesses); virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page); - apic_page_ma = mfn_to_maddr(v->domain->arch.hvm.vmx.apic_access_mfn); + apic_page_ma = mfn_to_maddr(apic_access_mfn); vmx_vmcs_enter(v); __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma); --- a/xen/arch/x86/mm/shadow/set.c +++ b/xen/arch/x86/mm/shadow/set.c @@ -94,6 +94,22 @@ shadow_get_page_from_l1e(shadow_l1e_t sl ASSERT(!sh_l1e_is_magic(sl1e)); ASSERT(shadow_mode_refcounts(d)); + /* + * VMX'es APIC access MFN is just a surrogate page. It doesn't actually + * get accessed, and hence there's no need to refcount it (and refcounting + * would fail, due to the page having no owner). 
+ */ + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) + { + const struct page_info *pg = mfn_to_page(mfn); + + if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) ) + { + ASSERT(type == p2m_mmio_direct); + return 0; + } + } + res = get_page_from_l1e(sl1e, d, d); /* --- a/xen/arch/x86/mm/shadow/types.h +++ b/xen/arch/x86/mm/shadow/types.h @@ -276,9 +276,20 @@ int shadow_set_l4e(struct domain *d, sha static void inline shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) { + mfn_t mfn; + if ( !shadow_mode_refcounts(d) ) return; + if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) ) + { + const struct page_info *pg = mfn_to_page(mfn); + + /* See the respective comment in shadow_get_page_from_l1e(). */ + if ( !page_get_owner(pg) && (pg->count_info & PGC_extra) ) + return; + } + put_page_from_l1e(sl1e, d); } --- a/xen/include/asm-x86/hvm/vmx/vmcs.h +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h @@ -58,7 +58,6 @@ struct ept_data { #define _VMX_DOMAIN_PML_ENABLED 0 #define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED) struct vmx_domain { - mfn_t apic_access_mfn; /* VMX_DOMAIN_* */ unsigned int status;