diff mbox series

[3/5] x86/hvm: fix handling of accesses to partial r/o MMIO pages

Message ID 20250411105411.22334-4-roger.pau@citrix.com (mailing list archive)
State Superseded
Headers show
Series xen/x86: fix implementation of subpage r/o MMIO | expand

Commit Message

Roger Pau Monné April 11, 2025, 10:54 a.m. UTC
The current logic to handle accesses to MMIO pages partially read-only is
based on the (now removed) logic used to handle accesses to the r/o MMCFG
region(s) for PVH v1 dom0.  However that has issues when running on AMD
hardware, as in that case the guest linear address that triggered the fault
is not provided as part of the VM exit.  This caused
mmio_ro_emulated_write() to always fail before calling
subpage_mmio_write_emulate() when running on AMD and called from an HVM
context.

Take a different approach and convert the handling of partial read-only
MMIO page accesses into an HVM MMIO ops handler, as that's the more natural
way to handle this kind of emulation for HVM domains.

This allows getting rid of hvm_emulate_one_mmio() and its single call site
in hvm_hap_nested_page_fault().

Note a small adjustment is needed to the `pf-fixup` dom0 PVH logic: avoid
attempting to fixup faults resulting from accesses to read-only MMIO
regions, as handling of those accesses is now done by handle_mmio().

Fixes: 33c19df9a5a0 ('x86/PCI: intercept accesses to RO MMIO from dom0s in HVM containers')
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
The fixes tag is maybe a bit wonky, it's either this or:

8847d6e23f97 ('x86/mm: add API for marking only part of a MMIO page read only')

However the addition of subpage r/o access handling to the existing
mmio_ro_emulated_write() function was done based on the assumption that the
current code was working - which turned out to not be the case for AMD,
hence my preference for blaming the commit that actually introduced the
broken logic.
---
 xen/arch/x86/hvm/emulate.c             | 47 +-------------
 xen/arch/x86/hvm/hvm.c                 | 89 +++++++++++++++++++++++---
 xen/arch/x86/include/asm/hvm/emulate.h |  1 -
 xen/arch/x86/include/asm/mm.h          | 12 ++++
 xen/arch/x86/mm.c                      | 37 +----------
 5 files changed, 96 insertions(+), 90 deletions(-)

Comments

Jan Beulich April 14, 2025, 6:33 a.m. UTC | #1
On 11.04.2025 12:54, Roger Pau Monne wrote:
> The current logic to handle accesses to MMIO pages partially read-only is
> based on the (now removed) logic used to handle accesses to the r/o MMCFG
> region(s) for PVH v1 dom0.  However that has issues when running on AMD
> hardware, as in that case the guest linear address that triggered the fault
> is not provided as part of the VM exit.  This caused
> mmio_ro_emulated_write() to always fail before calling
> subpage_mmio_write_emulate() when running on AMD and called from an HVM
> context.
> 
> Take a different approach and convert the handling of partial read-only
> MMIO page accesses into an HVM MMIO ops handler, as that's the more natural
> way to handle this kind of emulation for HVM domains.
> 
> This allows getting rid of hvm_emulate_one_mmio() and its single call site
> in hvm_hap_nested_page_fault().
> 
> Note a small adjustment is needed to the `pf-fixup` dom0 PVH logic: avoid
> attempting to fixup faults resulting from accesses to read-only MMIO
> regions, as handling of those accesses is now done by handle_mmio().
> 
> Fixes: 33c19df9a5a0 ('x86/PCI: intercept accesses to RO MMIO from dom0s in HVM containers')
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> The fixes tag is maybe a bit wonky, it's either this or:
> 
> 8847d6e23f97 ('x86/mm: add API for marking only part of a MMIO page read only')
> 
> However the addition of subpage r/o access handling to the existing
> mmio_ro_emulated_write() function was done based on the assumption that the
> current code was working - which turned out to not be the case for AMD,
> hence my preference for blaming the commit that actually introduced the
> broken logic.

I agree.

> ---
>  xen/arch/x86/hvm/emulate.c             | 47 +-------------
>  xen/arch/x86/hvm/hvm.c                 | 89 +++++++++++++++++++++++---

It's quite a bit of rather special code you add to this catch-all file. How
about introducing a small new one, e.g. mmio.c (to later maybe become home
to some further stuff)?

> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -370,7 +370,8 @@ static int hvmemul_do_io(
>          /* If there is no suitable backing DM, just ignore accesses */
>          if ( !s )
>          {
> -            if ( is_mmio && is_hardware_domain(currd) )
> +            if ( is_mmio && is_hardware_domain(currd) &&
> +                 !rangeset_contains_singleton(mmio_ro_ranges, PFN_DOWN(addr)) )

I think this needs a brief comment - it otherwise looks misplaced here.

> @@ -585,9 +585,81 @@ static int cf_check hvm_print_line(
>      return X86EMUL_OKAY;
>  }
>  
> +static int cf_check subpage_mmio_accept(struct vcpu *v, unsigned long addr)
> +{
> +    p2m_type_t t;
> +    mfn_t mfn = get_gfn_query_unlocked(v->domain, addr, &t);
> +
> +    return !mfn_eq(mfn, INVALID_MFN) && t == p2m_mmio_direct &&
> +           !!subpage_mmio_find_page(mfn);

The !! isn't needed here, is it?

> +}
> +
> +static int cf_check subpage_mmio_read(
> +    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
> +{
> +    struct domain *d = v->domain;
> +    p2m_type_t t;
> +    mfn_t mfn = get_gfn_query(d, addr, &t);
> +    struct subpage_ro_range *entry;
> +    volatile void __iomem *mem;
> +
> +    *data = ~0UL;
> +
> +    if ( mfn_eq(mfn, INVALID_MFN) || t != p2m_mmio_direct )
> +    {
> +        put_gfn(d, addr);
> +        return X86EMUL_RETRY;
> +    }
> +
> +    entry = subpage_mmio_find_page(mfn);
> +    if ( !entry )
> +    {
> +        put_gfn(d, addr);
> +        return X86EMUL_RETRY;
> +    }
> +
> +    mem = subpage_mmio_map_page(entry);
> +    if ( !mem )
> +    {
> +        put_gfn(d, addr);
> +        gprintk(XENLOG_ERR, "Failed to map page for MMIO read at %#lx\n",
> +                mfn_to_maddr(mfn) + PAGE_OFFSET(addr));

Makes me wonder whether the function parameter wouldn't better be named
"curr" and the local variable "currd", to reflect that this log message
will report appropriate context.

Would also logging the guest physical address perhaps be worthwhile here?

> +        return X86EMUL_OKAY;
> +    }
> +
> +    *data = read_mmio(mem + PAGE_OFFSET(addr), len);
> +
> +    put_gfn(d, addr);
> +    return X86EMUL_OKAY;
> +}
> +
> +static int cf_check subpage_mmio_write(
> +    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long data)
> +{
> +    struct domain *d = v->domain;
> +    p2m_type_t t;
> +    mfn_t mfn = get_gfn_query(d, addr, &t);
> +
> +    if ( mfn_eq(mfn, INVALID_MFN) || t != p2m_mmio_direct )
> +    {
> +        put_gfn(d, addr);
> +        return X86EMUL_RETRY;
> +    }
> +
> +    subpage_mmio_write_emulate(mfn, PAGE_OFFSET(addr), data, len);
> +
> +    put_gfn(d, addr);
> +    return X86EMUL_OKAY;
> +}

Unlike the read path this doesn't return RETRY when subpage_mmio_find_page()
fails (indicating something must have changed after "accept" said yes).

> @@ -1981,7 +2056,9 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>       */
>      if ( (p2mt == p2m_mmio_dm) ||
>           (npfec.write_access &&
> -          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
> +          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
> +           /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
> +           (p2mt == p2m_mmio_direct))) )
>      {
>          if ( !handle_mmio_with_translation(gla, gfn, npfec) )
>              hvm_inject_hw_exception(X86_EXC_GP, 0);

Aren't we handing too many things to handle_mmio_with_translation() this
way? At the very least you're losing ...

> @@ -2033,14 +2110,6 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>          goto out_put_gfn;
>      }
>  
> -    if ( (p2mt == p2m_mmio_direct) && npfec.write_access && npfec.present &&

... the .present check.

I'm also concerned of e.g. VT-x'es APIC access MFN, which is
p2m_mmio_direct.

Jan
Roger Pau Monné April 14, 2025, 1:53 p.m. UTC | #2
On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
> On 11.04.2025 12:54, Roger Pau Monne wrote:
> > The current logic to handle accesses to MMIO pages partially read-only is
> > based on the (now removed) logic used to handle accesses to the r/o MMCFG
> > region(s) for PVH v1 dom0.  However that has issues when running on AMD
> > hardware, as in that case the guest linear address that triggered the fault
> > is not provided as part of the VM exit.  This caused
> > mmio_ro_emulated_write() to always fail before calling
> > subpage_mmio_write_emulate() when running on AMD and called from an HVM
> > context.
> > 
> > Take a different approach and convert the handling of partial read-only
> > MMIO page accesses into an HVM MMIO ops handler, as that's the more natural
> > way to handle this kind of emulation for HVM domains.
> > 
> > This allows getting rid of hvm_emulate_one_mmio() and its single call site
> > in hvm_hap_nested_page_fault().
> > 
> > Note a small adjustment is needed to the `pf-fixup` dom0 PVH logic: avoid
> > attempting to fixup faults resulting from accesses to read-only MMIO
> > regions, as handling of those accesses is now done by handle_mmio().
> > 
> > Fixes: 33c19df9a5a0 ('x86/PCI: intercept accesses to RO MMIO from dom0s in HVM containers')
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> > The fixes tag is maybe a bit wonky, it's either this or:
> > 
> > 8847d6e23f97 ('x86/mm: add API for marking only part of a MMIO page read only')
> > 
> > However the addition of subpage r/o access handling to the existing
> > mmio_ro_emulated_write() function was done based on the assumption that the
> > current code was working - which turned out to not be the case for AMD,
> > hence my preference for blaming the commit that actually introduced the
> > broken logic.
> 
> I agree.
> 
> > ---
> >  xen/arch/x86/hvm/emulate.c             | 47 +-------------
> >  xen/arch/x86/hvm/hvm.c                 | 89 +++++++++++++++++++++++---
> 
> It's quite a bit of rather special code you add to this catch-all file. How
> about introducing a small new one, e.g. mmio.c (to later maybe become home
> to some further stuff)?

Yes, I didn't find any suitable place, I was also considering
hvm/io.c or hvm/intercept.c, but none looked very good TBH.

The main benefit of placing it in hvm.c is that the functions can all
be static and confined to hvm.c as it's only user.

> > --- a/xen/arch/x86/hvm/emulate.c
> > +++ b/xen/arch/x86/hvm/emulate.c
> > @@ -370,7 +370,8 @@ static int hvmemul_do_io(
> >          /* If there is no suitable backing DM, just ignore accesses */
> >          if ( !s )
> >          {
> > -            if ( is_mmio && is_hardware_domain(currd) )
> > +            if ( is_mmio && is_hardware_domain(currd) &&
> > +                 !rangeset_contains_singleton(mmio_ro_ranges, PFN_DOWN(addr)) )
> 
> I think this needs a brief comment - it otherwise looks misplaced here.
> 
> > @@ -585,9 +585,81 @@ static int cf_check hvm_print_line(
> >      return X86EMUL_OKAY;
> >  }
> >  
> > +static int cf_check subpage_mmio_accept(struct vcpu *v, unsigned long addr)
> > +{
> > +    p2m_type_t t;
> > +    mfn_t mfn = get_gfn_query_unlocked(v->domain, addr, &t);
> > +
> > +    return !mfn_eq(mfn, INVALID_MFN) && t == p2m_mmio_direct &&
> > +           !!subpage_mmio_find_page(mfn);
> 
> The !! isn't needed here, is it?

IIRC clang complained about conversion from pointer to integer without
a cast, but maybe that was before adding the further conditions here.

> > +}
> > +
> > +static int cf_check subpage_mmio_read(
> > +    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
> > +{
> > +    struct domain *d = v->domain;
> > +    p2m_type_t t;
> > +    mfn_t mfn = get_gfn_query(d, addr, &t);
> > +    struct subpage_ro_range *entry;
> > +    volatile void __iomem *mem;
> > +
> > +    *data = ~0UL;
> > +
> > +    if ( mfn_eq(mfn, INVALID_MFN) || t != p2m_mmio_direct )
> > +    {
> > +        put_gfn(d, addr);
> > +        return X86EMUL_RETRY;
> > +    }
> > +
> > +    entry = subpage_mmio_find_page(mfn);
> > +    if ( !entry )
> > +    {
> > +        put_gfn(d, addr);
> > +        return X86EMUL_RETRY;
> > +    }
> > +
> > +    mem = subpage_mmio_map_page(entry);
> > +    if ( !mem )
> > +    {
> > +        put_gfn(d, addr);
> > +        gprintk(XENLOG_ERR, "Failed to map page for MMIO read at %#lx\n",
> > +                mfn_to_maddr(mfn) + PAGE_OFFSET(addr));
> 
> Makes me wonder whether the function parameter wouldn't better be named
> "curr" and the local variable "currd", to reflect that this log message
> will report appropriate context.

Sure, can adjust.

> Would also logging the guest physical address perhaps be worthwhile here?

Possibly, my first approach was to print gfn -> mfn, but ended up
copying the same message from subpage_mmio_write_emulate() for
symmetry reasons.

> > +        return X86EMUL_OKAY;
> > +    }
> > +
> > +    *data = read_mmio(mem + PAGE_OFFSET(addr), len);
> > +
> > +    put_gfn(d, addr);
> > +    return X86EMUL_OKAY;
> > +}
> > +
> > +static int cf_check subpage_mmio_write(
> > +    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long data)
> > +{
> > +    struct domain *d = v->domain;
> > +    p2m_type_t t;
> > +    mfn_t mfn = get_gfn_query(d, addr, &t);
> > +
> > +    if ( mfn_eq(mfn, INVALID_MFN) || t != p2m_mmio_direct )
> > +    {
> > +        put_gfn(d, addr);
> > +        return X86EMUL_RETRY;
> > +    }
> > +
> > +    subpage_mmio_write_emulate(mfn, PAGE_OFFSET(addr), data, len);
> > +
> > +    put_gfn(d, addr);
> > +    return X86EMUL_OKAY;
> > +}
> 
> Unlike the read path this doesn't return RETRY when subpage_mmio_find_page()
> fails (indicating something must have changed after "accept" said yes).

Yeah, I've noticed this, but didn't feel like modifying
subpage_mmio_write_emulate() for this.  The list of partial r/o MMIO
pages cannot change after system boot, so accept returning yes and not
finding a page here would likely warrant an ASSERT_UNREACHABLE().

Would you like me to modify subpage_mmio_write_emulate to make it
return an error code?

> > @@ -1981,7 +2056,9 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
> >       */
> >      if ( (p2mt == p2m_mmio_dm) ||
> >           (npfec.write_access &&
> > -          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
> > +          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
> > +           /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
> > +           (p2mt == p2m_mmio_direct))) )
> >      {
> >          if ( !handle_mmio_with_translation(gla, gfn, npfec) )
> >              hvm_inject_hw_exception(X86_EXC_GP, 0);
> 
> Aren't we handing too many things to handle_mmio_with_translation() this
> way? At the very least you're losing ...
> 
> > @@ -2033,14 +2110,6 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
> >          goto out_put_gfn;
> >      }
> >  
> > -    if ( (p2mt == p2m_mmio_direct) && npfec.write_access && npfec.present &&
> 
> ... the .present check.

Isn't the p2mt == p2m_mmio_direct check already ensuring the entry is
present?  Otherwise its type would be p2m_invalid or p2m_mmio_dm?

It did seem to me the other checks in this function already assume
that by having a valid type the entry is present.

> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
> p2m_mmio_direct.

But that won't go into hvm_hap_nested_page_fault() when using
cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
mapped as p2m_mmio_direct)?

It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
differently?

Thanks, Roger.
Jan Beulich April 14, 2025, 3:24 p.m. UTC | #3
On 14.04.2025 15:53, Roger Pau Monné wrote:
> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
>> On 11.04.2025 12:54, Roger Pau Monne wrote:
>>> ---
>>>  xen/arch/x86/hvm/emulate.c             | 47 +-------------
>>>  xen/arch/x86/hvm/hvm.c                 | 89 +++++++++++++++++++++++---
>>
>> It's quite a bit of rather special code you add to this catch-all file. How
>> about introducing a small new one, e.g. mmio.c (to later maybe become home
>> to some further stuff)?
> 
> Yes, I didn't find any suitable place, I was also considering
> hvm/io.c or hvm/intercept.c, but none looked very good TBH.
> 
> The main benefit of placing it in hvm.c is that the functions can all
> be static and confined to hvm.c as it's only user.

I understand that. Still, if we went by that criteria, we'd best put all of
our code in a single file ;-)

>>> +static int cf_check subpage_mmio_write(
>>> +    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long data)
>>> +{
>>> +    struct domain *d = v->domain;
>>> +    p2m_type_t t;
>>> +    mfn_t mfn = get_gfn_query(d, addr, &t);
>>> +
>>> +    if ( mfn_eq(mfn, INVALID_MFN) || t != p2m_mmio_direct )
>>> +    {
>>> +        put_gfn(d, addr);
>>> +        return X86EMUL_RETRY;
>>> +    }
>>> +
>>> +    subpage_mmio_write_emulate(mfn, PAGE_OFFSET(addr), data, len);
>>> +
>>> +    put_gfn(d, addr);
>>> +    return X86EMUL_OKAY;
>>> +}
>>
>> Unlike the read path this doesn't return RETRY when subpage_mmio_find_page()
>> fails (indicating something must have changed after "accept" said yes).
> 
> Yeah, I've noticed this, but didn't feel like modifying
> subpage_mmio_write_emulate() for this.  The list of partial r/o MMIO
> pages cannot change after system boot, so accept returning yes and not
> finding a page here would likely warrant an ASSERT_UNRECHABLE().
> 
> Would you like me to modify subpage_mmio_write_emulate to make it
> return an error code?

I'd be happy with the two paths being in sync in whichever way that's
achieved. The argument you give equally holds for the read path, after
all.

>>> @@ -1981,7 +2056,9 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>>>       */
>>>      if ( (p2mt == p2m_mmio_dm) ||
>>>           (npfec.write_access &&
>>> -          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
>>> +          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
>>> +           /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
>>> +           (p2mt == p2m_mmio_direct))) )
>>>      {
>>>          if ( !handle_mmio_with_translation(gla, gfn, npfec) )
>>>              hvm_inject_hw_exception(X86_EXC_GP, 0);
>>
>> Aren't we handing too many things to handle_mmio_with_translation() this
>> way? At the very least you're losing ...
>>
>>> @@ -2033,14 +2110,6 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>>>          goto out_put_gfn;
>>>      }
>>>  
>>> -    if ( (p2mt == p2m_mmio_direct) && npfec.write_access && npfec.present &&
>>
>> ... the .present check.
> 
> Isn't the p2mt == p2m_mmio_direct check already ensuring the entry is
> present?  Otherwise its type would be p2m_invalid or p2m_mmio_dm?

Yes (to the 1st question), it kind of is.

> It did seem to me the other checks in this function already assume
> that by having a valid type the entry is present.

Except for the code above, where we decided to play safe. At the very least
if you drop such a check, please say a justifying word in the description.

>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
>> p2m_mmio_direct.
> 
> But that won't go into hvm_hap_nested_page_fault() when using
> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
> mapped as p2m_mmio_direct)?
> 
> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
> differently?

All true as long as things work as expected (potentially including the guest
also behaving as expected). Also this was explicitly only an example I could
readily think of. I'm simply wary of handle_mmio_with_translation() now
getting things to handle it's not meant to ever see.

Jan
Roger Pau Monné April 14, 2025, 4:13 p.m. UTC | #4
On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
> On 14.04.2025 15:53, Roger Pau Monné wrote:
> > On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
> >> On 11.04.2025 12:54, Roger Pau Monne wrote:
> >>> @@ -1981,7 +2056,9 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
> >>>       */
> >>>      if ( (p2mt == p2m_mmio_dm) ||
> >>>           (npfec.write_access &&
> >>> -          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
> >>> +          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
> >>> +           /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
> >>> +           (p2mt == p2m_mmio_direct))) )
> >>>      {
> >>>          if ( !handle_mmio_with_translation(gla, gfn, npfec) )
> >>>              hvm_inject_hw_exception(X86_EXC_GP, 0);
> >>
> >> Aren't we handing too many things to handle_mmio_with_translation() this
> >> way? At the very least you're losing ...
> >>
> >>> @@ -2033,14 +2110,6 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
> >>>          goto out_put_gfn;
> >>>      }
> >>>  
> >>> -    if ( (p2mt == p2m_mmio_direct) && npfec.write_access && npfec.present &&
> >>
> >> ... the .present check.
> > 
> > Isn't the p2mt == p2m_mmio_direct check already ensuring the entry is
> > present?  Otherwise it's type would be p2m_invalid or p2m_mmio_dm?
> 
> Yes (to the 1st question), it kind of is.
> 
> > It did seem to me the other checks in this function already assume
> > that by having a valid type the entry is present.
> 
> Except for the code above, where we decided to play safe. At the very least
> if you drop such a check, please say a justifying word in the description.

I've added:

"As part of the fix r/o MMIO accesses are now handled by
handle_mmio_with_translation(), re-using the same logic that was used
for other read-only types part of p2m_is_discard_write().  The page
present check is dropped as type p2m_mmio_direct must have the
present bit set in the PTE."

Let me know if you think that's enough.

> >> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
> >> p2m_mmio_direct.
> > 
> > But that won't go into hvm_hap_nested_page_fault() when using
> > cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
> > mapped as p2m_mmio_direct)?
> > 
> > It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
> > differently?
> 
> All true as long as things work as expected (potentially including the guest
> also behaving as expected). Also this was explicitly only an example I could
> readily think of. I'm simply wary of handle_mmio_with_translation() now
> getting things to handle it's not meant to ever see.

How was access to MMIO r/o regions supposed to be handled before
33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
(~2010), yet I can't figure out how writes would be handled back then
that didn't result in a p2m fault and crashing of the domain.

I'm happy to look at other ways to handling this, but given there's
current logic for handling accesses to read-only regions in
hvm_hap_nested_page_fault() I think re-using that was the best way to
also handle accesses to MMIO read-only regions.

Arguably it would already be the case that for other reasons Xen would
need to emulate an instruction that accesses a read-only MMIO region?

Thanks, Roger.
Jan Beulich April 15, 2025, 7:32 a.m. UTC | #5
On 14.04.2025 18:13, Roger Pau Monné wrote:
> On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
>> On 14.04.2025 15:53, Roger Pau Monné wrote:
>>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
>>>> On 11.04.2025 12:54, Roger Pau Monne wrote:
>>>>> @@ -1981,7 +2056,9 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>>>>>       */
>>>>>      if ( (p2mt == p2m_mmio_dm) ||
>>>>>           (npfec.write_access &&
>>>>> -          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
>>>>> +          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
>>>>> +           /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
>>>>> +           (p2mt == p2m_mmio_direct))) )
>>>>>      {
>>>>>          if ( !handle_mmio_with_translation(gla, gfn, npfec) )
>>>>>              hvm_inject_hw_exception(X86_EXC_GP, 0);
>>>>
>>>> Aren't we handing too many things to handle_mmio_with_translation() this
>>>> way? At the very least you're losing ...
>>>>
>>>>> @@ -2033,14 +2110,6 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>>>>>          goto out_put_gfn;
>>>>>      }
>>>>>  
>>>>> -    if ( (p2mt == p2m_mmio_direct) && npfec.write_access && npfec.present &&
>>>>
>>>> ... the .present check.
>>>
>>> Isn't the p2mt == p2m_mmio_direct check already ensuring the entry is
>>> present?  Otherwise it's type would be p2m_invalid or p2m_mmio_dm?
>>
>> Yes (to the 1st question), it kind of is.
>>
>>> It did seem to me the other checks in this function already assume
>>> that by having a valid type the entry is present.
>>
>> Except for the code above, where we decided to play safe. AT the very least
>> if you drop such a check, please say a justifying word in the description.
> 
> I've added:
> 
> "As part of the fix r/o MMIO accesses are now handled by
> handle_mmio_with_translation(), re-using the same logic that was used
> for other read-only types part of p2m_is_discard_write().  The page
> present check is dropped as type p2m_mmio_direct must have the
> present bit set in the PTE."
> 
> Let me know if you think that's enough.

That's fine; it's even more verbose than I was hoping for.

Independently (i.e. not for this patch) we may want to actually assert
that npfec.present is set for P2M types where we demand that to always
be the case. (I wouldn't be too surprised if we actually found such an
assertion to trigger.)

>>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
>>>> p2m_mmio_direct.
>>>
>>> But that won't go into hvm_hap_nested_page_fault() when using
>>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
>>> mapped as p2m_mmio_direct)?
>>>
>>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
>>> differently?
>>
>> All true as long as things work as expected (potentially including the guest
>> also behaving as expected). Also this was explicitly only an example I could
>> readily think of. I'm simply wary of handle_mmio_with_translation() now
>> getting things to handle it's not meant to ever see.
> 
> How was access to MMIO r/o regions supposed to be handled before
> 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
> added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
> (~2010), yet I can't figure out how writes would be handled back then
> that didn't result in a p2m fault and crashing of the domain.

Was that handled at all before said change? mmio_ro_do_page_fault() was
(and still is) invoked for the hardware domain only, and quite likely
the need for handling (discarding) writes for PVHv1 had been overlooked
until someone was hit by the lack thereof.

> I'm happy to look at other ways to handling this, but given there's
> current logic for handling accesses to read-only regions in
> hvm_hap_nested_page_fault() I think re-using that was the best way to
> also handle accesses to MMIO read-only regions.
> 
> Arguably it would already be the case that for other reasons Xen would
> need to emulate an instruction that accesses a read-only MMIO region?

Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
p2m_mmio_direct (after all, "direct" means we expect no emulation is
needed; while arguably wrong for the introspection case, I'm not sure
that and pass-through actually go together). Hence it's down to
hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
which means that if hvm_io_intercept() can't handle it, the access
will be forwarded to the responsible DM, or be "processed" by the
internal null handler.

Given this, perhaps what you do is actually fine. At the same time
note how several functions in hvm/emulate.c simply fail upon
encountering p2m_mmio_direct. These are all REP handlers though, so
the main emulator would then try emulating the insn the non-REP way.

Jan
Roger Pau Monné April 15, 2025, 8:34 a.m. UTC | #6
On Tue, Apr 15, 2025 at 09:32:37AM +0200, Jan Beulich wrote:
> On 14.04.2025 18:13, Roger Pau Monné wrote:
> > On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
> >> On 14.04.2025 15:53, Roger Pau Monné wrote:
> >>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
> >>>> On 11.04.2025 12:54, Roger Pau Monne wrote:
> Independently (i.e. not for this patch) we may want to actually assert
> that npfec.present is set for P2M types where we demand that to always
> be the case. (I wouldn't be too surprised if we actually found such an
> assertion to trigger.)
> 
> >>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
> >>>> p2m_mmio_direct.
> >>>
> >>> But that won't go into hvm_hap_nested_page_fault() when using
> >>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
> >>> mapped as p2m_mmio_direct)?
> >>>
> >>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
> >>> differently?
> >>
> >> All true as long as things work as expected (potentially including the guest
> >> also behaving as expected). Also this was explicitly only an example I could
> >> readily think of. I'm simply wary of handle_mmio_with_translation() now
> >> getting things to handle it's not meant to ever see.
> > 
> > How was access to MMIO r/o regions supposed to be handled before
> > 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
> > added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
> > (~2010), yet I can't figure out how writes would be handled back then
> > that didn't result in a p2m fault and crashing of the domain.
> 
> Was that handled at all before said change?

Not really AFAICT, hence me wondering how write accesses to r/o
MMIO regions were supposed to be handled for (non-priv) domains.  Was
the expectation that those writes trigger a p2m violation thus
crashing the domain?

> mmio_ro_do_page_fault() was
> (and still is) invoked for the hardware domain only, and quite likely
> the need for handling (discarding) writes for PVHv1 had been overlooked
> until someone was hit by the lack thereof.

I see, I didn't realize r/o MMIO was only handled for PV hardware
domains.  I could arguably do the same for HVM in
hvm_hap_nested_page_fault().

Not sure whether the subpage stuff is supposed to be functional for
domains different than the hardware domain?  It seems to be available
to the hardware domain only for PV guests, while for HVM it is
available for both PV and HVM domains:

is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)

In hvm_hap_nested_page_fault().

> > I'm happy to look at other ways to handling this, but given there's
> > current logic for handling accesses to read-only regions in
> > hvm_hap_nested_page_fault() I think re-using that was the best way to
> > also handle accesses to MMIO read-only regions.
> > 
> > Arguably it would already be the case that for other reasons Xen would
> > need to emulate an instruction that accesses a read-only MMIO region?
> 
> Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
> p2m_mmio_direct (after all, "direct" means we expect no emulation is
> needed; while arguably wrong for the introspection case, I'm not sure
> that and pass-through actually go together). Hence it's down to
> hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
> hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
> which means that if hvm_io_intercept() can't handle it, the access
> will be forwarded to the responsible DM, or be "processed" by the
> internal null handler.
> 
> Given this, perhaps what you do is actually fine. At the same time
> note how several functions in hvm/emulate.c simply fail upon
> encountering p2m_mmio_direct. These are all REP handlers though, so
> the main emulator would then try emulating the insn the non-REP way.

I'm open to alternative ways of handling such accesses, just used what
seemed more natural in the context of hvm_hap_nested_page_fault().

Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
perspective, that would "just" result in the guest getting a #GP
injected.

Would you like me to add some of your reasoning above to the commit
message?

Thanks, Roger.
Jan Beulich April 15, 2025, 9:41 a.m. UTC | #7
On 15.04.2025 10:34, Roger Pau Monné wrote:
> On Tue, Apr 15, 2025 at 09:32:37AM +0200, Jan Beulich wrote:
>> On 14.04.2025 18:13, Roger Pau Monné wrote:
>>> On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
>>>> On 14.04.2025 15:53, Roger Pau Monné wrote:
>>>>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
>>>>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
>>>>>> p2m_mmio_direct.
>>>>>
>>>>> But that won't go into hvm_hap_nested_page_fault() when using
>>>>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
>>>>> mapped as p2m_mmio_direct)?
>>>>>
>>>>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
>>>>> differently?
>>>>
>>>> All true as long as things work as expected (potentially including the guest
>>>> also behaving as expected). Also this was explicitly only an example I could
>>>> readily think of. I'm simply wary of handle_mmio_with_translation() now
>>>> getting things to handle it's not meant to ever see.
>>>
>>> How was access to MMIO r/o regions supposed to be handled before
>>> 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
>>> added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
>>> (~2010), yet I can't figure out how writes would be handled back then
>>> that didn't result in a p2m fault and crashing of the domain.
>>
>> Was that handled at all before said change?
> 
> Not really AFAICT, hence me wondering how where write accesses to r/o
> MMIO regions supposed to be handled by (non-priv) domains.  Was the
> expectation that those writes trigger an p2m violation thus crashing
> the domain?

I think so, yes. Devices with such special areas weren't (aren't?) supposed
to be handed to DomU-s.

>> mmio_ro_do_page_fault() was
>> (and still is) invoked for the hardware domain only, and quite likely
>> the need for handling (discarding) writes for PVHv1 had been overlooked
>> until someone was hit by the lack thereof.
> 
> I see, I didn't realize r/o MMIO was only handled for PV hardware
> domains only.  I could arguably do the same for HVM in
> hvm_hap_nested_page_fault().
> 
> Not sure whether the subpage stuff is supposed to be functional for
> domains different than the hardware domain?  It seems to be available
> to the hanrdware domain only for PV guests, while for HVM is available
> for both PV and HVM domains:

DYM Dom0 and DomU here?

> is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)
> 
> In hvm_hap_nested_page_fault().

See the three XHCI_SHARE_* modes. When it's XHCI_SHARE_ANY, even DomU-s
would require this handling. It looks like a mistake that we permit the
path to be taken for DomU-s even when the mode is XHCI_SHARE_HWDOM. It
also looks like a mistake that the PV path has remained Dom0-only, even
in the XHCI_SHARE_ANY case. Cc-ing Marek once again ...

>>> I'm happy to look at other ways to handling this, but given there's
>>> current logic for handling accesses to read-only regions in
>>> hvm_hap_nested_page_fault() I think re-using that was the best way to
>>> also handle accesses to MMIO read-only regions.
>>>
>>> Arguably it would already be the case that for other reasons Xen would
>>> need to emulate an instruction that accesses a read-only MMIO region?
>>
>> Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
>> p2m_mmio_direct (after all, "direct" means we expect no emulation is
>> needed; while arguably wrong for the introspection case, I'm not sure
>> that and pass-through actually go together). Hence it's down to
>> hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
>> hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
>> which means that if hvm_io_intercept() can't handle it, the access
>> will be forwarded to the responsible DM, or be "processed" by the
>> internal null handler.
>>
>> Given this, perhaps what you do is actually fine. At the same time
>> note how several functions in hvm/emulate.c simply fail upon
>> encountering p2m_mmio_direct. These are all REP handlers though, so
>> the main emulator would then try emulating the insn the non-REP way.
> 
> I'm open to alternative ways of handling such accesses, just used what
> seemed more natural in the context of hvm_hap_nested_page_fault().
> 
> Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
> perspective, that would "just" result in the guest getting a #GP
> injected.

That's not the part I'm worried about. What worries me is that we open up
another (or better: we're widening a) way to hit the emulator in the first
place. (Plus, as said, the issue with the not really tidy P2M type system.)

> Would you like me to add some of your reasoning above to the commit
> message?

While I'd still be a little hesitant as to ack-ing of the result, I think
that's all I'm really asking for, yes.

Jan
Roger Pau Monné April 15, 2025, 10:04 a.m. UTC | #8
On Tue, Apr 15, 2025 at 11:41:27AM +0200, Jan Beulich wrote:
> On 15.04.2025 10:34, Roger Pau Monné wrote:
> > On Tue, Apr 15, 2025 at 09:32:37AM +0200, Jan Beulich wrote:
> >> On 14.04.2025 18:13, Roger Pau Monné wrote:
> >>> On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
> >>>> On 14.04.2025 15:53, Roger Pau Monné wrote:
> >>>>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
> >>>>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
> >>>>>> p2m_mmio_direct.
> >>>>>
> >>>>> But that won't go into hvm_hap_nested_page_fault() when using
> >>>>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
> >>>>> mapped as p2m_mmio_direct)?
> >>>>>
> >>>>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
> >>>>> differently?
> >>>>
> >>>> All true as long as things work as expected (potentially including the guest
> >>>> also behaving as expected). Also this was explicitly only an example I could
> >>>> readily think of. I'm simply wary of handle_mmio_with_translation() now
> >>>> getting things to handle it's not meant to ever see.
> >>>
> >>> How was access to MMIO r/o regions supposed to be handled before
> >>> 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
> >>> added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
> >>> (~2010), yet I can't figure out how writes would be handled back then
> >>> that didn't result in a p2m fault and crashing of the domain.
> >>
> >> Was that handled at all before said change?
> > 
> > Not really AFAICT, hence me wondering how where write accesses to r/o
> > MMIO regions supposed to be handled by (non-priv) domains.  Was the
> > expectation that those writes trigger an p2m violation thus crashing
> > the domain?
> 
> I think so, yes. Devices with such special areas weren't (aren't?) supposed
> to be handed to DomU-s.

Oh, I see.  That makes stuff a bit clearer.  I think we would then
also want to add some checks to {ept_}p2m_type_to_flags()?

I wonder why handling of mmio_ro_ranges was added to the HVM p2m code
in ~2010 then.  If mmio_ro_ranges is only supposed to be relevant for
the hardware domain in ~2010 an HVM dom0 was not even in sight?

Sorry to ask so many questions, I'm a bit confused about how this
was/is supposed to work.

> >> mmio_ro_do_page_fault() was
> >> (and still is) invoked for the hardware domain only, and quite likely
> >> the need for handling (discarding) writes for PVHv1 had been overlooked
> >> until someone was hit by the lack thereof.
> > 
> > I see, I didn't realize r/o MMIO was only handled for PV hardware
> > domains only.  I could arguably do the same for HVM in
> > hvm_hap_nested_page_fault().
> > 
> > Not sure whether the subpage stuff is supposed to be functional for
> > domains different than the hardware domain?  It seems to be available
> > to the hanrdware domain only for PV guests, while for HVM is available
> > for both PV and HVM domains:
> 
> DYM Dom0 and DomU here?

Indeed, sorry.

> > is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)
> > 
> > In hvm_hap_nested_page_fault().
> 
> See the three XHCI_SHARE_* modes. When it's XHCI_SHARE_ANY, even DomU-s
> would require this handling. It looks like a mistake that we permit the
> path to be taken for DomU-s even when the mode is XHCI_SHARE_HWDOM.

Arguably a domU will never get the device assigned in the first place
unless the share mode is set to XHCI_SHARE_ANY.  For the other modes
the device is hidden, and hence couldn't be assigned to a domU anyway.

> It
> also looks like a mistake that the PV path has remained Dom0-only, even
> in the XHCI_SHARE_ANY case. Cc-ing Marek once again ...
> 
> >>> I'm happy to look at other ways to handling this, but given there's
> >>> current logic for handling accesses to read-only regions in
> >>> hvm_hap_nested_page_fault() I think re-using that was the best way to
> >>> also handle accesses to MMIO read-only regions.
> >>>
> >>> Arguably it would already be the case that for other reasons Xen would
> >>> need to emulate an instruction that accesses a read-only MMIO region?
> >>
> >> Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
> >> p2m_mmio_direct (after all, "direct" means we expect no emulation is
> >> needed; while arguably wrong for the introspection case, I'm not sure
> >> that and pass-through actually go together). Hence it's down to
> >> hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
> >> hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
> >> which means that if hvm_io_intercept() can't handle it, the access
> >> will be forwarded to the responsible DM, or be "processed" by the
> >> internal null handler.
> >>
> >> Given this, perhaps what you do is actually fine. At the same time
> >> note how several functions in hvm/emulate.c simply fail upon
> >> encountering p2m_mmio_direct. These are all REP handlers though, so
> >> the main emulator would then try emulating the insn the non-REP way.
> > 
> > I'm open to alternative ways of handling such accesses, just used what
> > seemed more natural in the context of hvm_hap_nested_page_fault().
> > 
> > Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
> > perspective, that would "just" result in the guest getting a #GP
> > injected.
> 
> That's not the part I'm worried about. What worries me is that we open up
> another (or better: we're widening a) way to hit the emulator in the first
> place. (Plus, as said, the issue with the not really tidy P2M type system.)

But the hit would be limited to domains having r/o p2m_mmio_direct
entries in the p2m, as otherwise the path would be unreachable?

> > Would you like me to add some of your reasoning above to the commit
> > message?
> 
> While I'd still be a little hesitant as to ack-ing of the result, I think
> that's all I'm really asking for, yes.

As said before - I'm happy to consider suggestions here, I don't want
to fix this with yet another bodge that will cause us further issues
down the road.  What I proposed seemed like the most natural way to
handle those accesses IMO, but I'm not an expert on the emulator.

Thanks, Roger.
Jan Beulich April 15, 2025, 10:18 a.m. UTC | #9
On 15.04.2025 12:04, Roger Pau Monné wrote:
> On Tue, Apr 15, 2025 at 11:41:27AM +0200, Jan Beulich wrote:
>> On 15.04.2025 10:34, Roger Pau Monné wrote:
>>> On Tue, Apr 15, 2025 at 09:32:37AM +0200, Jan Beulich wrote:
>>>> On 14.04.2025 18:13, Roger Pau Monné wrote:
>>>>> On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
>>>>>> On 14.04.2025 15:53, Roger Pau Monné wrote:
>>>>>>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
>>>>>>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
>>>>>>>> p2m_mmio_direct.
>>>>>>>
>>>>>>> But that won't go into hvm_hap_nested_page_fault() when using
>>>>>>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
>>>>>>> mapped as p2m_mmio_direct)?
>>>>>>>
>>>>>>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
>>>>>>> differently?
>>>>>>
>>>>>> All true as long as things work as expected (potentially including the guest
>>>>>> also behaving as expected). Also this was explicitly only an example I could
>>>>>> readily think of. I'm simply wary of handle_mmio_with_translation() now
>>>>>> getting things to handle it's not meant to ever see.
>>>>>
>>>>> How was access to MMIO r/o regions supposed to be handled before
>>>>> 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
>>>>> added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
>>>>> (~2010), yet I can't figure out how writes would be handled back then
>>>>> that didn't result in a p2m fault and crashing of the domain.
>>>>
>>>> Was that handled at all before said change?
>>>
>>> Not really AFAICT, hence me wondering how where write accesses to r/o
>>> MMIO regions supposed to be handled by (non-priv) domains.  Was the
>>> expectation that those writes trigger an p2m violation thus crashing
>>> the domain?
>>
>> I think so, yes. Devices with such special areas weren't (aren't?) supposed
>> to be handed to DomU-s.
> 
> Oh, I see.  That makes stuff a bit clearer.  I think we would then
> also want to add some checks to {ept_}p2m_type_to_flags()?
> 
> I wonder why handling of mmio_ro_ranges was added to the HVM p2m code
> in ~2010 then.  If mmio_ro_ranges is only supposed to be relevant for
> the hardware domain in ~2010 an HVM dom0 was not even in sight?

I fear because I was wrong with what I said in the earlier reply: There's
one exception - the MSI-X tables of devices. DomU-s (and even Dom0) aren't
supposed to access them directly, but we'd permit reads (which, at least
back at the time, were also required to keep qemu working).

> Sorry to ask so many questions, I'm a bit confused about how this
> was/is supposed to work.

No worries - as you can see, I'm not getting it quite straight either.

>>>> mmio_ro_do_page_fault() was
>>>> (and still is) invoked for the hardware domain only, and quite likely
>>>> the need for handling (discarding) writes for PVHv1 had been overlooked
>>>> until someone was hit by the lack thereof.
>>>
>>> I see, I didn't realize r/o MMIO was only handled for PV hardware
>>> domains only.  I could arguably do the same for HVM in
>>> hvm_hap_nested_page_fault().
>>>
>>> Not sure whether the subpage stuff is supposed to be functional for
>>> domains different than the hardware domain?  It seems to be available
>>> to the hanrdware domain only for PV guests, while for HVM is available
>>> for both PV and HVM domains:
>>
>> DYM Dom0 and DomU here?
> 
> Indeed, sorry.
> 
>>> is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)
>>>
>>> In hvm_hap_nested_page_fault().
>>
>> See the three XHCI_SHARE_* modes. When it's XHCI_SHARE_ANY, even DomU-s
>> would require this handling. It looks like a mistake that we permit the
>> path to be taken for DomU-s even when the mode is XHCI_SHARE_HWDOM.
> 
> Arguable a domU will never get the device assigned in the first place
> unless the share mode is set to XHCI_SHARE_ANY.  For the other modes
> the device is hidden, and hence couldn't be assigned to a domU anyway.

Correct. Yet then we permit a code path to be taken which is supposedly
unnecessary, but potentially (if something went wrong) harmful.

>>>>> I'm happy to look at other ways to handling this, but given there's
>>>>> current logic for handling accesses to read-only regions in
>>>>> hvm_hap_nested_page_fault() I think re-using that was the best way to
>>>>> also handle accesses to MMIO read-only regions.
>>>>>
>>>>> Arguably it would already be the case that for other reasons Xen would
>>>>> need to emulate an instruction that accesses a read-only MMIO region?
>>>>
>>>> Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
>>>> p2m_mmio_direct (after all, "direct" means we expect no emulation is
>>>> needed; while arguably wrong for the introspection case, I'm not sure
>>>> that and pass-through actually go together). Hence it's down to
>>>> hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
>>>> hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
>>>> which means that if hvm_io_intercept() can't handle it, the access
>>>> will be forwarded to the responsible DM, or be "processed" by the
>>>> internal null handler.
>>>>
>>>> Given this, perhaps what you do is actually fine. At the same time
>>>> note how several functions in hvm/emulate.c simply fail upon
>>>> encountering p2m_mmio_direct. These are all REP handlers though, so
>>>> the main emulator would then try emulating the insn the non-REP way.
>>>
>>> I'm open to alternative ways of handling such accesses, just used what
>>> seemed more natural in the context of hvm_hap_nested_page_fault().
>>>
>>> Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
>>> perspective, that would "just" result in the guest getting a #GP
>>> injected.
>>
>> That's not the part I'm worried about. What worries me is that we open up
>> another (or better: we're widening a) way to hit the emulator in the first
>> place. (Plus, as said, the issue with the not really tidy P2M type system.)
> 
> But the hit would be limited to domains having r/o p2m_mmio_direct
> entries in the p2m, as otherwise the path would be unreachable?

I fear I don't follow - all you look for in the newly extended conditional
is the type being p2m_mmio_direct. There's no r/o-ness being checked for
until we'd make it through the emulator and into subpage_mmio_accept().

Jan
Marek Marczykowski-Górecki April 15, 2025, 10:40 a.m. UTC | #10
On Tue, Apr 15, 2025 at 12:18:04PM +0200, Jan Beulich wrote:
> On 15.04.2025 12:04, Roger Pau Monné wrote:
> > On Tue, Apr 15, 2025 at 11:41:27AM +0200, Jan Beulich wrote:
> >> On 15.04.2025 10:34, Roger Pau Monné wrote:
> >>> On Tue, Apr 15, 2025 at 09:32:37AM +0200, Jan Beulich wrote:
> >>>> On 14.04.2025 18:13, Roger Pau Monné wrote:
> >>>>> On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
> >>>>>> On 14.04.2025 15:53, Roger Pau Monné wrote:
> >>>>>>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
> >>>>>>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
> >>>>>>>> p2m_mmio_direct.
> >>>>>>>
> >>>>>>> But that won't go into hvm_hap_nested_page_fault() when using
> >>>>>>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
> >>>>>>> mapped as p2m_mmio_direct)?
> >>>>>>>
> >>>>>>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
> >>>>>>> differently?
> >>>>>>
> >>>>>> All true as long as things work as expected (potentially including the guest
> >>>>>> also behaving as expected). Also this was explicitly only an example I could
> >>>>>> readily think of. I'm simply wary of handle_mmio_with_translation() now
> >>>>>> getting things to handle it's not meant to ever see.
> >>>>>
> >>>>> How was access to MMIO r/o regions supposed to be handled before
> >>>>> 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
> >>>>> added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
> >>>>> (~2010), yet I can't figure out how writes would be handled back then
> >>>>> that didn't result in a p2m fault and crashing of the domain.
> >>>>
> >>>> Was that handled at all before said change?
> >>>
> >>> Not really AFAICT, hence me wondering how where write accesses to r/o
> >>> MMIO regions supposed to be handled by (non-priv) domains.  Was the
> >>> expectation that those writes trigger an p2m violation thus crashing
> >>> the domain?
> >>
> >> I think so, yes. Devices with such special areas weren't (aren't?) supposed
> >> to be handed to DomU-s.
> > 
> > Oh, I see.  That makes stuff a bit clearer.  I think we would then
> > also want to add some checks to {ept_}p2m_type_to_flags()?
> > 
> > I wonder why handling of mmio_ro_ranges was added to the HVM p2m code
> > in ~2010 then.  If mmio_ro_ranges is only supposed to be relevant for
> > the hardware domain in ~2010 an HVM dom0 was not even in sight?
> 
> I fear because I was wrong with what I said in the earlier reply: There's
> one exception - the MSI-X tables of devices. DomU-s (and even Dom0) aren't
> supposed to access them directly, but we'd permit reads (which, at least
> back at the time, were also required to keep qemu working).

And there is also a case where some devices have other registers on the
same page as MSI-X tables. But this case is handled specially in the
MSI-X code, not via sub-page R/O API.

> > Sorry to ask so many questions, I'm a bit confused about how this
> > was/is supposed to work.
> 
> No worries - as you can see, I'm not getting it quite straight either.
> 
> >>>> mmio_ro_do_page_fault() was
> >>>> (and still is) invoked for the hardware domain only, and quite likely
> >>>> the need for handling (discarding) writes for PVHv1 had been overlooked
> >>>> until someone was hit by the lack thereof.
> >>>
> >>> I see, I didn't realize r/o MMIO was only handled for PV hardware
> >>> domains only.  I could arguably do the same for HVM in
> >>> hvm_hap_nested_page_fault().
> >>>
> >>> Not sure whether the subpage stuff is supposed to be functional for
> >>> domains different than the hardware domain?  It seems to be available
> >>> to the hanrdware domain only for PV guests, while for HVM is available
> >>> for both PV and HVM domains:
> >>
> >> DYM Dom0 and DomU here?
> > 
> > Indeed, sorry.

I'm not sure about the PV case and domU. I think I tested it at some
iteration, but it isn't configuration that I care much about. If it
doesn't work (and fixing it would make it even more complex), IMO we can
simply adjust documentation of XHCI_SHARE_ANY to say it works only with
HVM domU.

The domU case exists mostly (only?) to enable automated testing. I do a
lot of that on laptops, which have only a single USB controller (no way
to plug any extra one), and I need that USB controller in a domU for
several tests. In fact, the XHCI console is a debugging feature in the
first place. So, the domU part doesn't need security support, can
require extra hoops to jump through etc.

> >>> is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)
> >>>
> >>> In hvm_hap_nested_page_fault().
> >>
> >> See the three XHCI_SHARE_* modes. When it's XHCI_SHARE_ANY, even DomU-s
> >> would require this handling. It looks like a mistake that we permit the
> >> path to be taken for DomU-s even when the mode is XHCI_SHARE_HWDOM.
> > 
> > Arguable a domU will never get the device assigned in the first place
> > unless the share mode is set to XHCI_SHARE_ANY.  For the other modes
> > the device is hidden, and hence couldn't be assigned to a domU anyway.
> 
> Correct. Yet then we permit a code path to be taken which is supposedly
> unnecessary, but potentially (if something went wrong) harmful.

Since the XHCI_SHARE_ANY case is rare (and not security-supported),
maybe there should be a global variable guarding this part? It would be
set to true only if XHCI_SHARE_ANY is used (or some future use of this
subpage-ro API with a domU). Then, that code would still be potentially
reachable for all domUs (if XHCI_SHARE_ANY is used), but that's still
better?
Anyway, I'm still not sure what the concern is. What is the (not purely
theoretical) case where domU gains access to the emulator, where without
this feature it wouldn't have it already? Any HVM can hit the emulator
already, regardless of this feature, no?

> >>>>> I'm happy to look at other ways to handling this, but given there's
> >>>>> current logic for handling accesses to read-only regions in
> >>>>> hvm_hap_nested_page_fault() I think re-using that was the best way to
> >>>>> also handle accesses to MMIO read-only regions.
> >>>>>
> >>>>> Arguably it would already be the case that for other reasons Xen would
> >>>>> need to emulate an instruction that accesses a read-only MMIO region?
> >>>>
> >>>> Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
> >>>> p2m_mmio_direct (after all, "direct" means we expect no emulation is
> >>>> needed; while arguably wrong for the introspection case, I'm not sure
> >>>> that and pass-through actually go together). Hence it's down to
> >>>> hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
> >>>> hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
> >>>> which means that if hvm_io_intercept() can't handle it, the access
> >>>> will be forwarded to the responsible DM, or be "processed" by the
> >>>> internal null handler.
> >>>>
> >>>> Given this, perhaps what you do is actually fine. At the same time
> >>>> note how several functions in hvm/emulate.c simply fail upon
> >>>> encountering p2m_mmio_direct. These are all REP handlers though, so
> >>>> the main emulator would then try emulating the insn the non-REP way.
> >>>
> >>> I'm open to alternative ways of handling such accesses, just used what
> >>> seemed more natural in the context of hvm_hap_nested_page_fault().
> >>>
> >>> Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
> >>> perspective, that would "just" result in the guest getting a #GP
> >>> injected.
> >>
> >> That's not the part I'm worried about. What worries me is that we open up
> >> another (or better: we're widening a) way to hit the emulator in the first
> >> place. (Plus, as said, the issue with the not really tidy P2M type system.)
> > 
> > But the hit would be limited to domains having r/o p2m_mmio_direct
> > entries in the p2m, as otherwise the path would be unreachable?
> 
> I fear I don't follow - all you look for in the newly extended conditional
> is the type being p2m_mmio_direct. There's no r/o-ness being checked for
> until we'd make it through the emulator and into subpage_mmio_accept().

But EPT violation can be hit on p2m_mmio_direct page only if it's a
write and the page is read-only, no? Is there any other case that exists
today?
Jan Beulich April 15, 2025, 10:50 a.m. UTC | #11
On 15.04.2025 12:40, Marek Marczykowski wrote:
> On Tue, Apr 15, 2025 at 12:18:04PM +0200, Jan Beulich wrote:
>> On 15.04.2025 12:04, Roger Pau Monné wrote:
>>> On Tue, Apr 15, 2025 at 11:41:27AM +0200, Jan Beulich wrote:
>>>> On 15.04.2025 10:34, Roger Pau Monné wrote:
>>>>> Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
>>>>> perspective, that would "just" result in the guest getting a #GP
>>>>> injected.
>>>>
>>>> That's not the part I'm worried about. What worries me is that we open up
>>>> another (or better: we're widening a) way to hit the emulator in the first
>>>> place. (Plus, as said, the issue with the not really tidy P2M type system.)
>>>
>>> But the hit would be limited to domains having r/o p2m_mmio_direct
>>> entries in the p2m, as otherwise the path would be unreachable?
>>
>> I fear I don't follow - all you look for in the newly extended conditional
>> is the type being p2m_mmio_direct. There's no r/o-ness being checked for
>> until we'd make it through the emulator and into subpage_mmio_accept().
> 
> But EPT violation can be hit on p2m_mmio_direct page only if it's a
> write and the page is read-only, no? Is there any other case that exists
> today?

Today and if everything works as it should - yes, I think so.

Jan
Roger Pau Monné April 15, 2025, 12:47 p.m. UTC | #12
On Tue, Apr 15, 2025 at 12:18:04PM +0200, Jan Beulich wrote:
> On 15.04.2025 12:04, Roger Pau Monné wrote:
> > On Tue, Apr 15, 2025 at 11:41:27AM +0200, Jan Beulich wrote:
> >> On 15.04.2025 10:34, Roger Pau Monné wrote:
> >>> On Tue, Apr 15, 2025 at 09:32:37AM +0200, Jan Beulich wrote:
> >>>> On 14.04.2025 18:13, Roger Pau Monné wrote:
> >>>>> On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
> >>>>>> On 14.04.2025 15:53, Roger Pau Monné wrote:
> >>>>>>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
> >>>>>>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
> >>>>>>>> p2m_mmio_direct.
> >>>>>>>
> >>>>>>> But that won't go into hvm_hap_nested_page_fault() when using
> >>>>>>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
> >>>>>>> mapped as p2m_mmio_direct)?
> >>>>>>>
> >>>>>>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
> >>>>>>> differently?
> >>>>>>
> >>>>>> All true as long as things work as expected (potentially including the guest
> >>>>>> also behaving as expected). Also this was explicitly only an example I could
> >>>>>> readily think of. I'm simply wary of handle_mmio_with_translation() now
> >>>>>> getting things to handle it's not meant to ever see.
> >>>>>
> >>>>> How was access to MMIO r/o regions supposed to be handled before
> >>>>> 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
> >>>>> added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
> >>>>> (~2010), yet I can't figure out how writes would be handled back then
> >>>>> that didn't result in a p2m fault and crashing of the domain.
> >>>>
> >>>> Was that handled at all before said change?
> >>>
> >>> Not really AFAICT, hence me wondering how where write accesses to r/o
> >>> MMIO regions supposed to be handled by (non-priv) domains.  Was the
> >>> expectation that those writes trigger an p2m violation thus crashing
> >>> the domain?
> >>
> >> I think so, yes. Devices with such special areas weren't (aren't?) supposed
> >> to be handed to DomU-s.
> > 
> > Oh, I see.  That makes stuff a bit clearer.  I think we would then
> > also want to add some checks to {ept_}p2m_type_to_flags()?
> > 
> > I wonder why handling of mmio_ro_ranges was added to the HVM p2m code
> > in ~2010 then.  If mmio_ro_ranges is only supposed to be relevant for
> > the hardware domain in ~2010 an HVM dom0 was not even in sight?
> 
> I fear because I was wrong with what I said in the earlier reply: There's
> one exception - the MSI-X tables of devices. DomU-s (and even Dom0) aren't
> supposed to access them directly, but we'd permit reads (which, at least
> back at the time, were also required to keep qemu working).

Hm, but reads to the MSI-X table for HVM domains will go through QEMU,
and hence not hit the r/o MMIO path, because the MSI-X table will
never be mapped to an HVM guest p2m?

Reads from QEMU are indeed different, but those were always made from
a PV domain.  As said above - HVM guests never got to see the native
MSI-X table at all.

> >>>>> I'm happy to look at other ways to handling this, but given there's
> >>>>> current logic for handling accesses to read-only regions in
> >>>>> hvm_hap_nested_page_fault() I think re-using that was the best way to
> >>>>> also handle accesses to MMIO read-only regions.
> >>>>>
> >>>>> Arguably it would already be the case that for other reasons Xen would
> >>>>> need to emulate an instruction that accesses a read-only MMIO region?
> >>>>
> >>>> Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
> >>>> p2m_mmio_direct (after all, "direct" means we expect no emulation is
> >>>> needed; while arguably wrong for the introspection case, I'm not sure
> >>>> that and pass-through actually go together). Hence it's down to
> >>>> hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
> >>>> hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
> >>>> which means that if hvm_io_intercept() can't handle it, the access
> >>>> will be forwarded to the responsible DM, or be "processed" by the
> >>>> internal null handler.
> >>>>
> >>>> Given this, perhaps what you do is actually fine. At the same time
> >>>> note how several functions in hvm/emulate.c simply fail upon
> >>>> encountering p2m_mmio_direct. These are all REP handlers though, so
> >>>> the main emulator would then try emulating the insn the non-REP way.
> >>>
> >>> I'm open to alternative ways of handling such accesses, just used what
> >>> seemed more natural in the context of hvm_hap_nested_page_fault().
> >>>
> >>> Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
> >>> perspective, that would "just" result in the guest getting a #GP
> >>> injected.
> >>
> >> That's not the part I'm worried about. What worries me is that we open up
> >> another (or better: we're widening a) way to hit the emulator in the first
> >> place. (Plus, as said, the issue with the not really tidy P2M type system.)
> > 
> > But the hit would be limited to domains having r/o p2m_mmio_direct
> > entries in the p2m, as otherwise the path would be unreachable?
> 
> I fear I don't follow - all you look for in the newly extended conditional
> is the type being p2m_mmio_direct. There's no r/o-ness being checked for
> until we'd make it through the emulator and into subpage_mmio_accept().

Well, it's a write page-fault of a type with p2m_mmio_direct.  What
about limiting the path even further by checking for mmio_ro_ranges:

    if ( (p2mt == p2m_mmio_dm) ||
         (npfec.write_access &&
          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
           /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
           (p2mt == p2m_mmio_direct &&
            rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn))))) )
    {

Thanks, Roger.
Jan Beulich April 15, 2025, 12:53 p.m. UTC | #13
On 15.04.2025 14:47, Roger Pau Monné wrote:
> On Tue, Apr 15, 2025 at 12:18:04PM +0200, Jan Beulich wrote:
>> On 15.04.2025 12:04, Roger Pau Monné wrote:
>>> On Tue, Apr 15, 2025 at 11:41:27AM +0200, Jan Beulich wrote:
>>>> On 15.04.2025 10:34, Roger Pau Monné wrote:
>>>>> On Tue, Apr 15, 2025 at 09:32:37AM +0200, Jan Beulich wrote:
>>>>>> On 14.04.2025 18:13, Roger Pau Monné wrote:
>>>>>>> On Mon, Apr 14, 2025 at 05:24:32PM +0200, Jan Beulich wrote:
>>>>>>>> On 14.04.2025 15:53, Roger Pau Monné wrote:
>>>>>>>>> On Mon, Apr 14, 2025 at 08:33:44AM +0200, Jan Beulich wrote:
>>>>>>>>>> I'm also concerned of e.g. VT-x'es APIC access MFN, which is
>>>>>>>>>> p2m_mmio_direct.
>>>>>>>>>
>>>>>>>>> But that won't go into hvm_hap_nested_page_fault() when using
>>>>>>>>> cpu_has_vmx_virtualize_apic_accesses (and thus having an APIC page
>>>>>>>>> mapped as p2m_mmio_direct)?
>>>>>>>>>
>>>>>>>>> It would instead be an EXIT_REASON_APIC_ACCESS vmexit which is handled
>>>>>>>>> differently?
>>>>>>>>
>>>>>>>> All true as long as things work as expected (potentially including the guest
>>>>>>>> also behaving as expected). Also this was explicitly only an example I could
>>>>>>>> readily think of. I'm simply wary of handle_mmio_with_translation() now
>>>>>>>> getting things to handle it's not meant to ever see.
>>>>>>>
>>>>>>> How was access to MMIO r/o regions supposed to be handled before
>>>>>>> 33c19df9a5a0 (~2015)?  I see that setting r/o MMIO p2m entries was
>>>>>>> added way before to p2m_type_to_flags() and ept_p2m_type_to_flags()
>>>>>>> (~2010), yet I can't figure out how writes would be handled back then
>>>>>>> that didn't result in a p2m fault and crashing of the domain.
>>>>>>
>>>>>> Was that handled at all before said change?
>>>>>
>>>>> Not really AFAICT, hence me wondering how write accesses to r/o
>>>>> MMIO regions were supposed to be handled by (non-priv) domains.  Was the
>>>>> expectation that those writes trigger a p2m violation thus crashing
>>>>> the domain?
>>>>
>>>> I think so, yes. Devices with such special areas weren't (aren't?) supposed
>>>> to be handed to DomU-s.
>>>
>>> Oh, I see.  That makes stuff a bit clearer.  I think we would then
>>> also want to add some checks to {ept_}p2m_type_to_flags()?
>>>
>>> I wonder why handling of mmio_ro_ranges was added to the HVM p2m code
>>> in ~2010 then.  If mmio_ro_ranges is only supposed to be relevant for
>>> the hardware domain in ~2010 an HVM dom0 was not even in sight?
>>
>> I fear because I was wrong with what I said in the earlier reply: There's
>> one exception - the MSI-X tables of devices. DomU-s (and even Dom0) aren't
>> supposed to access them directly, but we'd permit reads (which, at least
>> back at the time, were also required to keep qemu working).
> 
> Hm, but reads to the MSI-X table for HVM domains will go through QEMU,
> and hence not hit the r/o MMIO path, because the MSI-X table will
> never be mapped to an HVM guest p2m?

Right, just wanted to be on the safe side there.

>>>>>>> I'm happy to look at other ways to handling this, but given there's
>>>>>>> current logic for handling accesses to read-only regions in
>>>>>>> hvm_hap_nested_page_fault() I think re-using that was the best way to
>>>>>>> also handle accesses to MMIO read-only regions.
>>>>>>>
>>>>>>> Arguably it would already be the case that for other reasons Xen would
>>>>>>> need to emulate an instruction that accesses a read-only MMIO region?
>>>>>>
>>>>>> Aiui hvm_translate_get_page() will yield HVMTRANS_bad_gfn_to_mfn for
>>>>>> p2m_mmio_direct (after all, "direct" means we expect no emulation is
>>>>>> needed; while arguably wrong for the introspection case, I'm not sure
>>>>>> that and pass-through actually go together). Hence it's down to
>>>>>> hvmemul_linear_mmio_access() -> hvmemul_phys_mmio_access() ->
>>>>>> hvmemul_do_mmio_buffer() -> hvmemul_do_io_buffer() -> hvmemul_do_io(),
>>>>>> which means that if hvm_io_intercept() can't handle it, the access
>>>>>> will be forwarded to the responsible DM, or be "processed" by the
>>>>>> internal null handler.
>>>>>>
>>>>>> Given this, perhaps what you do is actually fine. At the same time
>>>>>> note how several functions in hvm/emulate.c simply fail upon
>>>>>> encountering p2m_mmio_direct. These are all REP handlers though, so
>>>>>> the main emulator would then try emulating the insn the non-REP way.
>>>>>
>>>>> I'm open to alternative ways of handling such accesses, just used what
>>>>> seemed more natural in the context of hvm_hap_nested_page_fault().
>>>>>
>>>>> Emulation of r/o MMIO accesses failing wouldn't be an issue from Xen's
>>>>> perspective, that would "just" result in the guest getting a #GP
>>>>> injected.
>>>>
>>>> That's not the part I'm worried about. What worries me is that we open up
>>>> another (or better: we're widening a) way to hit the emulator in the first
>>>> place. (Plus, as said, the issue with the not really tidy P2M type system.)
>>>
>>> But the hit would be limited to domains having r/o p2m_mmio_direct
>>> entries in the p2m, as otherwise the path would be unreachable?
>>
>> I fear I don't follow - all you look for in the newly extended conditional
>> is the type being p2m_mmio_direct. There's no r/o-ness being checked for
>> until we'd make it through the emulator and into subpage_mmio_accept().
> 
> Well, it's a write page-fault of a type with p2m_mmio_direct.  What
> about limiting the path even further by checking for mmio_ro_ranges:
> 
>     if ( (p2mt == p2m_mmio_dm) ||
>          (npfec.write_access &&
>           (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
>            /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
>            (p2mt == p2m_mmio_direct &&
>             rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn))))) )
>     {

Yes, this would make me quite a bit less nervous.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 9fff1b82f7c6..ed888f0b49d3 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -370,7 +370,8 @@  static int hvmemul_do_io(
         /* If there is no suitable backing DM, just ignore accesses */
         if ( !s )
         {
-            if ( is_mmio && is_hardware_domain(currd) )
+            if ( is_mmio && is_hardware_domain(currd) &&
+                 !rangeset_contains_singleton(mmio_ro_ranges, PFN_DOWN(addr)) )
             {
                 /*
                  * PVH dom0 is likely missing MMIO mappings on the p2m, due to
@@ -2856,50 +2857,6 @@  int hvm_emulate_one(
     return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops, completion);
 }
 
-int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla)
-{
-    static const struct x86_emulate_ops hvm_ro_emulate_ops_mmio = {
-        .read       = x86emul_unhandleable_rw,
-        .insn_fetch = hvmemul_insn_fetch,
-        .write      = mmio_ro_emulated_write,
-        .validate   = hvmemul_validate,
-    };
-    struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = gla, .mfn = _mfn(mfn) };
-    struct hvm_emulate_ctxt ctxt;
-    unsigned int seg, bdf;
-    int rc;
-
-    if ( pci_ro_mmcfg_decode(mfn, &seg, &bdf) )
-    {
-        /* Should be always handled by vPCI for PVH dom0. */
-        gdprintk(XENLOG_ERR, "unhandled MMCFG access for %pp\n",
-                 &PCI_SBDF(seg, bdf));
-        ASSERT_UNREACHABLE();
-        return X86EMUL_UNHANDLEABLE;
-    }
-
-    hvm_emulate_init_once(&ctxt, x86_insn_is_mem_write,
-                          guest_cpu_user_regs());
-    ctxt.ctxt.data = &mmio_ro_ctxt;
-
-    switch ( rc = _hvm_emulate_one(&ctxt, &hvm_ro_emulate_ops_mmio,
-                                   VIO_no_completion) )
-    {
-    case X86EMUL_UNHANDLEABLE:
-    case X86EMUL_UNIMPLEMENTED:
-        hvm_dump_emulation_state(XENLOG_G_WARNING, "r/o MMIO", &ctxt, rc);
-        break;
-    case X86EMUL_EXCEPTION:
-        hvm_inject_event(&ctxt.ctxt.event);
-        /* fallthrough */
-    default:
-        hvm_emulate_writeback(&ctxt);
-        break;
-    }
-
-    return rc;
-}
-
 void hvm_emulate_one_vm_event(enum emul_kind kind, unsigned int trapnr,
     unsigned int errcode)
 {
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 6f1174c5127e..21f005b0947c 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -8,6 +8,7 @@ 
  */
 
 #include <xen/init.h>
+#include <xen/io.h>
 #include <xen/ioreq.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -35,7 +36,6 @@ 
 #include <asm/current.h>
 #include <asm/debugreg.h>
 #include <asm/e820.h>
-#include <asm/io.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -585,9 +585,81 @@  static int cf_check hvm_print_line(
     return X86EMUL_OKAY;
 }
 
+static int cf_check subpage_mmio_accept(struct vcpu *v, unsigned long addr)
+{
+    p2m_type_t t;
+    mfn_t mfn = get_gfn_query_unlocked(v->domain, addr, &t);
+
+    return !mfn_eq(mfn, INVALID_MFN) && t == p2m_mmio_direct &&
+           !!subpage_mmio_find_page(mfn);
+}
+
+static int cf_check subpage_mmio_read(
+    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
+{
+    struct domain *d = v->domain;
+    p2m_type_t t;
+    mfn_t mfn = get_gfn_query(d, addr, &t);
+    struct subpage_ro_range *entry;
+    volatile void __iomem *mem;
+
+    *data = ~0UL;
+
+    if ( mfn_eq(mfn, INVALID_MFN) || t != p2m_mmio_direct )
+    {
+        put_gfn(d, addr);
+        return X86EMUL_RETRY;
+    }
+
+    entry = subpage_mmio_find_page(mfn);
+    if ( !entry )
+    {
+        put_gfn(d, addr);
+        return X86EMUL_RETRY;
+    }
+
+    mem = subpage_mmio_map_page(entry);
+    if ( !mem )
+    {
+        put_gfn(d, addr);
+        gprintk(XENLOG_ERR, "Failed to map page for MMIO read at %#lx\n",
+                mfn_to_maddr(mfn) + PAGE_OFFSET(addr));
+        return X86EMUL_OKAY;
+    }
+
+    *data = read_mmio(mem + PAGE_OFFSET(addr), len);
+
+    put_gfn(d, addr);
+    return X86EMUL_OKAY;
+}
+
+static int cf_check subpage_mmio_write(
+    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long data)
+{
+    struct domain *d = v->domain;
+    p2m_type_t t;
+    mfn_t mfn = get_gfn_query(d, addr, &t);
+
+    if ( mfn_eq(mfn, INVALID_MFN) || t != p2m_mmio_direct )
+    {
+        put_gfn(d, addr);
+        return X86EMUL_RETRY;
+    }
+
+    subpage_mmio_write_emulate(mfn, PAGE_OFFSET(addr), data, len);
+
+    put_gfn(d, addr);
+    return X86EMUL_OKAY;
+}
+
 int hvm_domain_initialise(struct domain *d,
                           const struct xen_domctl_createdomain *config)
 {
+    static const struct hvm_mmio_ops subpage_mmio_ops = {
+        .check = subpage_mmio_accept,
+        .read = subpage_mmio_read,
+        .write = subpage_mmio_write,
+    };
     unsigned int nr_gsis;
     int rc;
 
@@ -692,6 +764,9 @@  int hvm_domain_initialise(struct domain *d,
 
     register_portio_handler(d, XEN_HVM_DEBUGCONS_IOPORT, 1, hvm_print_line);
 
+    /* Handler for r/o MMIO subpage accesses. */
+    register_mmio_handler(d, &subpage_mmio_ops);
+
     if ( hvm_tsc_scaling_supported )
         d->arch.hvm.tsc_scaling_ratio = hvm_default_tsc_scaling_ratio;
 
@@ -1981,7 +2056,9 @@  int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
      */
     if ( (p2mt == p2m_mmio_dm) ||
          (npfec.write_access &&
-          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
+          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server) ||
+           /* MMIO entries can be r/o if the target mfn is in mmio_ro_ranges. */
+           (p2mt == p2m_mmio_direct))) )
     {
         if ( !handle_mmio_with_translation(gla, gfn, npfec) )
             hvm_inject_hw_exception(X86_EXC_GP, 0);
@@ -2033,14 +2110,6 @@  int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
         goto out_put_gfn;
     }
 
-    if ( (p2mt == p2m_mmio_direct) && npfec.write_access && npfec.present &&
-         (is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)) &&
-         (hvm_emulate_one_mmio(mfn_x(mfn), gla) == X86EMUL_OKAY) )
-    {
-        rc = 1;
-        goto out_put_gfn;
-    }
-
     /* If we fell through, the vcpu will retry now that access restrictions have
      * been removed. It may fault again if the p2m entry type still requires so.
      * Otherwise, this is an error condition. */
diff --git a/xen/arch/x86/include/asm/hvm/emulate.h b/xen/arch/x86/include/asm/hvm/emulate.h
index c7a2d2a5be4e..178ac32e151f 100644
--- a/xen/arch/x86/include/asm/hvm/emulate.h
+++ b/xen/arch/x86/include/asm/hvm/emulate.h
@@ -86,7 +86,6 @@  void hvmemul_cancel(struct vcpu *v);
 struct segment_register *hvmemul_get_seg_reg(
     enum x86_segment seg,
     struct hvm_emulate_ctxt *hvmemul_ctxt);
-int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla);
 
 static inline bool handle_mmio(void)
 {
diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h
index a1bc8cc27451..c2e9ef6e5023 100644
--- a/xen/arch/x86/include/asm/mm.h
+++ b/xen/arch/x86/include/asm/mm.h
@@ -554,6 +554,18 @@  int cf_check mmio_ro_emulated_write(
     enum x86_segment seg, unsigned long offset, void *p_data,
     unsigned int bytes, struct x86_emulate_ctxt *ctxt);
 
+/* r/o MMIO subpage access handlers. */
+struct subpage_ro_range {
+    struct list_head list;
+    mfn_t mfn;
+    void __iomem *mapped;
+    DECLARE_BITMAP(ro_elems, PAGE_SIZE / MMIO_RO_SUBPAGE_GRAN);
+};
+struct subpage_ro_range *subpage_mmio_find_page(mfn_t mfn);
+void __iomem *subpage_mmio_map_page(struct subpage_ro_range *entry);
+void subpage_mmio_write_emulate(
+    mfn_t mfn, unsigned int offset, unsigned long data, unsigned int len);
+
 int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
 
 extern int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 989e62e7ce6f..f59c7816fba5 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -157,13 +157,6 @@  struct rangeset *__read_mostly mmio_ro_ranges;
 static uint32_t __ro_after_init base_disallow_mask;
 
 /* Handling sub-page read-only MMIO regions */
-struct subpage_ro_range {
-    struct list_head list;
-    mfn_t mfn;
-    void __iomem *mapped;
-    DECLARE_BITMAP(ro_elems, PAGE_SIZE / MMIO_RO_SUBPAGE_GRAN);
-};
-
 static LIST_HEAD_RO_AFTER_INIT(subpage_ro_ranges);
 static DEFINE_SPINLOCK(subpage_ro_lock);
 
@@ -4929,7 +4922,7 @@  long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
     return rc;
 }
 
-static struct subpage_ro_range *subpage_mmio_find_page(mfn_t mfn)
+struct subpage_ro_range *subpage_mmio_find_page(mfn_t mfn)
 {
     struct subpage_ro_range *entry;
 
@@ -5074,7 +5067,7 @@  int __init subpage_mmio_ro_add(
     return rc;
 }
 
-static void __iomem *subpage_mmio_map_page(
+void __iomem *subpage_mmio_map_page(
     struct subpage_ro_range *entry)
 {
     void __iomem *mapped_page;
@@ -5099,7 +5092,7 @@  static void __iomem *subpage_mmio_map_page(
     return entry->mapped;
 }
 
-static void subpage_mmio_write_emulate(
+void subpage_mmio_write_emulate(
     mfn_t mfn,
     unsigned int offset,
     unsigned long data,
@@ -5133,30 +5126,6 @@  static void subpage_mmio_write_emulate(
     write_mmio(addr + offset, data, len);
 }
 
-#ifdef CONFIG_HVM
-bool subpage_mmio_write_accept(mfn_t mfn, unsigned long gla)
-{
-    unsigned int offset = PAGE_OFFSET(gla);
-    const struct subpage_ro_range *entry;
-
-    entry = subpage_mmio_find_page(mfn);
-    if ( !entry )
-        return false;
-
-    if ( !test_bit(offset / MMIO_RO_SUBPAGE_GRAN, entry->ro_elems) )
-    {
-        /*
-         * We don't know the write size at this point yet, so it could be
-         * an unaligned write, but accept it here anyway and deal with it
-         * later.
-         */
-        return true;
-    }
-
-    return false;
-}
-#endif
-
 int cf_check mmio_ro_emulated_write(
     enum x86_segment seg,
     unsigned long offset,