
[v5,01/15] IOMMU/x86: restrict IO-APIC mappings for PV Dom0

Message ID 1de2cc0a-e89c-6be9-9d6e-a10219f6f9aa@suse.com (mailing list archive)
State Superseded
Series IOMMU: superpage support when not sharing pagetables

Commit Message

Jan Beulich May 27, 2022, 11:12 a.m. UTC
While already the case for PVH, there's no reason to treat PV
differently here, though of course the addresses get taken from another
source in this case. Except that, to match CPU side mappings, by default
we permit r/o ones. This then also means we now deal consistently with
IO-APICs whose MMIO is or is not covered by E820 reserved regions.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: Extend to also cover e.g. HPET, which in turn means explicitly
    excluding PCI MMCFG ranges.
[integrated] v1: Integrate into series.
[standalone] v2: Keep IOMMU mappings in sync with CPU ones.
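
For readers skimming the thread: the core of the change is that hwdom_iommu_map() now returns a set of IOMMU permission flags rather than a bool, so a page whose CPU-side mapping is read-only for Dom0 also ends up read-only in the IOMMU. A distilled, simplified sketch of the new permission selection, extracted from the patch at the end of this page (all the other exclusion checks are omitted):

    /* Simplified extract of hwdom_iommu_map() from the patch below; the
     * Xen/LAPIC/IO-APIC/MMCFG exclusion checks are omitted here. */
    unsigned int perms = IOMMUF_readable | IOMMUF_writable;

    if ( is_pv_domain(d) &&
         iomem_access_permitted(d, pfn, pfn) &&
         rangeset_contains_singleton(mmio_ro_ranges, pfn) )
        perms = IOMMUF_readable;    /* mirror the CPU-side r/o mapping */

    /* The caller then picks p2m_access_r vs p2m_access_rw for translated
     * guests, or passes the flags straight to iommu_map() for PV. */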

Comments

Roger Pau Monné May 31, 2022, 2:40 p.m. UTC | #1
On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote:
> While already the case for PVH, there's no reason to treat PV
> differently here, though of course the addresses get taken from another
> source in this case. Except that, to match CPU side mappings, by default
> we permit r/o ones. This then also means we now deal consistently with
> IO-APICs whose MMIO is or is not covered by E820 reserved regions.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Just one comment below.

> ---
> v5: Extend to also cover e.g. HPET, which in turn means explicitly
>     excluding PCI MMCFG ranges.
> [integrated] v1: Integrate into series.
> [standalone] v2: Keep IOMMU mappings in sync with CPU ones.
> 
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -13,6 +13,7 @@
>   */
>  
>  #include <xen/sched.h>
> +#include <xen/iocap.h>
>  #include <xen/iommu.h>
>  #include <xen/paging.h>
>  #include <xen/guest_access.h>
> @@ -275,12 +276,12 @@ void iommu_identity_map_teardown(struct
>      }
>  }
>  
> -static bool __hwdom_init hwdom_iommu_map(const struct domain *d,
> -                                         unsigned long pfn,
> -                                         unsigned long max_pfn)
> +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
> +                                                 unsigned long pfn,
> +                                                 unsigned long max_pfn)
>  {
>      mfn_t mfn = _mfn(pfn);
> -    unsigned int i, type;
> +    unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable;
>  
>      /*
>       * Set up 1:1 mapping for dom0. Default to include only conventional RAM
> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map
>       * that fall in unusable ranges for PV Dom0.
>       */
>      if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
> -        return false;
> +        return 0;
>  
>      switch ( type = page_get_ram_type(mfn) )
>      {
>      case RAM_TYPE_UNUSABLE:
> -        return false;
> +        return 0;
>  
>      case RAM_TYPE_CONVENTIONAL:
>          if ( iommu_hwdom_strict )
> -            return false;
> +            return 0;
>          break;
>  
>      default:
>          if ( type & RAM_TYPE_RESERVED )
>          {
>              if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
> -                return false;
> +                perms = 0;
>          }
> -        else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn )
> -            return false;
> +        else if ( is_hvm_domain(d) )
> +            return 0;
> +        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
> +            perms = 0;
>      }
>  
>      /* Check that it doesn't overlap with the Interrupt Address Range. */
>      if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
> -        return false;
> +        return 0;
>      /* ... or the IO-APIC */
> -    for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ )
> -        if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
> -            return false;
> +    if ( has_vioapic(d) )
> +    {
> +        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
> +            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
> +                return 0;
> +    }
> +    else if ( is_pv_domain(d) )
> +    {
> +        /*
> +         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
> +         * ones there (also for e.g. HPET in certain cases), so it should also
> +         * have such established for IOMMUs.
> +         */
> +        if ( iomem_access_permitted(d, pfn, pfn) &&
> +             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
> +            perms = IOMMUF_readable;
> +    }
>      /*
>       * ... or the PCIe MCFG regions.
>       * TODO: runtime added MMCFG regions are not checked to make sure they
>       * don't overlap with already mapped regions, thus preventing trapping.
>       */
>      if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
> -        return false;
> +        return 0;
> +    else if ( is_pv_domain(d) )
> +    {
> +        /*
> +         * Don't extend consistency with CPU mappings to PCI MMCFG regions.
> +         * These shouldn't be accessed via DMA by devices.

Could you expand the comment a bit to explicitly mention the reason
why MMCFG regions shouldn't be accessible from device DMA operations?

Thanks, Roger.
Jan Beulich May 31, 2022, 3:40 p.m. UTC | #2
On 31.05.2022 16:40, Roger Pau Monné wrote:
> On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote:
>> While already the case for PVH, there's no reason to treat PV
>> differently here, though of course the addresses get taken from another
>> source in this case. Except that, to match CPU side mappings, by default
>> we permit r/o ones. This then also means we now deal consistently with
>> IO-APICs whose MMIO is or is not covered by E820 reserved regions.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

>> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map
>>       * that fall in unusable ranges for PV Dom0.
>>       */
>>      if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
>> -        return false;
>> +        return 0;
>>  
>>      switch ( type = page_get_ram_type(mfn) )
>>      {
>>      case RAM_TYPE_UNUSABLE:
>> -        return false;
>> +        return 0;
>>  
>>      case RAM_TYPE_CONVENTIONAL:
>>          if ( iommu_hwdom_strict )
>> -            return false;
>> +            return 0;
>>          break;
>>  
>>      default:
>>          if ( type & RAM_TYPE_RESERVED )
>>          {
>>              if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
>> -                return false;
>> +                perms = 0;
>>          }
>> -        else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn )
>> -            return false;
>> +        else if ( is_hvm_domain(d) )
>> +            return 0;
>> +        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
>> +            perms = 0;
>>      }
>>  
>>      /* Check that it doesn't overlap with the Interrupt Address Range. */
>>      if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
>> -        return false;
>> +        return 0;
>>      /* ... or the IO-APIC */
>> -    for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ )
>> -        if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
>> -            return false;
>> +    if ( has_vioapic(d) )
>> +    {
>> +        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
>> +            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
>> +                return 0;
>> +    }
>> +    else if ( is_pv_domain(d) )
>> +    {
>> +        /*
>> +         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
>> +         * ones there (also for e.g. HPET in certain cases), so it should also
>> +         * have such established for IOMMUs.
>> +         */
>> +        if ( iomem_access_permitted(d, pfn, pfn) &&
>> +             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
>> +            perms = IOMMUF_readable;
>> +    }
>>      /*
>>       * ... or the PCIe MCFG regions.

With this comment (which I leave alone) ...

>>       * TODO: runtime added MMCFG regions are not checked to make sure they
>>       * don't overlap with already mapped regions, thus preventing trapping.
>>       */
>>      if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
>> -        return false;
>> +        return 0;
>> +    else if ( is_pv_domain(d) )
>> +    {
>> +        /*
>> +         * Don't extend consistency with CPU mappings to PCI MMCFG regions.
>> +         * These shouldn't be accessed via DMA by devices.
> 
> Could you expand the comment a bit to explicitly mention the reason
> why MMCFG regions shouldn't be accessible from device DMA operations?

... it's hard to tell what I should write here. I'd expect extended
reasoning to go there (if anywhere). I'd be okay adjusting the earlier
comment, if only I knew what to write. "We don't want them to be
accessed that way" seems a little blunt. I could say "Devices have
other means to access PCI config space", but this not being said there
I took as being implied. Or else what was the reason to exclude these
for PVH Dom0?

Jan
Roger Pau Monné May 31, 2022, 4:15 p.m. UTC | #3
On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote:
> On 31.05.2022 16:40, Roger Pau Monné wrote:
> > On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote:
> >> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map
> >>       * that fall in unusable ranges for PV Dom0.
> >>       */
> >>      if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
> >> -        return false;
> >> +        return 0;
> >>  
> >>      switch ( type = page_get_ram_type(mfn) )
> >>      {
> >>      case RAM_TYPE_UNUSABLE:
> >> -        return false;
> >> +        return 0;
> >>  
> >>      case RAM_TYPE_CONVENTIONAL:
> >>          if ( iommu_hwdom_strict )
> >> -            return false;
> >> +            return 0;
> >>          break;
> >>  
> >>      default:
> >>          if ( type & RAM_TYPE_RESERVED )
> >>          {
> >>              if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
> >> -                return false;
> >> +                perms = 0;
> >>          }
> >> -        else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn )
> >> -            return false;
> >> +        else if ( is_hvm_domain(d) )
> >> +            return 0;
> >> +        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
> >> +            perms = 0;
> >>      }
> >>  
> >>      /* Check that it doesn't overlap with the Interrupt Address Range. */
> >>      if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
> >> -        return false;
> >> +        return 0;
> >>      /* ... or the IO-APIC */
> >> -    for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ )
> >> -        if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
> >> -            return false;
> >> +    if ( has_vioapic(d) )
> >> +    {
> >> +        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
> >> +            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
> >> +                return 0;
> >> +    }
> >> +    else if ( is_pv_domain(d) )
> >> +    {
> >> +        /*
> >> +         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
> >> +         * ones there (also for e.g. HPET in certain cases), so it should also
> >> +         * have such established for IOMMUs.
> >> +         */
> >> +        if ( iomem_access_permitted(d, pfn, pfn) &&
> >> +             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
> >> +            perms = IOMMUF_readable;
> >> +    }
> >>      /*
> >>       * ... or the PCIe MCFG regions.
> 
> With this comment (which I leave alone) ...
> 
> >>       * TODO: runtime added MMCFG regions are not checked to make sure they
> >>       * don't overlap with already mapped regions, thus preventing trapping.
> >>       */
> >>      if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
> >> -        return false;
> >> +        return 0;
> >> +    else if ( is_pv_domain(d) )
> >> +    {
> >> +        /*
> >> +         * Don't extend consistency with CPU mappings to PCI MMCFG regions.
> >> +         * These shouldn't be accessed via DMA by devices.
> > 
> > Could you expand the comment a bit to explicitly mention the reason
> > why MMCFG regions shouldn't be accessible from device DMA operations?
> 
> ... it's hard to tell what I should write here. I'd expect extended
> reasoning to go there (if anywhere). I'd be okay adjusting the earlier
> comment, if only I knew what to write. "We don't want them to be
> accessed that way" seems a little blunt. I could say "Devices have
> other means to access PCI config space", but this not being said there
> I took as being implied.

But we could likely say the same about IO-APIC or HPET MMIO regions.
I don't think we expect them to be accessed by devices, yet we provide
them for coherency with CPU side mappings in the PV case.

> Or else what was the reason to exclude these
> for PVH Dom0?

The reason for PVH is because the config space is (partially) emulated
for the hardware domain, so we don't allow untrapped access by the CPU
either.

Thanks, Roger.
Jan Beulich June 1, 2022, 7:10 a.m. UTC | #4
On 31.05.2022 18:15, Roger Pau Monné wrote:
> On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote:
>> On 31.05.2022 16:40, Roger Pau Monné wrote:
>>> On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote:
>>>> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map
>>>>       * that fall in unusable ranges for PV Dom0.
>>>>       */
>>>>      if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
>>>> -        return false;
>>>> +        return 0;
>>>>  
>>>>      switch ( type = page_get_ram_type(mfn) )
>>>>      {
>>>>      case RAM_TYPE_UNUSABLE:
>>>> -        return false;
>>>> +        return 0;
>>>>  
>>>>      case RAM_TYPE_CONVENTIONAL:
>>>>          if ( iommu_hwdom_strict )
>>>> -            return false;
>>>> +            return 0;
>>>>          break;
>>>>  
>>>>      default:
>>>>          if ( type & RAM_TYPE_RESERVED )
>>>>          {
>>>>              if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
>>>> -                return false;
>>>> +                perms = 0;
>>>>          }
>>>> -        else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn )
>>>> -            return false;
>>>> +        else if ( is_hvm_domain(d) )
>>>> +            return 0;
>>>> +        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
>>>> +            perms = 0;
>>>>      }
>>>>  
>>>>      /* Check that it doesn't overlap with the Interrupt Address Range. */
>>>>      if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
>>>> -        return false;
>>>> +        return 0;
>>>>      /* ... or the IO-APIC */
>>>> -    for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ )
>>>> -        if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
>>>> -            return false;
>>>> +    if ( has_vioapic(d) )
>>>> +    {
>>>> +        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
>>>> +            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
>>>> +                return 0;
>>>> +    }
>>>> +    else if ( is_pv_domain(d) )
>>>> +    {
>>>> +        /*
>>>> +         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
>>>> +         * ones there (also for e.g. HPET in certain cases), so it should also
>>>> +         * have such established for IOMMUs.
>>>> +         */
>>>> +        if ( iomem_access_permitted(d, pfn, pfn) &&
>>>> +             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
>>>> +            perms = IOMMUF_readable;
>>>> +    }
>>>>      /*
>>>>       * ... or the PCIe MCFG regions.
>>
>> With this comment (which I leave alone) ...
>>
>>>>       * TODO: runtime added MMCFG regions are not checked to make sure they
>>>>       * don't overlap with already mapped regions, thus preventing trapping.
>>>>       */
>>>>      if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
>>>> -        return false;
>>>> +        return 0;
>>>> +    else if ( is_pv_domain(d) )
>>>> +    {
>>>> +        /*
>>>> +         * Don't extend consistency with CPU mappings to PCI MMCFG regions.
>>>> +         * These shouldn't be accessed via DMA by devices.
>>>
>>> Could you expand the comment a bit to explicitly mention the reason
>>> why MMCFG regions shouldn't be accessible from device DMA operations?
>>
>> ... it's hard to tell what I should write here. I'd expect extended
>> reasoning to go there (if anywhere). I'd be okay adjusting the earlier
>> comment, if only I knew what to write. "We don't want them to be
>> accessed that way" seems a little blunt. I could say "Devices have
>> other means to access PCI config space", but this not being said there
>> I took as being implied.
> 
> But we could likely say the same about IO-APIC or HPET MMIO regions.
> I don't think we expect them to be accessed by devices, yet we provide
> them for coherency with CPU side mappings in the PV case.

As to "say the same" - yes for the first part of my earlier reply, but
no for the latter part.

>> Or else what was the reason to exclude these
>> for PVH Dom0?
> 
> The reason for PVH is because the config space is (partially) emulated
> for the hardware domain, so we don't allow untrapped access by the CPU
> either.

Hmm, right - there's read emulation there as well, while for PV we
only intercept writes.

So overall should we perhaps permit r/o access to MMCFG for PV? Of
course that would only end up consistent once we adjust mappings
dynamically when MMCFG ranges are put in use (IOW if we can't verify
an MMCFG range is suitably reserved, we'd not find it in
mmio_ro_ranges just yet, and hence we still wouldn't have an IOMMU
side mapping even if CPU side mappings are permitted). But for the
patch here it would simply mean dropping some of the code I did add
for v5.

Otherwise, i.e. if the code is to remain as is, I'm afraid I still
wouldn't see what to put usefully in the comment.

Jan
Roger Pau Monné June 1, 2022, 8:17 a.m. UTC | #5
On Wed, Jun 01, 2022 at 09:10:09AM +0200, Jan Beulich wrote:
> On 31.05.2022 18:15, Roger Pau Monné wrote:
> > On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote:
> >> On 31.05.2022 16:40, Roger Pau Monné wrote:
> >>> On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote:
> >>>> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map
> >>>>       * that fall in unusable ranges for PV Dom0.
> >>>>       */
> >>>>      if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
> >>>> -        return false;
> >>>> +        return 0;
> >>>>  
> >>>>      switch ( type = page_get_ram_type(mfn) )
> >>>>      {
> >>>>      case RAM_TYPE_UNUSABLE:
> >>>> -        return false;
> >>>> +        return 0;
> >>>>  
> >>>>      case RAM_TYPE_CONVENTIONAL:
> >>>>          if ( iommu_hwdom_strict )
> >>>> -            return false;
> >>>> +            return 0;
> >>>>          break;
> >>>>  
> >>>>      default:
> >>>>          if ( type & RAM_TYPE_RESERVED )
> >>>>          {
> >>>>              if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
> >>>> -                return false;
> >>>> +                perms = 0;
> >>>>          }
> >>>> -        else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn )
> >>>> -            return false;
> >>>> +        else if ( is_hvm_domain(d) )
> >>>> +            return 0;
> >>>> +        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
> >>>> +            perms = 0;
> >>>>      }
> >>>>  
> >>>>      /* Check that it doesn't overlap with the Interrupt Address Range. */
> >>>>      if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
> >>>> -        return false;
> >>>> +        return 0;
> >>>>      /* ... or the IO-APIC */
> >>>> -    for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ )
> >>>> -        if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
> >>>> -            return false;
> >>>> +    if ( has_vioapic(d) )
> >>>> +    {
> >>>> +        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
> >>>> +            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
> >>>> +                return 0;
> >>>> +    }
> >>>> +    else if ( is_pv_domain(d) )
> >>>> +    {
> >>>> +        /*
> >>>> +         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
> >>>> +         * ones there (also for e.g. HPET in certain cases), so it should also
> >>>> +         * have such established for IOMMUs.
> >>>> +         */
> >>>> +        if ( iomem_access_permitted(d, pfn, pfn) &&
> >>>> +             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
> >>>> +            perms = IOMMUF_readable;
> >>>> +    }
> >>>>      /*
> >>>>       * ... or the PCIe MCFG regions.
> >>
> >> With this comment (which I leave alone) ...
> >>
> >>>>       * TODO: runtime added MMCFG regions are not checked to make sure they
> >>>>       * don't overlap with already mapped regions, thus preventing trapping.
> >>>>       */
> >>>>      if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
> >>>> -        return false;
> >>>> +        return 0;
> >>>> +    else if ( is_pv_domain(d) )
> >>>> +    {
> >>>> +        /*
> >>>> +         * Don't extend consistency with CPU mappings to PCI MMCFG regions.
> >>>> +         * These shouldn't be accessed via DMA by devices.
> >>>
> >>> Could you expand the comment a bit to explicitly mention the reason
> >>> why MMCFG regions shouldn't be accessible from device DMA operations?
> >>
> >> ... it's hard to tell what I should write here. I'd expect extended
> >> reasoning to go there (if anywhere). I'd be okay adjusting the earlier
> >> comment, if only I knew what to write. "We don't want them to be
> >> accessed that way" seems a little blunt. I could say "Devices have
> >> other means to access PCI config space", but this not being said there
> >> I took as being implied.
> > 
> > But we could likely say the same about IO-APIC or HPET MMIO regions.
> > I don't think we expect them to be accessed by devices, yet we provide
> > them for coherency with CPU side mappings in the PV case.
> 
> As to "say the same" - yes for the first part of my earlier reply, but
> no for the latter part.

Yes, obviously devices cannot access the HPET or the IO-APIC MMIO from
the PCI config space :).

> >> Or else what was the reason to exclude these
> >> for PVH Dom0?
> > 
> > The reason for PVH is because the config space is (partially) emulated
> > for the hardware domain, so we don't allow untrapped access by the CPU
> > either.
> 
> Hmm, right - there's read emulation there as well, while for PV we
> only intercept writes.
> 
> So overall should we perhaps permit r/o access to MMCFG for PV? Of
> course that would only end up consistent once we adjust mappings
> dynamically when MMCFG ranges are put in use (IOW if we can't verify
> an MMCFG range is suitably reserved, we'd not find it in
> mmio_ro_ranges just yet, and hence we still wouldn't have an IOMMU
> side mapping even if CPU side mappings are permitted). But for the
> patch here it would simply mean dropping some of the code I did add
> for v5.

I would be OK with that, as I think we would then be consistent with
how IO-APIC and HPET MMIO regions are handled.  We would have to add
some small helper/handling in PHYSDEVOP_pci_mmcfg_reserved for PV.

> Otherwise, i.e. if the code is to remain as is, I'm afraid I still
> wouldn't see what to put usefully in the comment.

IMO the important part is to note whether there's a reason or not why
the handling of IO-APIC, HPET vs MMCFG RO regions differs in PV mode.
Ie: if we don't want to handle MMCFG in RO mode for device mappings
because of the complication with handling dynamic changes as a result
of PHYSDEVOP_pci_mmcfg_reserved we should just note it.

Thanks, Roger.
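
For context, the helper alluded to above would, once PHYSDEVOP_pci_mmcfg_reserved confirms an MMCFG range as suitably reserved, add its pages to mmio_ro_ranges and establish r/o IOMMU mappings for a PV hardware domain. A rough sketch of that idea (illustrative only; the helper name, hook point, and flush handling are assumptions, not code from this series):

    /* Hypothetical helper, not part of the posted patch: map a verified
     * MMCFG range r/o for a PV hardware domain, mirroring mmio_ro_ranges. */
    static int pv_hwdom_mmcfg_ro(struct domain *d, paddr_t addr,
                                 unsigned int start_bus, unsigned int end_bus)
    {
        unsigned long first = PFN_DOWN(addr) + PCI_BDF(start_bus, 0, 0);
        unsigned long last = PFN_DOWN(addr) + PCI_BDF(end_bus, ~0, ~0);
        unsigned int flush_flags = 0;
        int rc = rangeset_add_range(mmio_ro_ranges, first, last);

        if ( rc || paging_mode_translate(d) )
            return rc;

        /* PV only; a real implementation would also flush the IOTLB. */
        return iommu_map(d, _dfn(first), _mfn(first), last - first + 1,
                         IOMMUF_readable, &flush_flags);
    }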
Jan Beulich June 1, 2022, 3:10 p.m. UTC | #6
On 01.06.2022 10:17, Roger Pau Monné wrote:
> On Wed, Jun 01, 2022 at 09:10:09AM +0200, Jan Beulich wrote:
>> On 31.05.2022 18:15, Roger Pau Monné wrote:
>>> On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote:
>>>> On 31.05.2022 16:40, Roger Pau Monné wrote:
>>>>> On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote:
>>>>>> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map
>>>>>>       * that fall in unusable ranges for PV Dom0.
>>>>>>       */
>>>>>>      if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
>>>>>> -        return false;
>>>>>> +        return 0;
>>>>>>  
>>>>>>      switch ( type = page_get_ram_type(mfn) )
>>>>>>      {
>>>>>>      case RAM_TYPE_UNUSABLE:
>>>>>> -        return false;
>>>>>> +        return 0;
>>>>>>  
>>>>>>      case RAM_TYPE_CONVENTIONAL:
>>>>>>          if ( iommu_hwdom_strict )
>>>>>> -            return false;
>>>>>> +            return 0;
>>>>>>          break;
>>>>>>  
>>>>>>      default:
>>>>>>          if ( type & RAM_TYPE_RESERVED )
>>>>>>          {
>>>>>>              if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
>>>>>> -                return false;
>>>>>> +                perms = 0;
>>>>>>          }
>>>>>> -        else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn )
>>>>>> -            return false;
>>>>>> +        else if ( is_hvm_domain(d) )
>>>>>> +            return 0;
>>>>>> +        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
>>>>>> +            perms = 0;
>>>>>>      }
>>>>>>  
>>>>>>      /* Check that it doesn't overlap with the Interrupt Address Range. */
>>>>>>      if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
>>>>>> -        return false;
>>>>>> +        return 0;
>>>>>>      /* ... or the IO-APIC */
>>>>>> -    for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ )
>>>>>> -        if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
>>>>>> -            return false;
>>>>>> +    if ( has_vioapic(d) )
>>>>>> +    {
>>>>>> +        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
>>>>>> +            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
>>>>>> +                return 0;
>>>>>> +    }
>>>>>> +    else if ( is_pv_domain(d) )
>>>>>> +    {
>>>>>> +        /*
>>>>>> +         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
>>>>>> +         * ones there (also for e.g. HPET in certain cases), so it should also
>>>>>> +         * have such established for IOMMUs.
>>>>>> +         */
>>>>>> +        if ( iomem_access_permitted(d, pfn, pfn) &&
>>>>>> +             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
>>>>>> +            perms = IOMMUF_readable;
>>>>>> +    }
>>>>>>      /*
>>>>>>       * ... or the PCIe MCFG regions.
>>>>
>>>> With this comment (which I leave alone) ...
>>>>
>>>>>>       * TODO: runtime added MMCFG regions are not checked to make sure they
>>>>>>       * don't overlap with already mapped regions, thus preventing trapping.
>>>>>>       */
>>>>>>      if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
>>>>>> -        return false;
>>>>>> +        return 0;
>>>>>> +    else if ( is_pv_domain(d) )
>>>>>> +    {
>>>>>> +        /*
>>>>>> +         * Don't extend consistency with CPU mappings to PCI MMCFG regions.
>>>>>> +         * These shouldn't be accessed via DMA by devices.
>>>>>
>>>>> Could you expand the comment a bit to explicitly mention the reason
>>>>> why MMCFG regions shouldn't be accessible from device DMA operations?
>>>>
>>>> ... it's hard to tell what I should write here. I'd expect extended
>>>> reasoning to go there (if anywhere). I'd be okay adjusting the earlier
>>>> comment, if only I knew what to write. "We don't want them to be
>>>> accessed that way" seems a little blunt. I could say "Devices have
>>>> other means to access PCI config space", but this not being said there
>>>> I took as being implied.
>>>
>>> But we could likely say the same about IO-APIC or HPET MMIO regions.
>>> I don't think we expect them to be accessed by devices, yet we provide
>>> them for coherency with CPU side mappings in the PV case.
>>
>> As to "say the same" - yes for the first part of my earlier reply, but
>> no for the latter part.
> 
> Yes, obviously devices cannot access the HPET or the IO-APIC MMIO from
> the PCI config space :).
> 
>>>> Or else what was the reason to exclude these
>>>> for PVH Dom0?
>>>
>>> The reason for PVH is because the config space is (partially) emulated
>>> for the hardware domain, so we don't allow untrapped access by the CPU
>>> either.
>>
>> Hmm, right - there's read emulation there as well, while for PV we
>> only intercept writes.
>>
>> So overall should we perhaps permit r/o access to MMCFG for PV? Of
>> course that would only end up consistent once we adjust mappings
>> dynamically when MMCFG ranges are put in use (IOW if we can't verify
>> an MMCFG range is suitably reserved, we'd not find it in
>> mmio_ro_ranges just yet, and hence we still wouldn't have an IOMMU
>> side mapping even if CPU side mappings are permitted). But for the
>> patch here it would simply mean dropping some of the code I did add
>> for v5.
> 
> I would be OK with that, as I think we would then be consistent with
> how IO-APIC and HPET MMIO regions are handled.  We would have to add
> some small helper/handling in PHYSDEVOP_pci_mmcfg_reserved for PV.

Okay, I'll drop that code again then. But I'm not going to look into
making the dynamic part work, at least not within this series.

Jan
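
One detail worth spelling out before the final patch below: the MMCFG check relies on the ECAM layout giving each (bus, device, function) its own 4KiB config page, so PCI_BDF(bus, dev, fn) doubles as a page offset from the start of the region (256 pages per bus). A small worked example of the page-range arithmetic (the base address and bus numbers are made up for illustration):

    /* ECAM: 32 devices x 8 functions = 256 config pages per bus. */
    paddr_t base = 0xe0000000;                   /* example MMCFG base */
    unsigned int start_bus = 0, end_bus = 0xff;  /* example bus range  */

    unsigned long first_pfn = PFN_DOWN(base) + PCI_BDF(start_bus, 0, 0);
    /* 0xe0000 + 0x0000 = 0xe0000 */
    unsigned long last_pfn = PFN_DOWN(base) + PCI_BDF(end_bus, ~0, ~0);
    /* 0xe0000 + 0xffff = 0xeffff, i.e. 64k pages = 256MiB of config space */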

Patch

--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -13,6 +13,7 @@ 
  */
 
 #include <xen/sched.h>
+#include <xen/iocap.h>
 #include <xen/iommu.h>
 #include <xen/paging.h>
 #include <xen/guest_access.h>
@@ -275,12 +276,12 @@  void iommu_identity_map_teardown(struct
     }
 }
 
-static bool __hwdom_init hwdom_iommu_map(const struct domain *d,
-                                         unsigned long pfn,
-                                         unsigned long max_pfn)
+static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
+                                                 unsigned long pfn,
+                                                 unsigned long max_pfn)
 {
     mfn_t mfn = _mfn(pfn);
-    unsigned int i, type;
+    unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable;
 
     /*
      * Set up 1:1 mapping for dom0. Default to include only conventional RAM
@@ -289,44 +290,75 @@  static bool __hwdom_init hwdom_iommu_map
      * that fall in unusable ranges for PV Dom0.
      */
     if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
-        return false;
+        return 0;
 
     switch ( type = page_get_ram_type(mfn) )
     {
     case RAM_TYPE_UNUSABLE:
-        return false;
+        return 0;
 
     case RAM_TYPE_CONVENTIONAL:
         if ( iommu_hwdom_strict )
-            return false;
+            return 0;
         break;
 
     default:
         if ( type & RAM_TYPE_RESERVED )
         {
             if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
-                return false;
+                perms = 0;
         }
-        else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn )
-            return false;
+        else if ( is_hvm_domain(d) )
+            return 0;
+        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
+            perms = 0;
     }
 
     /* Check that it doesn't overlap with the Interrupt Address Range. */
     if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
-        return false;
+        return 0;
     /* ... or the IO-APIC */
-    for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ )
-        if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
-            return false;
+    if ( has_vioapic(d) )
+    {
+        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
+            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
+                return 0;
+    }
+    else if ( is_pv_domain(d) )
+    {
+        /*
+         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
+         * ones there (also for e.g. HPET in certain cases), so it should also
+         * have such established for IOMMUs.
+         */
+        if ( iomem_access_permitted(d, pfn, pfn) &&
+             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
+            perms = IOMMUF_readable;
+    }
     /*
      * ... or the PCIe MCFG regions.
      * TODO: runtime added MMCFG regions are not checked to make sure they
      * don't overlap with already mapped regions, thus preventing trapping.
      */
     if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
-        return false;
+        return 0;
+    else if ( is_pv_domain(d) )
+    {
+        /*
+         * Don't extend consistency with CPU mappings to PCI MMCFG regions.
+         * These shouldn't be accessed via DMA by devices.
+         */
+        const struct acpi_mcfg_allocation *cfg = pci_mmcfg_config;
+
+        for ( i = 0; i < pci_mmcfg_config_num; ++i, ++cfg )
+            if ( pfn >= PFN_DOWN(cfg->address) + PCI_BDF(cfg->start_bus_number,
+                                                         0, 0) &&
+                 pfn <= PFN_DOWN(cfg->address) + PCI_BDF(cfg->end_bus_number,
+                                                         ~0, ~0))
+                return 0;
+    }
 
-    return true;
+    return perms;
 }
 
 void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
@@ -368,15 +400,19 @@  void __hwdom_init arch_iommu_hwdom_init(
     for ( ; i < top; i++ )
     {
         unsigned long pfn = pdx_to_pfn(i);
+        unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
         int rc;
 
-        if ( !hwdom_iommu_map(d, pfn, max_pfn) )
+        if ( !perms )
             rc = 0;
         else if ( paging_mode_translate(d) )
-            rc = p2m_add_identity_entry(d, pfn, p2m_access_rw, 0);
+            rc = p2m_add_identity_entry(d, pfn,
+                                        perms & IOMMUF_writable ? p2m_access_rw
+                                                                : p2m_access_r,
+                                        0);
         else
             rc = iommu_map(d, _dfn(pfn), _mfn(pfn), 1ul << PAGE_ORDER_4K,
-                           IOMMUF_readable | IOMMUF_writable, &flush_flags);
+                           perms, &flush_flags);
 
         if ( rc )
             printk(XENLOG_WARNING "%pd: identity %smapping of %lx failed: %d\n",