
[v3,1/2] IOMMU/x86: disallow device assignment to PoD guests

Message ID b0a77526-17f2-a5ab-6f7f-1b3caeb4a59b@suse.com (mailing list archive)
State New, archived
Series x86/mm: get/set PoD target related adjustments

Commit Message

Jan Beulich Jan. 4, 2022, 9:41 a.m. UTC
While it is okay for IOMMU page tables to get set up for guests starting
in PoD mode, actual device assignment may only occur once all PoD
entries have been removed from the P2M. So far this was enforced only
for boot-time assignment, and only in the tool stack.

Also use the new function to replace p2m_pod_entry_count(): Its unlocked
access to p2m->pod.entry_count wasn't really okay (irrespective of the
result being stale by the time the caller gets to see it).

To allow the tool stack to see a consistent snapshot of PoD state, move
the tail of XENMEM_{get,set}_pod_target handling into a function, adding
proper locking there.

In libxl take the liberty to use the new local variable r also for a
pre-existing call into libxc.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
If p2m->pod.entry_count == p2m->pod.count it is in principle possible to
permit device assignment by actively resolving all remaining PoD entries.

Initially I thought this was introduced by f89f555827a6 ("remove late
(on-demand) construction of IOMMU page tables"), but without
arch_iommu_use_permitted() checking for PoD I think the issue has been
there before that.
---
v3: In p2m_pod_set_mem_target() move check down.
v2: New.

Comments

Jan Beulich Jan. 27, 2022, 3:02 p.m. UTC | #1
On 04.01.2022 10:41, Jan Beulich wrote:
> While it is okay for IOMMU page tables to get set up for guests starting
> in PoD mode, actual device assignment may only occur once all PoD
> entries have been removed from the P2M. So far this was enforced only
> for boot-time assignment, and only in the tool stack.
> 
> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
> access to p2m->pod.entry_count wasn't really okay (irrespective of the
> result being stale by the time the caller gets to see it).
> 
> To allow the tool stack to see a consistent snapshot of PoD state, move
> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
> proper locking there.
> 
> In libxl take the liberty to use the new local variable r also for a
> pre-existing call into libxc.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Ping? (Anthony, I realize you weren't Cc-ed on the original submission.)

Jan

> ---
> If p2m->pod.entry_count == p2m->pod.count it is in principle possible to
> permit device assignment by actively resolving all remaining PoD entries.
> 
> Initially I thought this was introduced by f89f555827a6 ("remove late
> (on-demand) construction of IOMMU page tables"), but without
> arch_iommu_use_permitted() checking for PoD I think the issue has been
> there before that.
> ---
> v3: In p2m_pod_set_mem_target() move check down.
> v2: New.
> 
> --- a/tools/libs/light/libxl_pci.c
> +++ b/tools/libs/light/libxl_pci.c
> @@ -1619,8 +1619,13 @@ void libxl__device_pci_add(libxl__egc *e
>      pas->callback = device_pci_add_stubdom_done;
>  
>      if (libxl__domain_type(gc, domid) == LIBXL_DOMAIN_TYPE_HVM) {
> -        rc = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
> -        if (rc) {
> +        int r;
> +        uint64_t cache, ents;
> +
> +        rc = ERROR_FAIL;
> +
> +        r = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
> +        if (r) {
>              LOGD(ERROR, domid,
>                   "PCI device %04x:%02x:%02x.%u %s?",
>                   pci->domain, pci->bus, pci->dev, pci->func,
> @@ -1628,6 +1633,22 @@ void libxl__device_pci_add(libxl__egc *e
>                   : "already assigned to a different guest");
>              goto out;
>          }
> +
> +        r = xc_domain_get_pod_target(ctx->xch, domid, NULL, &cache, &ents);
> +        if (r) {
> +            LOGED(ERROR, domid, "Cannot determine PoD status");
> +            goto out;
> +        }
> +        /*
> +         * In principle it is sufficient for the domain to have ballooned down
> +         * enough such that ents <= cache.  But any remaining entries would
> +         * need resolving first.  Until such time when this gets effected,
> +         * refuse assignment as long as any entries are left.
> +         */
> +        if (ents /* > cache */) {
> +            LOGD(ERROR, domid, "Cannot assign device with PoD still active");
> +            goto out;
> +        }
>      }
>  
>      rc = libxl__device_pci_setdefault(gc, domid, pci, !starting);
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -20,6 +20,7 @@
>   */
>  
>  #include <xen/event.h>
> +#include <xen/iocap.h>
>  #include <xen/ioreq.h>
>  #include <xen/mm.h>
>  #include <xen/sched.h>
> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>  
>      ASSERT( pod_target >= p2m->pod.count );
>  
> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> +        ret = -ENOTEMPTY;
> +    else
> +        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>  
>  out:
>      pod_unlock(p2m);
> @@ -367,6 +371,23 @@ out:
>      return ret;
>  }
>  
> +void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +
> +    ASSERT(is_hvm_domain(d));
> +
> +    pod_lock(p2m);
> +    lock_page_alloc(p2m);
> +
> +    target->tot_pages       = domain_tot_pages(d);
> +    target->pod_cache_pages = p2m->pod.count;
> +    target->pod_entries     = p2m->pod.entry_count;
> +
> +    unlock_page_alloc(p2m);
> +    pod_unlock(p2m);
> +}
> +
>  int p2m_pod_empty_cache(struct domain *d)
>  {
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
> @@ -1384,6 +1405,9 @@ guest_physmap_mark_populate_on_demand(st
>      if ( !paging_mode_translate(d) )
>          return -EINVAL;
>  
> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> +        return -ENOTEMPTY;
> +
>      do {
>          rc = mark_populate_on_demand(d, gfn, chunk_order);
>  
> @@ -1405,3 +1429,20 @@ void p2m_pod_init(struct p2m_domain *p2m
>      for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
>          p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN);
>  }
> +
> +bool p2m_pod_active(const struct domain *d)
> +{
> +    struct p2m_domain *p2m;
> +    bool res;
> +
> +    if ( !is_hvm_domain(d) )
> +        return false;
> +
> +    p2m = p2m_get_hostp2m(d);
> +
> +    pod_lock(p2m);
> +    res = p2m->pod.entry_count | p2m->pod.count;
> +    pod_unlock(p2m);
> +
> +    return res;
> +}
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -4778,7 +4778,6 @@ long arch_memory_op(unsigned long cmd, X
>      {
>          xen_pod_target_t target;
>          struct domain *d;
> -        struct p2m_domain *p2m;
>  
>          if ( copy_from_guest(&target, arg, 1) )
>              return -EFAULT;
> @@ -4787,7 +4786,9 @@ long arch_memory_op(unsigned long cmd, X
>          if ( d == NULL )
>              return -ESRCH;
>  
> -        if ( cmd == XENMEM_set_pod_target )
> +        if ( !is_hvm_domain(d) )
> +            rc = -EINVAL;
> +        else if ( cmd == XENMEM_set_pod_target )
>              rc = xsm_set_pod_target(XSM_PRIV, d);
>          else
>              rc = xsm_get_pod_target(XSM_PRIV, d);
> @@ -4813,10 +4814,7 @@ long arch_memory_op(unsigned long cmd, X
>          }
>          else if ( rc >= 0 )
>          {
> -            p2m = p2m_get_hostp2m(d);
> -            target.tot_pages       = domain_tot_pages(d);
> -            target.pod_cache_pages = p2m->pod.count;
> -            target.pod_entries     = p2m->pod.entry_count;
> +            p2m_pod_get_mem_target(d, &target);
>  
>              if ( __copy_to_guest(arg, &target, 1) )
>              {
> --- a/xen/common/vm_event.c
> +++ b/xen/common/vm_event.c
> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
>  
>              rc = -EXDEV;
>              /* Disallow paging in a PoD guest */
> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
> +            if ( p2m_pod_active(d) )
>                  break;
>  
>              /* domain_pause() not required here, see XSA-99 */
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -454,11 +454,12 @@ bool arch_iommu_use_permitted(const stru
>  {
>      /*
>       * Prevent device assign if mem paging, mem sharing or log-dirty
> -     * have been enabled for this domain.
> +     * have been enabled for this domain, or if PoD is still in active use.
>       */
>      return d == dom_io ||
>             (likely(!mem_sharing_enabled(d)) &&
>              likely(!mem_paging_enabled(d)) &&
> +            likely(!p2m_pod_active(d)) &&
>              likely(!p2m_get_hostp2m(d)->global_logdirty));
>  }
>  
> --- a/xen/arch/x86/include/asm/p2m.h
> +++ b/xen/arch/x86/include/asm/p2m.h
> @@ -661,6 +661,12 @@ int p2m_pod_empty_cache(struct domain *d
>   * domain matches target */
>  int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
>  
> +/* Obtain a consistent snapshot of PoD related domain state. */
> +void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target);
> +
> +/* Check whether PoD is (still) active in a domain. */
> +bool p2m_pod_active(const struct domain *d);
> +
>  /* Scan pod cache when offline/broken page triggered */
>  int
>  p2m_pod_offline_or_broken_hit(struct page_info *p);
> @@ -669,11 +675,6 @@ p2m_pod_offline_or_broken_hit(struct pag
>  void
>  p2m_pod_offline_or_broken_replace(struct page_info *p);
>  
> -static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
> -{
> -    return p2m->pod.entry_count;
> -}
> -
>  void p2m_pod_init(struct p2m_domain *p2m);
>  
>  #else
> @@ -689,6 +690,11 @@ static inline int p2m_pod_empty_cache(st
>      return 0;
>  }
>  
> +static inline bool p2m_pod_active(const struct domain *d)
> +{
> +    return false;
> +}
> +
>  static inline int p2m_pod_offline_or_broken_hit(struct page_info *p)
>  {
>      return 0;
> @@ -699,11 +705,6 @@ static inline void p2m_pod_offline_or_br
>      ASSERT_UNREACHABLE();
>  }
>  
> -static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
> -{
> -    return 0;
> -}
> -
>  static inline void p2m_pod_init(struct p2m_domain *p2m) {}
>  
>  #endif
> 
>
Anthony PERARD Jan. 28, 2022, 4:07 p.m. UTC | #2
On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> While it is okay for IOMMU page tables to get set up for guests starting
> in PoD mode, actual device assignment may only occur once all PoD
> entries have been removed from the P2M. So far this was enforced only
> for boot-time assignment, and only in the tool stack.
> 
> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
> access to p2m->pod.entry_count wasn't really okay (irrespective of the
> result being stale by the time the caller gets to see it).
> 
> To allow the tool stack to see a consistent snapshot of PoD state, move
> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
> proper locking there.
> 
> In libxl take the liberty to use the new local variable r also for a
> pre-existing call into libxc.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Anthony PERARD <anthony.perard@citrix.com>

Thanks,
Roger Pau Monné Feb. 2, 2022, 4:13 p.m. UTC | #3
On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> While it is okay for IOMMU page tables to get set up for guests starting
> in PoD mode, actual device assignment may only occur once all PoD
> entries have been removed from the P2M. So far this was enforced only
> for boot-time assignment, and only in the tool stack.
> 
> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
> access to p2m->pod.entry_count wasn't really okay (irrespective of the
> result being stale by the time the caller gets to see it).
> 
> To allow the tool stack to see a consistent snapshot of PoD state, move
> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
> proper locking there.
> 
> In libxl take the liberty to use the new local variable r also for a
> pre-existing call into libxc.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> If p2m->pod.entry_count == p2m->pod.count it is in principle possible to
> permit device assignment by actively resolving all remaining PoD entries.
> 
> Initially I thought this was introduced by f89f555827a6 ("remove late
> (on-demand) construction of IOMMU page tables"), but without
> arch_iommu_use_permitted() checking for PoD I think the issue has been
> there before that.
> ---
> v3: In p2m_pod_set_mem_target() move check down.
> v2: New.
> 
> --- a/tools/libs/light/libxl_pci.c
> +++ b/tools/libs/light/libxl_pci.c
> @@ -1619,8 +1619,13 @@ void libxl__device_pci_add(libxl__egc *e
>      pas->callback = device_pci_add_stubdom_done;
>  
>      if (libxl__domain_type(gc, domid) == LIBXL_DOMAIN_TYPE_HVM) {
> -        rc = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
> -        if (rc) {
> +        int r;
> +        uint64_t cache, ents;
> +
> +        rc = ERROR_FAIL;
> +
> +        r = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
> +        if (r) {
>              LOGD(ERROR, domid,
>                   "PCI device %04x:%02x:%02x.%u %s?",
>                   pci->domain, pci->bus, pci->dev, pci->func,
> @@ -1628,6 +1633,22 @@ void libxl__device_pci_add(libxl__egc *e
>                   : "already assigned to a different guest");
>              goto out;
>          }
> +
> +        r = xc_domain_get_pod_target(ctx->xch, domid, NULL, &cache, &ents);
> +        if (r) {
> +            LOGED(ERROR, domid, "Cannot determine PoD status");
> +            goto out;
> +        }
> +        /*
> +         * In principle it is sufficient for the domain to have ballooned down
> +         * enough such that ents <= cache.  But any remaining entries would
> +         * need resolving first.  Until such time when this gets effected,
> +         * refuse assignment as long as any entries are left.
> +         */
> +        if (ents /* > cache */) {
> +            LOGD(ERROR, domid, "Cannot assign device with PoD still active");
> +            goto out;
> +        }
>      }
>  
>      rc = libxl__device_pci_setdefault(gc, domid, pci, !starting);
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -20,6 +20,7 @@
>   */
>  
>  #include <xen/event.h>
> +#include <xen/iocap.h>
>  #include <xen/ioreq.h>
>  #include <xen/mm.h>
>  #include <xen/sched.h>
> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>  
>      ASSERT( pod_target >= p2m->pod.count );
>  
> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )

Is it possible to have cache flush allowed without any PCI device
assigned? AFAICT the iomem/ioport_caps would only get set up when there
are devices passed through?

TBH I would be fine if we just say that PoD cannot be used in
conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.

I understand it's technically possible for PoD to be used together
with a domain that will later get a device passed through once PoD is
no longer in use, but I doubt there's much value in supporting that
use case, and I fear we might be introducing corner cases that could
create issues in the future. Overall I think it would be safer to just
disable PoD in conjunction with an IOMMU.
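
Roughly, what I have in mind is the stricter check below (a sketch
only, assuming is_iommu_enabled(d) is usable at this point in
p2m_pod_set_mem_target()):

    /* Refuse PoD operations outright for any domain with an IOMMU. */
    if ( is_iommu_enabled(d) )
        ret = -ENOTEMPTY;
    else
        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);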

> +        ret = -ENOTEMPTY;
> +    else
> +        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>  
>  out:
>      pod_unlock(p2m);
> @@ -367,6 +371,23 @@ out:
>      return ret;
>  }
>  
> +void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +
> +    ASSERT(is_hvm_domain(d));
> +
> +    pod_lock(p2m);
> +    lock_page_alloc(p2m);
> +
> +    target->tot_pages       = domain_tot_pages(d);
> +    target->pod_cache_pages = p2m->pod.count;
> +    target->pod_entries     = p2m->pod.entry_count;
> +
> +    unlock_page_alloc(p2m);
> +    pod_unlock(p2m);
> +}
> +
>  int p2m_pod_empty_cache(struct domain *d)
>  {
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
> @@ -1384,6 +1405,9 @@ guest_physmap_mark_populate_on_demand(st
>      if ( !paging_mode_translate(d) )
>          return -EINVAL;
>  
> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> +        return -ENOTEMPTY;
> +
>      do {
>          rc = mark_populate_on_demand(d, gfn, chunk_order);
>  
> @@ -1405,3 +1429,20 @@ void p2m_pod_init(struct p2m_domain *p2m
>      for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
>          p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN);
>  }
> +
> +bool p2m_pod_active(const struct domain *d)
> +{
> +    struct p2m_domain *p2m;
> +    bool res;
> +
> +    if ( !is_hvm_domain(d) )
> +        return false;
> +
> +    p2m = p2m_get_hostp2m(d);
> +
> +    pod_lock(p2m);
> +    res = p2m->pod.entry_count | p2m->pod.count;
> +    pod_unlock(p2m);
> +
> +    return res;
> +}
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -4778,7 +4778,6 @@ long arch_memory_op(unsigned long cmd, X
>      {
>          xen_pod_target_t target;
>          struct domain *d;
> -        struct p2m_domain *p2m;
>  
>          if ( copy_from_guest(&target, arg, 1) )
>              return -EFAULT;
> @@ -4787,7 +4786,9 @@ long arch_memory_op(unsigned long cmd, X
>          if ( d == NULL )
>              return -ESRCH;
>  
> -        if ( cmd == XENMEM_set_pod_target )
> +        if ( !is_hvm_domain(d) )
> +            rc = -EINVAL;
> +        else if ( cmd == XENMEM_set_pod_target )
>              rc = xsm_set_pod_target(XSM_PRIV, d);
>          else
>              rc = xsm_get_pod_target(XSM_PRIV, d);
> @@ -4813,10 +4814,7 @@ long arch_memory_op(unsigned long cmd, X
>          }
>          else if ( rc >= 0 )
>          {
> -            p2m = p2m_get_hostp2m(d);
> -            target.tot_pages       = domain_tot_pages(d);
> -            target.pod_cache_pages = p2m->pod.count;
> -            target.pod_entries     = p2m->pod.entry_count;
> +            p2m_pod_get_mem_target(d, &target);
>  
>              if ( __copy_to_guest(arg, &target, 1) )
>              {
> --- a/xen/common/vm_event.c
> +++ b/xen/common/vm_event.c
> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
>  
>              rc = -EXDEV;
>              /* Disallow paging in a PoD guest */
> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
> +            if ( p2m_pod_active(d) )

Isn't it fine to just check for entry_count like you suggest in the
change to libxl? This is what p2m_pod_entry_count actually does
(rather than entry_count | count).

Thanks, Roger.
Jan Beulich Feb. 3, 2022, 8:31 a.m. UTC | #4
On 02.02.2022 17:13, Roger Pau Monné wrote:
> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>>  
>>      ASSERT( pod_target >= p2m->pod.count );
>>  
>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> 
> Is it possible to have cache flush allowed without any PCI device
> assigned? AFAICT the iomem/ioport_caps would only get set up when there
> are devices passed through?

One can assign MMIO or ports to a guest the raw way. That's not
secure, but functionally explicitly permitted.
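
Such raw assignment is e.g. what the xl iomem= / ioports= config
options effect. An illustrative fragment (ranges made up; neither
option requires any PCI device to be assigned):

    iomem   = [ "f0000,80" ]   # 0x80 pages of MMIO at machine frame 0xf0000
    ioports = [ "03f8-03ff" ]  # legacy COM1 port range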

> TBH I would be fine if we just say that PoD cannot be used in
> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
> 
> I understand it's technically possible for PoD to be used together
> with a domain that will later get a device passed through once PoD is
> no longer in use, but I doubt there's much value in supporting that
> use case, and I fear we might be introducing corner cases that could
> create issues in the future. Overall I think it would be safer to just
> disable PoD in conjunction with an IOMMU.

I consider it wrong to put in place such a restriction, but I could
perhaps accept you and Andrew thinking this way if this was the only
aspect playing into here. However, this would then want an equivalent
tools side check, and while hunting down where to make the change as
done here, I wasn't able to figure out where that alternative
adjustment would need doing. Hence I would possibly(!) buy into this
only if someone else took care of doing so properly in the tool stack
(including the emission of a sensible error message).

Finally this still leaves out the "raw MMIO / ports" case mentioned
above.

>> --- a/xen/common/vm_event.c
>> +++ b/xen/common/vm_event.c
>> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
>>  
>>              rc = -EXDEV;
>>              /* Disallow paging in a PoD guest */
>> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
>> +            if ( p2m_pod_active(d) )
> 
> Isn't it fine to just check for entry_count like you suggest in the
> change to libxl?

I didn't think it would be, but I'm not entirely sure: If paging was
enabled before a guest actually starts, it wouldn't have any entries
but still be a PoD guest if it has a non-empty cache. The VM event
folks may be able to clarify this either way. But ...

> This is what p2m_pod_entry_count actually does (rather than entry_count | count).

... you really mean "did" here, as I'm removing p2m_pod_entry_count()
in this patch. Of course locking could be added to it instead (or
p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
could go with just the check which precisely matches what the comment
says (IOW otherwise I'd need to additionally know what exactly the
comment is to say).
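
For completeness, a locked variant would be something along the lines
of the sketch below. Note that it could then no longer take a
pointer-to-const, and it would presumably need to move out of the
header and into p2m-pod.c, where pod_lock() is visible:

    long p2m_pod_entry_count(struct p2m_domain *p2m)
    {
        long count;

        /* Take the PoD lock so the count can't change under our feet. */
        pod_lock(p2m);
        count = p2m->pod.entry_count;
        pod_unlock(p2m);

        return count;
    }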

Jan
Roger Pau Monné Feb. 3, 2022, 9:04 a.m. UTC | #5
On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
> On 02.02.2022 17:13, Roger Pau Monné wrote:
> > On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> >> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
> >>  
> >>      ASSERT( pod_target >= p2m->pod.count );
> >>  
> >> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> >> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> > 
> > Is it possible to have cache flush allowed without any PCI device
> > assigned? AFAICT the iomem/ioport_caps would only get setup when there
> > are device passed through?
> 
> One can assign MMIO or ports to a guest the raw way. That's not
> secure, but functionally explicitly permitted.
> 
> > TBH I would be fine if we just say that PoD cannot be used in
> > conjunction with an IOMMU, and just check for is_iommu_enable(d) here.
> > 
> > I understand it's technically possible for PoD to be used together
> > with a domain that will later get a device passed through once PoD is
> > no longer in use, but I doubt there's much value in supporting that
> > use case, and I fear we might be introducing corner cases that could
> > create issues in the future. Overall I think it would be safer to just
> > disable PoD in conjunction with an IOMMU.
> 
> I consider it wrong to put in place such a restriction, but I could
> perhaps accept you and Andrew thinking this way if this was the only
> aspect playing into here. However, this would then want an equivalent
> tools side check, and while hunting down where to make the change as
> done here, I wasn't able to figure out where that alternative
> adjustment would need doing. Hence I would possibly(!) buy into this
> only if someone else took care of doing so properly in the tool stack
> (including the emission of a sensible error message).

What about the (completely untested) chunk below:

diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
index d7a40d7550..e585ef4c5c 100644
--- a/tools/libs/light/libxl_create.c
+++ b/tools/libs/light/libxl_create.c
@@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
     pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
         (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
 
-    /* We cannot have PoD and PCI device assignment at the same time
+    /* We cannot have PoD and an active IOMMU at the same time
      * for HVM guest. It was reported that IOMMU cannot work with PoD
      * enabled because it needs to populated entire page table for
-     * guest. To stay on the safe side, we disable PCI device
-     * assignment when PoD is enabled.
+     * guest.
      */
     if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
-        d_config->num_pcidevs && pod_enabled) {
+        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
+        pod_enabled) {
         ret = ERROR_INVAL;
-        LOGD(ERROR, domid,
-             "PCI device assignment for HVM guest failed due to PoD enabled");
+        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
         goto error_out;
     }
 


> Finally this still leaves out the "raw MMIO / ports" case mentioned
> above.

But the raw MMIO 'mode' doesn't care much about PoD, because if
there's no PCI device assigned there's no IOMMU setup, and thus such
raw MMIO regions (could?) belong to a device that's not constrained by
the guest p2m anyway?

> >> --- a/xen/common/vm_event.c
> >> +++ b/xen/common/vm_event.c
> >> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
> >>  
> >>              rc = -EXDEV;
> >>              /* Disallow paging in a PoD guest */
> >> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
> >> +            if ( p2m_pod_active(d) )
> > 
> > Isn't it fine to just check for entry_count like you suggest in the
> > change to libxl?
> 
> I didn't think it would be, but I'm not entirely sure: If paging was
> enabled before a guest actually starts, it wouldn't have any entries
> but still be a PoD guest if it has a non-empty cache. The VM event
> folks may be able to clarify this either way. But ...
> 
> > This is what p2m_pod_entry_count actually does (rather than entry_count | count).
> 
> ... you really mean "did" here, as I'm removing p2m_pod_entry_count()
> in this patch. Of course locking could be added to it instead (or
> p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
> could go with just the check which precisely matches what the comment
> says (IOW otherwise I'd need to additionally know what exactly the
> comment is to say).

Could you briefly mention this in the commit message? Ie: VM event
code is also adjusted to make sure PoD is not in use and cannot be
used during the guest lifetime?

Thanks, Roger.
Jan Beulich Feb. 3, 2022, 9:21 a.m. UTC | #6
On 03.02.2022 10:04, Roger Pau Monné wrote:
> On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
>> On 02.02.2022 17:13, Roger Pau Monné wrote:
>>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
>>>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>>>>  
>>>>      ASSERT( pod_target >= p2m->pod.count );
>>>>  
>>>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>>>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
>>>
>>> Is it possible to have cache flush allowed without any PCI device
>>> assigned? AFAICT the iomem/ioport_caps would only get set up when there
>>> are devices passed through?
>>
>> One can assign MMIO or ports to a guest the raw way. That's not
>> secure, but functionally explicitly permitted.
>>
>>> TBH I would be fine if we just say that PoD cannot be used in
>>> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
>>>
>>> I understand it's technically possible for PoD to be used together
>>> with a domain that will later get a device passed through once PoD is
>>> no longer in use, but I doubt there's much value in supporting that
>>> use case, and I fear we might be introducing corner cases that could
>>> create issues in the future. Overall I think it would be safer to just
>>> disable PoD in conjunction with an IOMMU.
>>
>> I consider it wrong to put in place such a restriction, but I could
>> perhaps accept you and Andrew thinking this way if this was the only
>> aspect playing into here. However, this would then want an equivalent
>> tools side check, and while hunting down where to make the change as
>> done here, I wasn't able to figure out where that alternative
>> adjustment would need doing. Hence I would possibly(!) buy into this
>> only if someone else took care of doing so properly in the tool stack
>> (including the emission of a sensible error message).
> 
> What about the (completely untested) chunk below:
> 
> diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
> index d7a40d7550..e585ef4c5c 100644
> --- a/tools/libs/light/libxl_create.c
> +++ b/tools/libs/light/libxl_create.c
> @@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
>      pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
>          (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
>  
> -    /* We cannot have PoD and PCI device assignment at the same time
> +    /* We cannot have PoD and an active IOMMU at the same time
>       * for HVM guest. It was reported that IOMMU cannot work with PoD
>       * enabled because it needs to populated entire page table for
> -     * guest. To stay on the safe side, we disable PCI device
> -     * assignment when PoD is enabled.
> +     * guest.
>       */
>      if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
> -        d_config->num_pcidevs && pod_enabled) {
> +        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
> +        pod_enabled) {
>          ret = ERROR_INVAL;
> -        LOGD(ERROR, domid,
> -             "PCI device assignment for HVM guest failed due to PoD enabled");
> +        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
>          goto error_out;
>      }

Perhaps. Seeing this I actually recall coming across this check during
my investigation. That I didn't change it along those lines was then
really more because I wasn't convinced of the extra restriction; I
clearly misremembered when writing the earlier reply.
If we were to do what you suggest, I'd like to ask that the comment be
changed differently, though: "We cannot ..." then isn't really true
anymore. We choose not to permit this mode; "cannot" only applies to
actual device assignment (and of course only as long as there aren't
restartable IOMMU faults).

>> Finally this still leaves out the "raw MMIO / ports" case mentioned
>> above.
> 
> But the raw MMIO 'mode' doesn't care much about PoD, because if
> there's no PCI device assigned there's no IOMMU setup, and thus such
> raw MMIO regions (could?) belong to a device that's not constrained by
> the guest p2m anyway?

Hmm, yes, true.

>>>> --- a/xen/common/vm_event.c
>>>> +++ b/xen/common/vm_event.c
>>>> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
>>>>  
>>>>              rc = -EXDEV;
>>>>              /* Disallow paging in a PoD guest */
>>>> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
>>>> +            if ( p2m_pod_active(d) )
>>>
>>> Isn't it fine to just check for entry_count like you suggest in the
>>> change to libxl?
>>
>> I didn't think it would be, but I'm not entirely sure: If paging was
>> enabled before a guest actually starts, it wouldn't have any entries
>> but still be a PoD guest if it has a non-empty cache. The VM event
>> folks may be able to clarify this either way. But ...
>>
>>> This is what p2m_pod_entry_count actually does (rather than entry_count | count).
>>
>> ... you really mean "did" here, as I'm removing p2m_pod_entry_count()
>> in this patch. Of course locking could be added to it instead (or
>> p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
>> could go with just the check which precisely matches what the comment
>> says (IOW otherwise I'd need to additionally know what exactly the
>> comment is to say).
> 
> Could you briefly mention this in the commit message? Ie: VM event
> code is also adjusted to make sure PoD is not in use and cannot be
> used during the guest lifetime?

I've added

"Nor was the use of that function in line with the immediately preceding
 comment: A PoD guest isn't just one with a non-zero entry count, but
 also one with a non-empty cache (e.g. prior to actually launching the
 guest)."

to the already existing paragraph about the removal of that function.

Jan
Roger Pau Monné Feb. 3, 2022, 9:52 a.m. UTC | #7
On Thu, Feb 03, 2022 at 10:21:54AM +0100, Jan Beulich wrote:
> On 03.02.2022 10:04, Roger Pau Monné wrote:
> > On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
> >> On 02.02.2022 17:13, Roger Pau Monné wrote:
> >>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> >>>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
> >>>>  
> >>>>      ASSERT( pod_target >= p2m->pod.count );
> >>>>  
> >>>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> >>>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> >>>
> >>> Is it possible to have cache flush allowed without any PCI device
> >>> assigned? AFAICT the iomem/ioport_caps would only get set up when there
> >>> are devices passed through?
> >>
> >> One can assign MMIO or ports to a guest the raw way. That's not
> >> secure, but functionally explicitly permitted.
> >>
> >>> TBH I would be fine if we just say that PoD cannot be used in
> >>> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
> >>>
> >>> I understand it's technically possible for PoD to be used together
> >>> with a domain that will later get a device passed through once PoD is
> >>> no longer in use, but I doubt there's much value in supporting that
> >>> use case, and I fear we might be introducing corner cases that could
> >>> create issues in the future. Overall I think it would be safer to just
> >>> disable PoD in conjunction with an IOMMU.
> >>
> >> I consider it wrong to put in place such a restriction, but I could
> >> perhaps accept you and Andrew thinking this way if this was the only
> >> aspect playing into here. However, this would then want an equivalent
> >> tools side check, and while hunting down where to make the change as
> >> done here, I wasn't able to figure out where that alternative
> >> adjustment would need doing. Hence I would possibly(!) buy into this
> >> only if someone else took care of doing so properly in the tool stack
> >> (including the emission of a sensible error message).
> > 
> > What about the (completely untested) chunk below:
> > 
> > diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
> > index d7a40d7550..e585ef4c5c 100644
> > --- a/tools/libs/light/libxl_create.c
> > +++ b/tools/libs/light/libxl_create.c
> > @@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
> >      pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
> >          (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
> >  
> > -    /* We cannot have PoD and PCI device assignment at the same time
> > +    /* We cannot have PoD and an active IOMMU at the same time
> >       * for HVM guest. It was reported that IOMMU cannot work with PoD
> >       * enabled because it needs to populated entire page table for
> > -     * guest. To stay on the safe side, we disable PCI device
> > -     * assignment when PoD is enabled.
> > +     * guest.
> >       */
> >      if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
> > -        d_config->num_pcidevs && pod_enabled) {
> > +        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
> > +        pod_enabled) {
> >          ret = ERROR_INVAL;
> > -        LOGD(ERROR, domid,
> > -             "PCI device assignment for HVM guest failed due to PoD enabled");
> > +        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
> >          goto error_out;
> >      }
> 
> Perhaps. Seeing this I actually recall coming across this check during
> my investigation. That I didn't change it along those lines was then
> really more because I wasn't convinced of the extra restriction; I
> clearly misremembered when writing the earlier reply.
> If we were to do what you suggest, I'd like to ask that the comment be
> changed differently, though: "We cannot ..." then isn't really true
> anymore. We choose not to permit this mode; "cannot" only applies to
> actual device assignment (and of course only as long as there aren't
> restartable IOMMU faults).

I'm fine with an adjusted wording here. This was mostly a placement
suggestion, but I didn't give much thought to the error message.

> >> Finally this still leaves out the "raw MMIO / ports" case mentioned
> >> above.
> > 
> > But the raw MMIO 'mode' doesn't care much about PoD, because if
> > there's no PCI device assigned there's no IOMMU setup, and thus such
> > raw MMIO regions (could?) belong to a device that's not constrained by
> > the guest p2m anyway?
> 
> Hmm, yes, true.
> 
> >>>> --- a/xen/common/vm_event.c
> >>>> +++ b/xen/common/vm_event.c
> >>>> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
> >>>>  
> >>>>              rc = -EXDEV;
> >>>>              /* Disallow paging in a PoD guest */
> >>>> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
> >>>> +            if ( p2m_pod_active(d) )
> >>>
> >>> Isn't it fine to just check for entry_count like you suggest in the
> >>> change to libxl?
> >>
> >> I didn't think it would be, but I'm not entirely sure: If paging was
> >> enabled before a guest actually starts, it wouldn't have any entries
> >> but still be a PoD guest if it has a non-empty cache. The VM event
> >> folks may be able to clarify this either way. But ...
> >>
> >>> This is what p2m_pod_entry_count actually does (rather than entry_count | count).
> >>
> >> ... you really mean "did" here, as I'm removing p2m_pod_entry_count()
> >> in this patch. Of course locking could be added to it instead (or
> >> p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
> >> could go with just the check which precisely matches what the comment
> >> says (IOW otherwise I'd need to additionally know what exactly the
> >> comment is to say).
> > 
> > Could you briefly mention this in the commit message? Ie: VM event
> > code is also adjusted to make sure PoD is not in use and cannot be
> > used during the guest lifetime?
> 
> I've added
> 
> "Nor was the use of that function in line with the immediately preceding
>  comment: A PoD guest isn't just one with a non-zero entry count, but
>  also one with a non-empty cache (e.g. prior to actually launching the
>  guest)."
> 
> to the already existing paragraph about the removal of that function.

Thanks, Roger.
Jan Beulich Feb. 3, 2022, 10:20 a.m. UTC | #8
On 03.02.2022 10:52, Roger Pau Monné wrote:
> On Thu, Feb 03, 2022 at 10:21:54AM +0100, Jan Beulich wrote:
>> On 03.02.2022 10:04, Roger Pau Monné wrote:
>>> On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
>>>> On 02.02.2022 17:13, Roger Pau Monné wrote:
>>>>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
>>>>>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>>>>>>  
>>>>>>      ASSERT( pod_target >= p2m->pod.count );
>>>>>>  
>>>>>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>>>>>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
>>>>>
>>>>> Is it possible to have cache flush allowed without any PCI device
>>>>> assigned? AFAICT the iomem/ioport_caps would only get set up when there
>>>>> are devices passed through?
>>>>
>>>> One can assign MMIO or ports to a guest the raw way. That's not
>>>> secure, but functionally explicitly permitted.
>>>>
>>>>> TBH I would be fine if we just say that PoD cannot be used in
>>>>> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
>>>>>
>>>>> I understand it's technically possible for PoD to be used together
>>>>> with a domain that will later get a device passed through once PoD is
>>>>> no longer in use, but I doubt there's much value in supporting that
>>>>> use case, and I fear we might be introducing corner cases that could
>>>>> create issues in the future. Overall I think it would be safer to just
>>>>> disable PoD in conjunction with an IOMMU.
>>>>
>>>> I consider it wrong to put in place such a restriction, but I could
>>>> perhaps accept you and Andrew thinking this way if this was the only
>>>> aspect playing into here. However, this would then want an equivalent
>>>> tools side check, and while hunting down where to make the change as
>>>> done here, I wasn't able to figure out where that alternative
>>>> adjustment would need doing. Hence I would possibly(!) buy into this
>>>> only if someone else took care of doing so properly in the tool stack
>>>> (including the emission of a sensible error message).
>>>
>>> What about the (completely untested) chunk below:
>>>
>>> diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
>>> index d7a40d7550..e585ef4c5c 100644
>>> --- a/tools/libs/light/libxl_create.c
>>> +++ b/tools/libs/light/libxl_create.c
>>> @@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
>>>      pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
>>>          (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
>>>  
>>> -    /* We cannot have PoD and PCI device assignment at the same time
>>> +    /* We cannot have PoD and an active IOMMU at the same time
>>>       * for HVM guest. It was reported that IOMMU cannot work with PoD
>>>       * enabled because it needs to populated entire page table for
>>> -     * guest. To stay on the safe side, we disable PCI device
>>> -     * assignment when PoD is enabled.
>>> +     * guest.
>>>       */
>>>      if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
>>> -        d_config->num_pcidevs && pod_enabled) {
>>> +        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
>>> +        pod_enabled) {
>>>          ret = ERROR_INVAL;
>>> -        LOGD(ERROR, domid,
>>> -             "PCI device assignment for HVM guest failed due to PoD enabled");
>>> +        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
>>>          goto error_out;
>>>      }
>>
>> Perhaps. Seeing this I actually recall coming across this check during
>> my investigation. That I didn't change it along those lines was then
>> really more because I wasn't convinced of the extra restriction; I
>> clearly misremembered when writing the earlier reply.
>> If we were to do what you suggest, I'd like to ask that the comment be
>> changed differently, though: "We cannot ..." then isn't really true
>> anymore. We choose not to permit this mode; "cannot" only applies to
>> actual device assignment (and of course only as long as there aren't
>> restartable IOMMU faults).
> 
> I'm fine with an adjusted wording here. This was mostly a placement
> suggestion, but I didn't give much thought to the error message.

FTAOD: Are you going to transform this into a proper patch then? While
I wouldn't object to such a behavioral change, I also wouldn't want to
put my name under it. But if it went in, I think I might be able to
then drop the libxl adjustment from my patch.

Jan
Roger Pau Monné Feb. 3, 2022, 10:23 a.m. UTC | #9
On Thu, Feb 03, 2022 at 11:20:15AM +0100, Jan Beulich wrote:
> On 03.02.2022 10:52, Roger Pau Monné wrote:
> > On Thu, Feb 03, 2022 at 10:21:54AM +0100, Jan Beulich wrote:
> >> On 03.02.2022 10:04, Roger Pau Monné wrote:
> >>> On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
> >>>> On 02.02.2022 17:13, Roger Pau Monné wrote:
> >>>>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> >>>>>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
> >>>>>>  
> >>>>>>      ASSERT( pod_target >= p2m->pod.count );
> >>>>>>  
> >>>>>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> >>>>>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> >>>>>
> >>>>> Is it possible to have cache flush allowed without any PCI device
> >>>>> assigned? AFAICT the iomem/ioport_caps would only get set up when there
> >>>>> are devices passed through?
> >>>>
> >>>> One can assign MMIO or ports to a guest the raw way. That's not
> >>>> secure, but functionally explicitly permitted.
> >>>>
> >>>>> TBH I would be fine if we just say that PoD cannot be used in
> >>>>> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
> >>>>>
> >>>>> I understand it's technically possible for PoD to be used together
> >>>>> with a domain that will later get a device passed through once PoD is
> >>>>> no longer in use, but I doubt there's much value in supporting that
> >>>>> use case, and I fear we might be introducing corner cases that could
> >>>>> create issues in the future. Overall I think it would be safer to just
> >>>>> disable PoD in conjunction with an IOMMU.
> >>>>
> >>>> I consider it wrong to put in place such a restriction, but I could
> >>>> perhaps accept you and Andrew thinking this way if this was the only
> >>>> aspect playing into here. However, this would then want an equivalent
> >>>> tools side check, and while hunting down where to make the change as
> >>>> done here, I wasn't able to figure out where that alternative
> >>>> adjustment would need doing. Hence I would possibly(!) buy into this
> >>>> only if someone else took care of doing so properly in the tool stack
> >>>> (including the emission of a sensible error message).
> >>>
> >>> What about the (completely untested) chunk below:
> >>>
> >>> diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
> >>> index d7a40d7550..e585ef4c5c 100644
> >>> --- a/tools/libs/light/libxl_create.c
> >>> +++ b/tools/libs/light/libxl_create.c
> >>> @@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
> >>>      pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
> >>>          (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
> >>>  
> >>> -    /* We cannot have PoD and PCI device assignment at the same time
> >>> +    /* We cannot have PoD and an active IOMMU at the same time
> >>>       * for HVM guest. It was reported that IOMMU cannot work with PoD
> >>>       * enabled because it needs to populated entire page table for
> >>> -     * guest. To stay on the safe side, we disable PCI device
> >>> -     * assignment when PoD is enabled.
> >>> +     * guest.
> >>>       */
> >>>      if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
> >>> -        d_config->num_pcidevs && pod_enabled) {
> >>> +        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
> >>> +        pod_enabled) {
> >>>          ret = ERROR_INVAL;
> >>> -        LOGD(ERROR, domid,
> >>> -             "PCI device assignment for HVM guest failed due to PoD enabled");
> >>> +        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
> >>>          goto error_out;
> >>>      }
> >>
> >> Perhaps. Seeing this I actually recall coming across this check during
> >> my investigation. That I didn't change it along those lines was then
> >> really more because I wasn't convinced of the extra restriction; I
> >> clearly misremembered when writing the earlier reply.
> >> If we were to do what you suggest, I'd like to ask that the comment be
> >> changed differently, though: "We cannot ..." then isn't really true
> >> anymore. We choose not to permit this mode; "cannot" only applies to
> >> actual device assignment (and of course only as long as there aren't
> >> restartable IOMMU faults).
> > 
> > I'm fine with an adjusted wording here. This was mostly a placement
> > suggestion, but I didn't give much thought to the error message.
> 
> FTAOD: Are you going to transform this into a proper patch then? While
> I wouldn't object to such a behavioral change, I also wouldn't want to
> put my name under it. But if it went in, I think I might be able to
> then drop the libxl adjustment from my patch.

Oh, I somewhat assumed you would integrate this check into the patch.
I can send a standalone patch myself if that's your preference. Let me
do that now.

Roger.

Patch

--- a/tools/libs/light/libxl_pci.c
+++ b/tools/libs/light/libxl_pci.c
@@ -1619,8 +1619,13 @@  void libxl__device_pci_add(libxl__egc *e
     pas->callback = device_pci_add_stubdom_done;
 
     if (libxl__domain_type(gc, domid) == LIBXL_DOMAIN_TYPE_HVM) {
-        rc = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
-        if (rc) {
+        int r;
+        uint64_t cache, ents;
+
+        rc = ERROR_FAIL;
+
+        r = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
+        if (r) {
             LOGD(ERROR, domid,
                  "PCI device %04x:%02x:%02x.%u %s?",
                  pci->domain, pci->bus, pci->dev, pci->func,
@@ -1628,6 +1633,22 @@  void libxl__device_pci_add(libxl__egc *e
                  : "already assigned to a different guest");
             goto out;
         }
+
+        r = xc_domain_get_pod_target(ctx->xch, domid, NULL, &cache, &ents);
+        if (r) {
+            LOGED(ERROR, domid, "Cannot determine PoD status");
+            goto out;
+        }
+        /*
+         * In principle it is sufficient for the domain to have ballooned down
+         * enough such that ents <= cache.  But any remaining entries would
+         * need resolving first.  Until such time when this gets effected,
+         * refuse assignment as long as any entries are left.
+         */
+        if (ents /* > cache */) {
+            LOGD(ERROR, domid, "Cannot assign device with PoD still active");
+            goto out;
+        }
     }
 
     rc = libxl__device_pci_setdefault(gc, domid, pci, !starting);
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -20,6 +20,7 @@ 
  */
 
 #include <xen/event.h>
+#include <xen/iocap.h>
 #include <xen/ioreq.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
@@ -359,7 +360,10 @@  p2m_pod_set_mem_target(struct domain *d,
 
     ASSERT( pod_target >= p2m->pod.count );
 
-    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+        ret = -ENOTEMPTY;
+    else
+        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
 
 out:
     pod_unlock(p2m);
@@ -367,6 +371,23 @@  out:
     return ret;
 }
 
+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    ASSERT(is_hvm_domain(d));
+
+    pod_lock(p2m);
+    lock_page_alloc(p2m);
+
+    target->tot_pages       = domain_tot_pages(d);
+    target->pod_cache_pages = p2m->pod.count;
+    target->pod_entries     = p2m->pod.entry_count;
+
+    unlock_page_alloc(p2m);
+    pod_unlock(p2m);
+}
+
 int p2m_pod_empty_cache(struct domain *d)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
@@ -1384,6 +1405,9 @@  guest_physmap_mark_populate_on_demand(st
     if ( !paging_mode_translate(d) )
         return -EINVAL;
 
+    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+        return -ENOTEMPTY;
+
     do {
         rc = mark_populate_on_demand(d, gfn, chunk_order);
 
@@ -1405,3 +1429,20 @@  void p2m_pod_init(struct p2m_domain *p2m
     for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
         p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN);
 }
+
+bool p2m_pod_active(const struct domain *d)
+{
+    struct p2m_domain *p2m;
+    bool res;
+
+    if ( !is_hvm_domain(d) )
+        return false;
+
+    p2m = p2m_get_hostp2m(d);
+
+    pod_lock(p2m);
+    res = p2m->pod.entry_count | p2m->pod.count;
+    pod_unlock(p2m);
+
+    return res;
+}
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4778,7 +4778,6 @@  long arch_memory_op(unsigned long cmd, X
     {
         xen_pod_target_t target;
         struct domain *d;
-        struct p2m_domain *p2m;
 
         if ( copy_from_guest(&target, arg, 1) )
             return -EFAULT;
@@ -4787,7 +4786,9 @@  long arch_memory_op(unsigned long cmd, X
         if ( d == NULL )
             return -ESRCH;
 
-        if ( cmd == XENMEM_set_pod_target )
+        if ( !is_hvm_domain(d) )
+            rc = -EINVAL;
+        else if ( cmd == XENMEM_set_pod_target )
             rc = xsm_set_pod_target(XSM_PRIV, d);
         else
             rc = xsm_get_pod_target(XSM_PRIV, d);
@@ -4813,10 +4814,7 @@  long arch_memory_op(unsigned long cmd, X
         }
         else if ( rc >= 0 )
         {
-            p2m = p2m_get_hostp2m(d);
-            target.tot_pages       = domain_tot_pages(d);
-            target.pod_cache_pages = p2m->pod.count;
-            target.pod_entries     = p2m->pod.entry_count;
+            p2m_pod_get_mem_target(d, &target);
 
             if ( __copy_to_guest(arg, &target, 1) )
             {
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -639,7 +639,7 @@  int vm_event_domctl(struct domain *d, st
 
             rc = -EXDEV;
             /* Disallow paging in a PoD guest */
-            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
+            if ( p2m_pod_active(d) )
                 break;
 
             /* domain_pause() not required here, see XSA-99 */
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -454,11 +454,12 @@  bool arch_iommu_use_permitted(const stru
 {
     /*
      * Prevent device assign if mem paging, mem sharing or log-dirty
-     * have been enabled for this domain.
+     * have been enabled for this domain, or if PoD is still in active use.
      */
     return d == dom_io ||
            (likely(!mem_sharing_enabled(d)) &&
             likely(!mem_paging_enabled(d)) &&
+            likely(!p2m_pod_active(d)) &&
             likely(!p2m_get_hostp2m(d)->global_logdirty));
 }
 
--- a/xen/arch/x86/include/asm/p2m.h
+++ b/xen/arch/x86/include/asm/p2m.h
@@ -661,6 +661,12 @@  int p2m_pod_empty_cache(struct domain *d
  * domain matches target */
 int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
 
+/* Obtain a consistent snapshot of PoD related domain state. */
+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target);
+
+/* Check whether PoD is (still) active in a domain. */
+bool p2m_pod_active(const struct domain *d);
+
 /* Scan pod cache when offline/broken page triggered */
 int
 p2m_pod_offline_or_broken_hit(struct page_info *p);
@@ -669,11 +675,6 @@  p2m_pod_offline_or_broken_hit(struct pag
 void
 p2m_pod_offline_or_broken_replace(struct page_info *p);
 
-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
-{
-    return p2m->pod.entry_count;
-}
-
 void p2m_pod_init(struct p2m_domain *p2m);
 
 #else
@@ -689,6 +690,11 @@  static inline int p2m_pod_empty_cache(st
     return 0;
 }
 
+static inline bool p2m_pod_active(const struct domain *d)
+{
+    return false;
+}
+
 static inline int p2m_pod_offline_or_broken_hit(struct page_info *p)
 {
     return 0;
@@ -699,11 +705,6 @@  static inline void p2m_pod_offline_or_br
     ASSERT_UNREACHABLE();
 }
 
-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
-{
-    return 0;
-}
-
 static inline void p2m_pod_init(struct p2m_domain *p2m) {}
 
 #endif