diff mbox series

[v3] x86/HVM: p2m_ram_ro is incompatible with device pass-through

Message ID 98fd2d35-30f6-961d-a03d-3354b77d49b0@suse.com (mailing list archive)
State New, archived
Headers show
Series [v3] x86/HVM: p2m_ram_ro is incompatible with device pass-through | expand

Commit Message

Jan Beulich Oct. 1, 2019, 9:07 a.m. UTC
The write-discard property of the type can't be represented in IOMMU
page table entries. Make sure the respective checks / tracking can't
race, by utilizing the domain lock. The other sides of the sharing/
paging/log-dirty exclusion checks should subsequently perhaps also be
put under that lock then.

This also fixes an unguarded d->arch.hvm access.

Take the opportunity and also convert neighboring bool_t to bool in
struct hvm_domain.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
---
v3: Re-base.
v2: Don't set p2m_ram_ro_used when failing the request.

Comments

Roger Pau Monné Oct. 1, 2019, 11:30 a.m. UTC | #1
On Tue, Oct 01, 2019 at 11:07:55AM +0200, Jan Beulich wrote:
> The write-discard property of the type can't be represented in IOMMU
> page table entries. Make sure the respective checks / tracking can't
> race, by utilizing the domain lock. The other sides of the sharing/
> paging/log-dirty exclusion checks should subsequently perhaps also be
> put under that lock then.
> 
> This also fixes an unguarded d->arch.hvm access.
> 
> Take the opportunity and also convert neighboring bool_t to bool in
> struct hvm_domain.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
> ---
> v3: Re-base.
> v2: Don't set p2m_ram_ro_used when failing the request.
> 
> --- a/xen/arch/x86/hvm/dm.c
> +++ b/xen/arch/x86/hvm/dm.c
> @@ -255,16 +255,33 @@ static int set_mem_type(struct domain *d
>  
>      mem_type = array_index_nospec(data->mem_type, ARRAY_SIZE(memtype));
>  
> -    if ( mem_type == HVMMEM_ioreq_server )
> +    switch ( mem_type )
>      {
>          unsigned int flags;
>  
> +    case HVMMEM_ioreq_server:
>          if ( !hap_enabled(d) )
>              return -EOPNOTSUPP;
>  
>          /* Do not change to HVMMEM_ioreq_server if no ioreq server mapped. */
>          if ( !p2m_get_ioreq_server(d, &flags) )
>              return -EINVAL;
> +
> +        break;
> +
> +    case HVMMEM_ram_ro:
> +        /* p2m_ram_ro can't be represented in IOMMU mappings. */
> +        domain_lock(d);
> +        if ( has_arch_pdevs(d) )

I would use is_iommu_enabled because I think it's clearer in this
context (giving the comment above explicitly refers to having iommu
mappings).

> +            rc = -EXDEV;

EOPNOTSUPP might be better, since it's possible that future iommus
support such page type?

> +        else
> +            d->arch.hvm.p2m_ram_ro_used = true;
> +        domain_unlock(d);
> +
> +        if ( rc )
> +            return rc;
> +
> +        break;
>      }
>  
>      while ( iter < data->nr )
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -1486,15 +1486,33 @@ static int assign_device(struct domain *
>      if ( !is_iommu_enabled(d) )
>          return 0;
>  
> -    /* Prevent device assign if mem paging or mem sharing have been 
> -     * enabled for this domain */
> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
> -                  vm_event_check_ring(d->vm_event_paging) ||
> +    domain_lock(d);
> +
> +    /*
> +     * Prevent device assignment if any of
> +     * - mem paging
> +     * - mem sharing
> +     * - the p2m_ram_ro type
> +     * - global log-dirty mode
> +     * are in use by this domain.
> +     */
> +    if ( unlikely(vm_event_check_ring(d->vm_event_paging) ||
> +#ifdef CONFIG_HVM
> +                  (is_hvm_domain(d) &&
> +                   (d->arch.hvm.mem_sharing_enabled ||
> +                    d->arch.hvm.p2m_ram_ro_used)) ||
> +#endif
>                    p2m_get_hostp2m(d)->global_logdirty) )

Is such check needed anymore?

With the enabling of the iommu right at domain creation it shouldn't
be possible to enable any of the above features at all anymore.

Thanks, Roger.
Jan Beulich Oct. 1, 2019, 11:40 a.m. UTC | #2
On 01.10.2019 13:30, Roger Pau Monné  wrote:
> On Tue, Oct 01, 2019 at 11:07:55AM +0200, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/dm.c
>> +++ b/xen/arch/x86/hvm/dm.c
>> @@ -255,16 +255,33 @@ static int set_mem_type(struct domain *d
>>  
>>      mem_type = array_index_nospec(data->mem_type, ARRAY_SIZE(memtype));
>>  
>> -    if ( mem_type == HVMMEM_ioreq_server )
>> +    switch ( mem_type )
>>      {
>>          unsigned int flags;
>>  
>> +    case HVMMEM_ioreq_server:
>>          if ( !hap_enabled(d) )
>>              return -EOPNOTSUPP;
>>  
>>          /* Do not change to HVMMEM_ioreq_server if no ioreq server mapped. */
>>          if ( !p2m_get_ioreq_server(d, &flags) )
>>              return -EINVAL;
>> +
>> +        break;
>> +
>> +    case HVMMEM_ram_ro:
>> +        /* p2m_ram_ro can't be represented in IOMMU mappings. */
>> +        domain_lock(d);
>> +        if ( has_arch_pdevs(d) )
> 
> I would use is_iommu_enabled because I think it's clearer in this
> context (giving the comment above explicitly refers to having iommu
> mappings).

But the whole point of the re-basing over Paul's change is that now
the operation gets refused only if a device was actually assigned.

>> +            rc = -EXDEV;
> 
> EOPNOTSUPP might be better, since it's possible that future iommus
> support such page type?

I don't think future IOMMU behavior affects the choice of error
code. I wanted to use something half way reasonable, yet not too
common, in order to be able to easily identify the source of the
error. If you and others think this isn't a meaningful concern,
I'd be okay switching to -EOPNOTSUPP.

>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -1486,15 +1486,33 @@ static int assign_device(struct domain *
>>      if ( !is_iommu_enabled(d) )
>>          return 0;
>>  
>> -    /* Prevent device assign if mem paging or mem sharing have been 
>> -     * enabled for this domain */
>> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
>> -                  vm_event_check_ring(d->vm_event_paging) ||
>> +    domain_lock(d);
>> +
>> +    /*
>> +     * Prevent device assignment if any of
>> +     * - mem paging
>> +     * - mem sharing
>> +     * - the p2m_ram_ro type
>> +     * - global log-dirty mode
>> +     * are in use by this domain.
>> +     */
>> +    if ( unlikely(vm_event_check_ring(d->vm_event_paging) ||
>> +#ifdef CONFIG_HVM
>> +                  (is_hvm_domain(d) &&
>> +                   (d->arch.hvm.mem_sharing_enabled ||
>> +                    d->arch.hvm.p2m_ram_ro_used)) ||
>> +#endif
>>                    p2m_get_hostp2m(d)->global_logdirty) )
> 
> Is such check needed anymore?
> 
> With the enabling of the iommu right at domain creation it shouldn't
> be possible to enable any of the above features at all anymore.

See above - all such checks should now be / get converted to check
whether devices are assigned, not whether IOMMU page tables exist.
After all we want to refuse requests only if strictly necessary.

Jan
Roger Pau Monné Oct. 1, 2019, 2:15 p.m. UTC | #3
On Tue, Oct 01, 2019 at 01:40:57PM +0200, Jan Beulich wrote:
> On 01.10.2019 13:30, Roger Pau Monné  wrote:
> > On Tue, Oct 01, 2019 at 11:07:55AM +0200, Jan Beulich wrote:
> >> --- a/xen/arch/x86/hvm/dm.c
> >> +++ b/xen/arch/x86/hvm/dm.c
> >> @@ -255,16 +255,33 @@ static int set_mem_type(struct domain *d
> >>  
> >>      mem_type = array_index_nospec(data->mem_type, ARRAY_SIZE(memtype));
> >>  
> >> -    if ( mem_type == HVMMEM_ioreq_server )
> >> +    switch ( mem_type )
> >>      {
> >>          unsigned int flags;
> >>  
> >> +    case HVMMEM_ioreq_server:
> >>          if ( !hap_enabled(d) )
> >>              return -EOPNOTSUPP;
> >>  
> >>          /* Do not change to HVMMEM_ioreq_server if no ioreq server mapped. */
> >>          if ( !p2m_get_ioreq_server(d, &flags) )
> >>              return -EINVAL;
> >> +
> >> +        break;
> >> +
> >> +    case HVMMEM_ram_ro:
> >> +        /* p2m_ram_ro can't be represented in IOMMU mappings. */
> >> +        domain_lock(d);
> >> +        if ( has_arch_pdevs(d) )
> > 
> > I would use is_iommu_enabled because I think it's clearer in this
> > context (giving the comment above explicitly refers to having iommu
> > mappings).
> 
> But the whole point of the re-basing over Paul's change is that now
> the operation gets refused only if a device was actually assigned.
> 
> >> +            rc = -EXDEV;
> > 
> > EOPNOTSUPP might be better, since it's possible that future iommus
> > support such page type?
> 
> I don't think future IOMMU behavior affects the choice of error
> code. I wanted to use something half way reasonable, yet not too
> common, in order to be able to easily identify the source of the
> error. If you and others think this isn't a meaningful concern,
> I'd be okay switching to -EOPNOTSUPP.
> 
> >> --- a/xen/drivers/passthrough/pci.c
> >> +++ b/xen/drivers/passthrough/pci.c
> >> @@ -1486,15 +1486,33 @@ static int assign_device(struct domain *
> >>      if ( !is_iommu_enabled(d) )
> >>          return 0;
> >>  
> >> -    /* Prevent device assign if mem paging or mem sharing have been 
> >> -     * enabled for this domain */
> >> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
> >> -                  vm_event_check_ring(d->vm_event_paging) ||
> >> +    domain_lock(d);
> >> +
> >> +    /*
> >> +     * Prevent device assignment if any of
> >> +     * - mem paging
> >> +     * - mem sharing
> >> +     * - the p2m_ram_ro type
> >> +     * - global log-dirty mode
> >> +     * are in use by this domain.
> >> +     */
> >> +    if ( unlikely(vm_event_check_ring(d->vm_event_paging) ||

Would be nice to have some syntactic sugar like vm_event_enabled or
some such.

> >> +#ifdef CONFIG_HVM
> >> +                  (is_hvm_domain(d) &&
> >> +                   (d->arch.hvm.mem_sharing_enabled ||
> >> +                    d->arch.hvm.p2m_ram_ro_used)) ||
> >> +#endif

Do you really need the CONFIG_HVM guards? is_hvm_domain already has a
IS_ENABLED(CONFIG_HVM).

> >>                    p2m_get_hostp2m(d)->global_logdirty) )
> > 
> > Is such check needed anymore?
> > 
> > With the enabling of the iommu right at domain creation it shouldn't
> > be possible to enable any of the above features at all anymore.
> 
> See above - all such checks should now be / get converted to check
> whether devices are assigned, not whether IOMMU page tables exist.
> After all we want to refuse requests only if strictly necessary.

Oh right, I was missing the whole point then. So we still keep the
iommu enabled together with introspection or ram_ro as long as there
are no devices assigned.

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.
Jan Beulich Oct. 1, 2019, 2:38 p.m. UTC | #4
On 01.10.2019 16:15, Roger Pau Monné  wrote:
> On Tue, Oct 01, 2019 at 01:40:57PM +0200, Jan Beulich wrote:
>> On 01.10.2019 13:30, Roger Pau Monné  wrote:
>>> On Tue, Oct 01, 2019 at 11:07:55AM +0200, Jan Beulich wrote:
>>>> --- a/xen/drivers/passthrough/pci.c
>>>> +++ b/xen/drivers/passthrough/pci.c
>>>> @@ -1486,15 +1486,33 @@ static int assign_device(struct domain *
>>>>      if ( !is_iommu_enabled(d) )
>>>>          return 0;
>>>>  
>>>> -    /* Prevent device assign if mem paging or mem sharing have been 
>>>> -     * enabled for this domain */
>>>> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
>>>> -                  vm_event_check_ring(d->vm_event_paging) ||
>>>> +    domain_lock(d);
>>>> +
>>>> +    /*
>>>> +     * Prevent device assignment if any of
>>>> +     * - mem paging
>>>> +     * - mem sharing
>>>> +     * - the p2m_ram_ro type
>>>> +     * - global log-dirty mode
>>>> +     * are in use by this domain.
>>>> +     */
>>>> +    if ( unlikely(vm_event_check_ring(d->vm_event_paging) ||
> 
> Would be nice to have some syntactic sugar like vm_event_enabled or
> some such.

I'll leave that to the VM event maintainers.

>>>> +#ifdef CONFIG_HVM
>>>> +                  (is_hvm_domain(d) &&
>>>> +                   (d->arch.hvm.mem_sharing_enabled ||
>>>> +                    d->arch.hvm.p2m_ram_ro_used)) ||
>>>> +#endif
> 
> Do you really need the CONFIG_HVM guards? is_hvm_domain already has a
> IS_ENABLED(CONFIG_HVM).

Strictly speaking at this point in time it wouldn't be needed.
But eventually I think we will want such, as there's no point
having a bigger than necessary struct arch_domain (and struct
arch_vcpu) in a !HVM build. Achieving that would likely imply
though that the entire d->arch.hvm disappears, and hence
without some kind of guards the above would then fail to
compile. (I have accumulated quite a bit of related work
already, which is probably why I felt like adding the #ifdef-s
here.)

>>>>                    p2m_get_hostp2m(d)->global_logdirty) )
>>>
>>> Is such check needed anymore?
>>>
>>> With the enabling of the iommu right at domain creation it shouldn't
>>> be possible to enable any of the above features at all anymore.
>>
>> See above - all such checks should now be / get converted to check
>> whether devices are assigned, not whether IOMMU page tables exist.
>> After all we want to refuse requests only if strictly necessary.
> 
> Oh right, I was missing the whole point then. So we still keep the
> iommu enabled together with introspection or ram_ro as long as there
> are no devices assigned.
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

Jan
Andrew Cooper Oct. 1, 2019, 6 p.m. UTC | #5
On 01/10/2019 10:07, Jan Beulich wrote:
> The write-discard property of the type can't be represented in IOMMU
> page table entries. Make sure the respective checks / tracking can't
> race, by utilizing the domain lock. The other sides of the sharing/
> paging/log-dirty exclusion checks should subsequently perhaps also be
> put under that lock then.
>
> This also fixes an unguarded d->arch.hvm access.
>
> Take the opportunity and also convert neighboring bool_t to bool in
> struct hvm_domain.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

Leaving aside the logdirty change which is an automatic no from me in
this form,  I can't see how this change avoids regressing the status quo.

Qemu really does set this flag for option ROMs, which will typically be
the iPXE image for net booting.  Currently, netbooting an HVM guest with
device passed through to it does work, because it is part of XenServers
basic testing.

It is entirely possible that this only "works" as long as DMA doesn't
touch the option ROM, and while this might not be ideal, it seems to be
less bad behaviour than breaking existing VMs using this configuration.

~Andrew
Jan Beulich Oct. 2, 2019, 7:59 a.m. UTC | #6
On 01.10.2019 20:00, Andrew Cooper wrote:
> On 01/10/2019 10:07, Jan Beulich wrote:
>> The write-discard property of the type can't be represented in IOMMU
>> page table entries. Make sure the respective checks / tracking can't
>> race, by utilizing the domain lock. The other sides of the sharing/
>> paging/log-dirty exclusion checks should subsequently perhaps also be
>> put under that lock then.
>>
>> This also fixes an unguarded d->arch.hvm access.
>>
>> Take the opportunity and also convert neighboring bool_t to bool in
>> struct hvm_domain.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
> 
> Leaving aside the logdirty change which is an automatic no from me in
> this form,

There's no log-dirty change here, that line is just patch context.
All the patch does is also include that case in the comment next to
the check.

>  I can't see how this change avoids regressing the status quo.
> 
> Qemu really does set this flag for option ROMs, which will typically be
> the iPXE image for net booting.  Currently, netbooting an HVM guest with
> device passed through to it does work, because it is part of XenServers
> basic testing.
> 
> It is entirely possible that this only "works" as long as DMA doesn't
> touch the option ROM, and while this might not be ideal, it seems to be
> less bad behaviour than breaking existing VMs using this configuration.

Hmm, yes, I have to admit I didn't consider this case, mis-remembering
that HVMMEM_ram_ro would be more "special" than it really is. I guess
I'll withdraw the patch then.

Jan
diff mbox series

Patch

--- a/xen/arch/x86/hvm/dm.c
+++ b/xen/arch/x86/hvm/dm.c
@@ -255,16 +255,33 @@  static int set_mem_type(struct domain *d
 
     mem_type = array_index_nospec(data->mem_type, ARRAY_SIZE(memtype));
 
-    if ( mem_type == HVMMEM_ioreq_server )
+    switch ( mem_type )
     {
         unsigned int flags;
 
+    case HVMMEM_ioreq_server:
         if ( !hap_enabled(d) )
             return -EOPNOTSUPP;
 
         /* Do not change to HVMMEM_ioreq_server if no ioreq server mapped. */
         if ( !p2m_get_ioreq_server(d, &flags) )
             return -EINVAL;
+
+        break;
+
+    case HVMMEM_ram_ro:
+        /* p2m_ram_ro can't be represented in IOMMU mappings. */
+        domain_lock(d);
+        if ( has_arch_pdevs(d) )
+            rc = -EXDEV;
+        else
+            d->arch.hvm.p2m_ram_ro_used = true;
+        domain_unlock(d);
+
+        if ( rc )
+            return rc;
+
+        break;
     }
 
     while ( iter < data->nr )
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1486,15 +1486,33 @@  static int assign_device(struct domain *
     if ( !is_iommu_enabled(d) )
         return 0;
 
-    /* Prevent device assign if mem paging or mem sharing have been 
-     * enabled for this domain */
-    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
-                  vm_event_check_ring(d->vm_event_paging) ||
+    domain_lock(d);
+
+    /*
+     * Prevent device assignment if any of
+     * - mem paging
+     * - mem sharing
+     * - the p2m_ram_ro type
+     * - global log-dirty mode
+     * are in use by this domain.
+     */
+    if ( unlikely(vm_event_check_ring(d->vm_event_paging) ||
+#ifdef CONFIG_HVM
+                  (is_hvm_domain(d) &&
+                   (d->arch.hvm.mem_sharing_enabled ||
+                    d->arch.hvm.p2m_ram_ro_used)) ||
+#endif
                   p2m_get_hostp2m(d)->global_logdirty) )
+    {
+        domain_unlock(d);
         return -EXDEV;
+    }
 
     if ( !pcidevs_trylock() )
+    {
+        domain_unlock(d);
         return -ERESTART;
+    }
 
     pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
     if ( !pdev )
@@ -1525,6 +1534,7 @@  static int assign_device(struct domain *
 
  done:
     pcidevs_unlock();
+    domain_unlock(d);
 
     return rc;
 }
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -156,9 +156,10 @@  struct hvm_domain {
 
     struct viridian_domain *viridian;
 
-    bool_t                 mem_sharing_enabled;
-    bool_t                 qemu_mapcache_invalidate;
-    bool_t                 is_s3_suspended;
+    bool                   mem_sharing_enabled;
+    bool                   p2m_ram_ro_used;
+    bool                   qemu_mapcache_invalidate;
+    bool                   is_s3_suspended;
 
     /*
      * TSC value that VCPUs use to calculate their tsc_offset value.