diff mbox series

x86/HVM: p2m_ram_ro is incompatible with device pass-through

Message ID 5CCAE2A5020000780022B35E@prv1-mh.provo.novell.com (mailing list archive)
State Superseded
Headers show
Series x86/HVM: p2m_ram_ro is incompatible with device pass-through | expand

Commit Message

Jan Beulich May 2, 2019, 12:29 p.m. UTC
The write-discard property of the type can't be represented in IOMMU
page table entries. Make sure the respective checks / tracking can't
race, by utilizing the domain lock. The other sides of the sharing/
paging/log-dirty exclusion checks should subsequently perhaps also be
put under that lock then.

Take the opportunity and also convert neighboring bool_t to bool in
struct hvm_domain.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

Comments

Paul Durrant May 2, 2019, 12:47 p.m. UTC | #1
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 02 May 2019 13:29
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; Wei Liu <wei.liu2@citrix.com>; George Dunlap <George.Dunlap@citrix.com>
> Subject: [PATCH] x86/HVM: p2m_ram_ro is incompatible with device pass-through
> 
> The write-discard property of the type can't be represented in IOMMU
> page table entries. Make sure the respective checks / tracking can't
> race, by utilizing the domain lock. The other sides of the sharing/
> paging/log-dirty exclusion checks should subsequently perhaps also be
> put under that lock then.
> 
> Take the opportunity and also convert neighboring bool_t to bool in
> struct hvm_domain.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/hvm/dm.c
> +++ b/xen/arch/x86/hvm/dm.c
> @@ -255,16 +255,32 @@ static int set_mem_type(struct domain *d
> 
>      mem_type = array_index_nospec(data->mem_type, ARRAY_SIZE(memtype));
> 
> -    if ( mem_type == HVMMEM_ioreq_server )
> +    switch ( mem_type )
>      {
>          unsigned int flags;
> 
> +    case HVMMEM_ioreq_server:
>          if ( !hap_enabled(d) )
>              return -EOPNOTSUPP;
> 
>          /* Do not change to HVMMEM_ioreq_server if no ioreq server mapped. */
>          if ( !p2m_get_ioreq_server(d, &flags) )
>              return -EINVAL;
> +
> +        break;
> +
> +    case HVMMEM_ram_ro:
> +        /* p2m_ram_ro can't be represented in IOMMU mappings. */
> +        domain_lock(d);
> +        if ( has_iommu_pt(d) )
> +            rc = -EXDEV;
> +        d->arch.hvm.p2m_ram_ro_used = true;

Do you really want to set this to true even if the op will fail?

  Paul

> +        domain_unlock(d);
> +
> +        if ( rc )
> +            return rc;
> +
> +        break;
>      }
> 
>      while ( iter < data->nr )
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -1450,17 +1450,36 @@ static int assign_device(struct domain *
>      if ( !iommu_enabled || !hd->platform_ops )
>          return 0;
> 
> -    /* Prevent device assign if mem paging or mem sharing have been
> -     * enabled for this domain */
> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
> -                  vm_event_check_ring(d->vm_event_paging) ||
> +    domain_lock(d);
> +
> +    /*
> +     * Prevent device assignment if any of
> +     * - mem paging
> +     * - mem sharing
> +     * - the p2m_ram_ro type
> +     * - global log-dirty mode
> +     * are in use by this domain.
> +     */
> +    if ( unlikely(vm_event_check_ring(d->vm_event_paging) ||
> +#ifdef CONFIG_HVM
> +                  (is_hvm_domain(d) &&
> +                   (d->arch.hvm.mem_sharing_enabled ||
> +                    d->arch.hvm.p2m_ram_ro_used)) ||
> +#endif
>                    p2m_get_hostp2m(d)->global_logdirty) )
> +    {
> +        domain_unlock(d);
>          return -EXDEV;
> +    }
> 
>      if ( !pcidevs_trylock() )
> +    {
> +        domain_unlock(d);
>          return -ERESTART;
> +    }
> 
>      rc = iommu_construct(d);
> +    domain_unlock(d);
>      if ( rc )
>      {
>          pcidevs_unlock();
> --- a/xen/include/asm-x86/hvm/domain.h
> +++ b/xen/include/asm-x86/hvm/domain.h
> @@ -156,10 +156,11 @@ struct hvm_domain {
> 
>      struct viridian_domain *viridian;
> 
> -    bool_t                 hap_enabled;
> -    bool_t                 mem_sharing_enabled;
> -    bool_t                 qemu_mapcache_invalidate;
> -    bool_t                 is_s3_suspended;
> +    bool                   hap_enabled;
> +    bool                   mem_sharing_enabled;
> +    bool                   p2m_ram_ro_used;
> +    bool                   qemu_mapcache_invalidate;
> +    bool                   is_s3_suspended;
> 
>      /*
>       * TSC value that VCPUs use to calculate their tsc_offset value.
> 
> 
>
Jan Beulich May 2, 2019, 12:56 p.m. UTC | #2
>>> On 02.05.19 at 14:47, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 02 May 2019 13:29
>> 
>> --- a/xen/arch/x86/hvm/dm.c
>> +++ b/xen/arch/x86/hvm/dm.c
>> @@ -255,16 +255,32 @@ static int set_mem_type(struct domain *d
>> 
>>      mem_type = array_index_nospec(data->mem_type, ARRAY_SIZE(memtype));
>> 
>> -    if ( mem_type == HVMMEM_ioreq_server )
>> +    switch ( mem_type )
>>      {
>>          unsigned int flags;
>> 
>> +    case HVMMEM_ioreq_server:
>>          if ( !hap_enabled(d) )
>>              return -EOPNOTSUPP;
>> 
>>          /* Do not change to HVMMEM_ioreq_server if no ioreq server mapped. */
>>          if ( !p2m_get_ioreq_server(d, &flags) )
>>              return -EINVAL;
>> +
>> +        break;
>> +
>> +    case HVMMEM_ram_ro:
>> +        /* p2m_ram_ro can't be represented in IOMMU mappings. */
>> +        domain_lock(d);
>> +        if ( has_iommu_pt(d) )
>> +            rc = -EXDEV;
>> +        d->arch.hvm.p2m_ram_ro_used = true;
> 
> Do you really want to set this to true even if the op will fail?

Oh, good point - there should be an "else" there.

Jan
Andrew Cooper May 2, 2019, 12:59 p.m. UTC | #3
On 02/05/2019 13:29, Jan Beulich wrote:
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -1450,17 +1450,36 @@ static int assign_device(struct domain *
>      if ( !iommu_enabled || !hd->platform_ops )
>          return 0;
>  
> -    /* Prevent device assign if mem paging or mem sharing have been 
> -     * enabled for this domain */
> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
> -                  vm_event_check_ring(d->vm_event_paging) ||
> +    domain_lock(d);
> +
> +    /*
> +     * Prevent device assignment if any of
> +     * - mem paging
> +     * - mem sharing
> +     * - the p2m_ram_ro type
> +     * - global log-dirty mode

XenServer has working live migration with GPUs, which this change would
regress.

Behind the scenes, we combine Xen's logdirty bitmap with one provided by
the GPU, to ensure that all dirty updates are tracked.

~Andrew
Jan Beulich May 2, 2019, 1:16 p.m. UTC | #4
>>> On 02.05.19 at 14:59, <andrew.cooper3@citrix.com> wrote:
> On 02/05/2019 13:29, Jan Beulich wrote:
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -1450,17 +1450,36 @@ static int assign_device(struct domain *
>>      if ( !iommu_enabled || !hd->platform_ops )
>>          return 0;
>>  
>> -    /* Prevent device assign if mem paging or mem sharing have been 
>> -     * enabled for this domain */
>> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
>> -                  vm_event_check_ring(d->vm_event_paging) ||
>> +    domain_lock(d);
>> +
>> +    /*
>> +     * Prevent device assignment if any of
>> +     * - mem paging
>> +     * - mem sharing
>> +     * - the p2m_ram_ro type
>> +     * - global log-dirty mode
> 
> XenServer has working live migration with GPUs, which this change would
> regress.
> 
> Behind the scenes, we combine Xen's logdirty bitmap with one provided by
> the GPU, to ensure that all dirty updates are tracked.

But this says nothing for the patch here. As long as it doesn't
work in the staging tree, it should be prevented. In XenServer
you could then patch out that check and comment line together
with whatever other changes you have to make for things to
work. Alternatively you could submit a patch (or more) to make
it work in staging too, at which point I'd have to re-base the
one here.

Jan
Andrew Cooper May 2, 2019, 1:42 p.m. UTC | #5
On 02/05/2019 14:16, Jan Beulich wrote:
>>>> On 02.05.19 at 14:59, <andrew.cooper3@citrix.com> wrote:
>> On 02/05/2019 13:29, Jan Beulich wrote:
>>> --- a/xen/drivers/passthrough/pci.c
>>> +++ b/xen/drivers/passthrough/pci.c
>>> @@ -1450,17 +1450,36 @@ static int assign_device(struct domain *
>>>      if ( !iommu_enabled || !hd->platform_ops )
>>>          return 0;
>>>  
>>> -    /* Prevent device assign if mem paging or mem sharing have been 
>>> -     * enabled for this domain */
>>> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
>>> -                  vm_event_check_ring(d->vm_event_paging) ||
>>> +    domain_lock(d);
>>> +
>>> +    /*
>>> +     * Prevent device assignment if any of
>>> +     * - mem paging
>>> +     * - mem sharing
>>> +     * - the p2m_ram_ro type
>>> +     * - global log-dirty mode
>> XenServer has working live migration with GPUs, which this change would
>> regress.
>>
>> Behind the scenes, we combine Xen's logdirty bitmap with one provided by
>> the GPU, to ensure that all dirty updates are tracked.
> But this says nothing for the patch here.

Yes it does.

There is nothing inherent about global log-dirty mode which is
incompatible with passthrough when the IOMMU pagetables are not shared
with EPT.

> As long as it doesn't work in the staging tree, it should be prevented.

... but it does work.

> In XenServer
> you could then patch out that check and comment line together
> with whatever other changes you have to make for things to
> work.

Everything is upstream in the various projects, other than the vendor's
closed source library and kernel driver.

~Andrew
Jan Beulich May 2, 2019, 1:56 p.m. UTC | #6
>>> On 02.05.19 at 15:42, <andrew.cooper3@citrix.com> wrote:
> On 02/05/2019 14:16, Jan Beulich wrote:
>>>>> On 02.05.19 at 14:59, <andrew.cooper3@citrix.com> wrote:
>>> On 02/05/2019 13:29, Jan Beulich wrote:
>>>> --- a/xen/drivers/passthrough/pci.c
>>>> +++ b/xen/drivers/passthrough/pci.c
>>>> @@ -1450,17 +1450,36 @@ static int assign_device(struct domain *
>>>>      if ( !iommu_enabled || !hd->platform_ops )
>>>>          return 0;
>>>>  
>>>> -    /* Prevent device assign if mem paging or mem sharing have been 
>>>> -     * enabled for this domain */
>>>> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
>>>> -                  vm_event_check_ring(d->vm_event_paging) ||
>>>> +    domain_lock(d);
>>>> +
>>>> +    /*
>>>> +     * Prevent device assignment if any of
>>>> +     * - mem paging
>>>> +     * - mem sharing
>>>> +     * - the p2m_ram_ro type
>>>> +     * - global log-dirty mode
>>> XenServer has working live migration with GPUs, which this change would
>>> regress.
>>>
>>> Behind the scenes, we combine Xen's logdirty bitmap with one provided by
>>> the GPU, to ensure that all dirty updates are tracked.
>> But this says nothing for the patch here.
> 
> Yes it does.

Well, okay, then the wording of your reply plus me just adding
a comment for a pre-existing check has misled me.

> There is nothing inherent about global log-dirty mode which is
> incompatible with passthrough when the IOMMU pagetables are not shared
> with EPT.
> 
>> As long as it doesn't work in the staging tree, it should be prevented.
> 
> ... but it does work.

Note how (as said above) the patch does _not_ add any
->global_logdirty check, it merely adds a comment enumerating
everything that's getting checked. I'm afraid I don't see how
adding a comment to state how things are can regress anything.

The only check the patch adds is that of the new
p2m_ram_ro_used flag.

Jan
Paul Durrant May 2, 2019, 2:12 p.m. UTC | #7
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 02 May 2019 14:57
> To: Andrew Cooper <Andrew.Cooper3@citrix.com>
> Cc: Paul Durrant <Paul.Durrant@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>; Wei Liu
> <wei.liu2@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; xen-devel <xen-
> devel@lists.xenproject.org>
> Subject: Re: [PATCH] x86/HVM: p2m_ram_ro is incompatible with device pass-through
> 
> >>> On 02.05.19 at 15:42, <andrew.cooper3@citrix.com> wrote:
> > On 02/05/2019 14:16, Jan Beulich wrote:
> >>>>> On 02.05.19 at 14:59, <andrew.cooper3@citrix.com> wrote:
> >>> On 02/05/2019 13:29, Jan Beulich wrote:
> >>>> --- a/xen/drivers/passthrough/pci.c
> >>>> +++ b/xen/drivers/passthrough/pci.c
> >>>> @@ -1450,17 +1450,36 @@ static int assign_device(struct domain *
> >>>>      if ( !iommu_enabled || !hd->platform_ops )
> >>>>          return 0;
> >>>>
> >>>> -    /* Prevent device assign if mem paging or mem sharing have been
> >>>> -     * enabled for this domain */
> >>>> -    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
> >>>> -                  vm_event_check_ring(d->vm_event_paging) ||
> >>>> +    domain_lock(d);
> >>>> +
> >>>> +    /*
> >>>> +     * Prevent device assignment if any of
> >>>> +     * - mem paging
> >>>> +     * - mem sharing
> >>>> +     * - the p2m_ram_ro type
> >>>> +     * - global log-dirty mode
> >>> XenServer has working live migration with GPUs, which this change would
> >>> regress.
> >>>
> >>> Behind the scenes, we combine Xen's logdirty bitmap with one provided by
> >>> the GPU, to ensure that all dirty updates are tracked.
> >> But this says nothing for the patch here.
> >
> > Yes it does.
> 
> Well, okay, then the wording of your reply plus me just adding
> a comment for a pre-existing check has mislead me.
> 
> > There is nothing inherent about global log-dirty mode which is
> > incompatible with passthrough when the IOMMU pagetables are not shared
> > with EPT.
> >
> >> As long as it doesn't work in the staging tree, it should be prevented.
> >
> > ... but it does work.
> 
> Note how (as said above) the patch does _not_ add any
> ->global_logdirty check, it merely adds a comment enumerating
> everything that's getting checked. I'm afraid I don't see how
> adding a comment to state how things are can regress anything.
> 
> The only check the patch adds is that of the new
> p2m_ram_ro_used flag.
> 

Actually, since global_logdirty is somewhat transient should we not also have a check to prevent p2m_ram_logdirty from being set for a domain with assigned devices and shared EPT?

  Paul

> Jan
>
Jan Beulich May 2, 2019, 2:24 p.m. UTC | #8
>>> On 02.05.19 at 16:12, <Paul.Durrant@citrix.com> wrote:
> Actually, since global_logdirty is somewhat transient should we not also 
> have a check to prevent p2m_ram_logdirty from being set for a domain with 
> assigned devices and shared EPT?

Probably (if we indeed don't), but imo not in this patch.

Jan
Paul Durrant May 2, 2019, 2:25 p.m. UTC | #9
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 02 May 2019 15:25
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Roger Pau
> Monne <roger.pau@citrix.com>; Wei Liu <wei.liu2@citrix.com>; xen-devel <xen-
> devel@lists.xenproject.org>
> Subject: RE: [PATCH] x86/HVM: p2m_ram_ro is incompatible with device pass-through
> 
> >>> On 02.05.19 at 16:12, <Paul.Durrant@citrix.com> wrote:
> > Actually, since global_logdirty is somewhat transient should we not also
> > have a check to prevent p2m_ram_logdirty from being set for a domain with
> > assigned devices and shared EPT?
> 
> Probably (if we indeed don't), but imo not in this patch.
> 

Yes, fair enough.

  Paul

> Jan
>
diff mbox series

Patch

--- a/xen/arch/x86/hvm/dm.c
+++ b/xen/arch/x86/hvm/dm.c
@@ -255,16 +255,32 @@  static int set_mem_type(struct domain *d
 
     mem_type = array_index_nospec(data->mem_type, ARRAY_SIZE(memtype));
 
-    if ( mem_type == HVMMEM_ioreq_server )
+    switch ( mem_type )
     {
         unsigned int flags;
 
+    case HVMMEM_ioreq_server:
         if ( !hap_enabled(d) )
             return -EOPNOTSUPP;
 
         /* Do not change to HVMMEM_ioreq_server if no ioreq server mapped. */
         if ( !p2m_get_ioreq_server(d, &flags) )
             return -EINVAL;
+
+        break;
+
+    case HVMMEM_ram_ro:
+        /* p2m_ram_ro can't be represented in IOMMU mappings. */
+        domain_lock(d);
+        if ( has_iommu_pt(d) )
+            rc = -EXDEV;
+        else
+            d->arch.hvm.p2m_ram_ro_used = true;
+        domain_unlock(d);
+
+        if ( rc )
+            return rc;
+
+        break;
     }
 
     while ( iter < data->nr )
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1450,17 +1450,36 @@  static int assign_device(struct domain *
     if ( !iommu_enabled || !hd->platform_ops )
         return 0;
 
-    /* Prevent device assign if mem paging or mem sharing have been 
-     * enabled for this domain */
-    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
-                  vm_event_check_ring(d->vm_event_paging) ||
+    domain_lock(d);
+
+    /*
+     * Prevent device assignment if any of
+     * - mem paging
+     * - mem sharing
+     * - the p2m_ram_ro type
+     * - global log-dirty mode
+     * are in use by this domain.
+     */
+    if ( unlikely(vm_event_check_ring(d->vm_event_paging) ||
+#ifdef CONFIG_HVM
+                  (is_hvm_domain(d) &&
+                   (d->arch.hvm.mem_sharing_enabled ||
+                    d->arch.hvm.p2m_ram_ro_used)) ||
+#endif
                   p2m_get_hostp2m(d)->global_logdirty) )
+    {
+        domain_unlock(d);
         return -EXDEV;
+    }
 
     if ( !pcidevs_trylock() )
+    {
+        domain_unlock(d);
         return -ERESTART;
+    }
 
     rc = iommu_construct(d);
+    domain_unlock(d);
     if ( rc )
     {
         pcidevs_unlock();
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -156,10 +156,11 @@  struct hvm_domain {
 
     struct viridian_domain *viridian;
 
-    bool_t                 hap_enabled;
-    bool_t                 mem_sharing_enabled;
-    bool_t                 qemu_mapcache_invalidate;
-    bool_t                 is_s3_suspended;
+    bool                   hap_enabled;
+    bool                   mem_sharing_enabled;
+    bool                   p2m_ram_ro_used;
+    bool                   qemu_mapcache_invalidate;
+    bool                   is_s3_suspended;
 
     /*
      * TSC value that VCPUs use to calculate their tsc_offset value.