x86/PV: fix unintended dependency of m2p-strict mode on migration-v2
diff mbox

Message ID 5694DEAA02000078000C5D7A@prv-mh.provo.novell.com
State New, archived
Headers show

Commit Message

Jan Beulich Jan. 12, 2016, 10:08 a.m. UTC
This went unnoticed until a backport of this to an older Xen got used,
causing migration of guests enabling this VM assist to fail, because
page table pinning there preceeds vCPU context loading, and hence L4
tables get initialized for the wrong mode. Fix this by post-processing
L4 tables when setting the intended VM assist flags for the guest.

Note that this leaves in place a dependency on vCPU 0 getting its guest
context restored first, but afaict the logic here is not the only thing
depending on that.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
x86/PV: fix unintended dependency of m2p-strict mode on migration-v2

This went unnoticed until a backport of this to an older Xen got used,
causing migration of guests enabling this VM assist to fail, because
page table pinning there preceeds vCPU context loading, and hence L4
tables get initialized for the wrong mode. Fix this by post-processing
L4 tables when setting the intended VM assist flags for the guest.

Note that this leaves in place a dependency on vCPU 0 getting its guest
context restored first, but afaict the logic here is not the only thing
depending on that.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1067,8 +1067,48 @@ int arch_set_info_guest(
         goto out;
 
     if ( v->vcpu_id == 0 )
+    {
         d->vm_assist = c(vm_assist);
 
+        /*
+         * In the restore case we need to deal with L4 pages which got
+         * initialized with m2p_strict still clear (and which hence lack the
+         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
+         */
+        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
+             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
+             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
+        {
+            bool_t done = 0;
+
+            spin_lock_recursive(&d->page_alloc_lock);
+
+            for ( i = 0; ; )
+            {
+                struct page_info *page = page_list_remove_head(&d->page_list);
+
+                if ( page_lock(page) )
+                {
+                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
+                         PGT_l4_page_table )
+                        done = !fill_ro_mpt(page_to_mfn(page));
+
+                    page_unlock(page);
+                }
+
+                page_list_add_tail(page, &d->page_list);
+
+                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
+                    break;
+            }
+
+            spin_unlock_recursive(&d->page_alloc_lock);
+
+            if ( !done )
+                return -ERESTART;
+        }
+    }
+
     rc = put_old_guest_table(current);
     if ( rc )
         return rc;
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1463,13 +1463,20 @@ void init_guest_l4_table(l4_pgentry_t l4
         l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
 }
 
-void fill_ro_mpt(unsigned long mfn)
+bool_t fill_ro_mpt(unsigned long mfn)
 {
     l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
+    bool_t ret = 0;
 
-    l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
-        idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+    if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
+    {
+        l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
+            idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+        ret = 1;
+    }
     unmap_domain_page(l4tab);
+
+    return ret;
 }
 
 void zap_ro_mpt(unsigned long mfn)
@@ -1527,10 +1534,15 @@ static int alloc_l4_table(struct page_in
         adjust_guest_l4e(pl4e[i], d);
     }
 
-    init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
+    if ( rc >= 0 )
+    {
+        init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
+        atomic_inc(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
     unmap_domain_page(pl4e);
 
-    return rc > 0 ? 0 : rc;
+    return rc;
 }
 
 static void free_l1_table(struct page_info *page)
@@ -1648,7 +1660,13 @@ static int free_l4_table(struct page_inf
 
     unmap_domain_page(pl4e);
 
-    return rc > 0 ? 0 : rc;
+    if ( rc >= 0 )
+    {
+        atomic_dec(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
+
+    return rc;
 }
 
 int page_lock(struct page_info *page)
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -248,6 +248,8 @@ struct pv_domain
 {
     l1_pgentry_t **gdt_ldt_l1tab;
 
+    atomic_t nr_l4_pages;
+
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
 };
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -322,7 +322,7 @@ int free_page_type(struct page_info *pag
 
 void init_guest_l4_table(l4_pgentry_t[], const struct domain *,
                          bool_t zap_ro_mpt);
-void fill_ro_mpt(unsigned long mfn);
+bool_t fill_ro_mpt(unsigned long mfn);
 void zap_ro_mpt(unsigned long mfn);
 
 int is_iomem_page(unsigned long mfn);

Comments

Andrew Cooper Jan. 12, 2016, 11:55 a.m. UTC | #1
On 12/01/16 10:08, Jan Beulich wrote:
> This went unnoticed until a backport of this to an older Xen got used,
> causing migration of guests enabling this VM assist to fail, because
> page table pinning there preceeds vCPU context loading, and hence L4
> tables get initialized for the wrong mode. Fix this by post-processing
> L4 tables when setting the intended VM assist flags for the guest.
>
> Note that this leaves in place a dependency on vCPU 0 getting its guest
> context restored first, but afaict the logic here is not the only thing
> depending on that.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>          goto out;
>  
>      if ( v->vcpu_id == 0 )
> +    {
>          d->vm_assist = c(vm_assist);
>  
> +        /*
> +         * In the restore case we need to deal with L4 pages which got
> +         * initialized with m2p_strict still clear (and which hence lack the
> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
> +         */
> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
> +        {
> +            bool_t done = 0;
> +
> +            spin_lock_recursive(&d->page_alloc_lock);
> +
> +            for ( i = 0; ; )
> +            {
> +                struct page_info *page = page_list_remove_head(&d->page_list);
> +
> +                if ( page_lock(page) )
> +                {
> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
> +                         PGT_l4_page_table )
> +                        done = !fill_ro_mpt(page_to_mfn(page));
> +
> +                    page_unlock(page);
> +                }
> +
> +                page_list_add_tail(page, &d->page_list);
> +
> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
> +                    break;
> +            }
> +
> +            spin_unlock_recursive(&d->page_alloc_lock);
> +
> +            if ( !done )
> +                return -ERESTART;

This is a long loop.  It is preemptible, but will incur a time delay
proportional to the size of the domain during the VM downtime. 

Could you defer the loop until after %cr3 has set been set up, and only
enter the loop if the kernel l4 table is missing the RO mappings?  That
way, domains migrated with migration v2 will skip the loop entirely.

> +        }
> +    }
> +
>      rc = put_old_guest_table(current);
>      if ( rc )
>          return rc;
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -1463,13 +1463,20 @@ void init_guest_l4_table(l4_pgentry_t l4
>          l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
>  }
>  
> -void fill_ro_mpt(unsigned long mfn)
> +bool_t fill_ro_mpt(unsigned long mfn)
>  {
>      l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
> +    bool_t ret = 0;
>  
> -    l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
> -        idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
> +    if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
> +    {
> +        l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
> +            idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
> +        ret = 1;

This is a behavioural change.  Previously, the old value was clobbered.

It appears that you are now using this return value to indicate when the
entire pagelist has been walked, but it it relies on the slots being
zero, which is a fragile assumption.

~Andrew

> +    }
>      unmap_domain_page(l4tab);
> +
> +    return ret;
>  }
>  
>  void zap_ro_mpt(unsigned long mfn)
> @@ -1527,10 +1534,15 @@ static int alloc_l4_table(struct page_in
>          adjust_guest_l4e(pl4e[i], d);
>      }
>  
> -    init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
> +    if ( rc >= 0 )
> +    {
> +        init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
> +        atomic_inc(&d->arch.pv_domain.nr_l4_pages);
> +        rc = 0;
> +    }
>      unmap_domain_page(pl4e);
>  
> -    return rc > 0 ? 0 : rc;
> +    return rc;
>  }
>  
>  static void free_l1_table(struct page_info *page)
> @@ -1648,7 +1660,13 @@ static int free_l4_table(struct page_inf
>  
>      unmap_domain_page(pl4e);
>  
> -    return rc > 0 ? 0 : rc;
> +    if ( rc >= 0 )
> +    {
> +        atomic_dec(&d->arch.pv_domain.nr_l4_pages);
> +        rc = 0;
> +    }
> +
> +    return rc;
>  }
>  
>  int page_lock(struct page_info *page)
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -248,6 +248,8 @@ struct pv_domain
>  {
>      l1_pgentry_t **gdt_ldt_l1tab;
>  
> +    atomic_t nr_l4_pages;
> +
>      /* map_domain_page() mapping cache. */
>      struct mapcache_domain mapcache;
>  };
> --- a/xen/include/asm-x86/mm.h
> +++ b/xen/include/asm-x86/mm.h
> @@ -322,7 +322,7 @@ int free_page_type(struct page_info *pag
>  
>  void init_guest_l4_table(l4_pgentry_t[], const struct domain *,
>                           bool_t zap_ro_mpt);
> -void fill_ro_mpt(unsigned long mfn);
> +bool_t fill_ro_mpt(unsigned long mfn);
>  void zap_ro_mpt(unsigned long mfn);
>  
>  int is_iomem_page(unsigned long mfn);
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
Jan Beulich Jan. 12, 2016, 3:19 p.m. UTC | #2
>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote:
> On 12/01/16 10:08, Jan Beulich wrote:
>> This went unnoticed until a backport of this to an older Xen got used,
>> causing migration of guests enabling this VM assist to fail, because
>> page table pinning there preceeds vCPU context loading, and hence L4
>> tables get initialized for the wrong mode. Fix this by post-processing
>> L4 tables when setting the intended VM assist flags for the guest.
>>
>> Note that this leaves in place a dependency on vCPU 0 getting its guest
>> context restored first, but afaict the logic here is not the only thing
>> depending on that.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>>          goto out;
>>  
>>      if ( v->vcpu_id == 0 )
>> +    {
>>          d->vm_assist = c(vm_assist);
>>  
>> +        /*
>> +         * In the restore case we need to deal with L4 pages which got
>> +         * initialized with m2p_strict still clear (and which hence lack the
>> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
>> +         */
>> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
>> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
>> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
>> +        {
>> +            bool_t done = 0;
>> +
>> +            spin_lock_recursive(&d->page_alloc_lock);
>> +
>> +            for ( i = 0; ; )
>> +            {
>> +                struct page_info *page = page_list_remove_head(&d->page_list);
>> +
>> +                if ( page_lock(page) )
>> +                {
>> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
>> +                         PGT_l4_page_table )
>> +                        done = !fill_ro_mpt(page_to_mfn(page));
>> +
>> +                    page_unlock(page);
>> +                }
>> +
>> +                page_list_add_tail(page, &d->page_list);
>> +
>> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
>> +                    break;
>> +            }
>> +
>> +            spin_unlock_recursive(&d->page_alloc_lock);
>> +
>> +            if ( !done )
>> +                return -ERESTART;
> 
> This is a long loop.  It is preemptible, but will incur a time delay
> proportional to the size of the domain during the VM downtime. 
> 
> Could you defer the loop until after %cr3 has set been set up, and only
> enter the loop if the kernel l4 table is missing the RO mappings?  That
> way, domains migrated with migration v2 will skip the loop entirely.

Well, first of all this would be the result only as long as you or
someone else don't re-think and possibly move pinning ahead of
context load again.

Deferring until after CR3 got set up is - afaict - not an option, as
it would defeat the purpose of m2p-strict mode as much as doing
the fixup e.g. in the #PF handler. This mode enabled needs to
strictly mean "L4s start with the slot filled, and user-mode uses
clear it", as documented.

There's a much simpler way we could avoid the loop being
entered: Check the previous setting of the flag. However, I
intentionally did not go that route in this initial version as I
didn't want to add more special casing than needed, plus to
make sure the new code isn't effectively dead.

>> --- a/xen/arch/x86/mm.c
>> +++ b/xen/arch/x86/mm.c
>> @@ -1463,13 +1463,20 @@ void init_guest_l4_table(l4_pgentry_t l4
>>          l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
>>  }
>>  
>> -void fill_ro_mpt(unsigned long mfn)
>> +bool_t fill_ro_mpt(unsigned long mfn)
>>  {
>>      l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
>> +    bool_t ret = 0;
>>  
>> -    l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
>> -        idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
>> +    if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
>> +    {
>> +        l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
>> +            idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
>> +        ret = 1;
> 
> This is a behavioural change.  Previously, the old value was clobbered.
> 
> It appears that you are now using this return value to indicate when the
> entire pagelist has been walked, but it it relies on the slots being
> zero, which is a fragile assumption.

There are only two values possible in this slot - zero or the one
referring to the _shared across domains_ sub-tree for the r/o
MPT. I.e. the change of behavior is only an apparent one, and
I don't see this being fragile either.

Jan
Andrew Cooper Jan. 13, 2016, 3:25 p.m. UTC | #3
On 12/01/16 15:19, Jan Beulich wrote:
>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote:
>> On 12/01/16 10:08, Jan Beulich wrote:
>>> This went unnoticed until a backport of this to an older Xen got used,
>>> causing migration of guests enabling this VM assist to fail, because
>>> page table pinning there preceeds vCPU context loading, and hence L4
>>> tables get initialized for the wrong mode. Fix this by post-processing
>>> L4 tables when setting the intended VM assist flags for the guest.
>>>
>>> Note that this leaves in place a dependency on vCPU 0 getting its guest
>>> context restored first, but afaict the logic here is not the only thing
>>> depending on that.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>
>>> --- a/xen/arch/x86/domain.c
>>> +++ b/xen/arch/x86/domain.c
>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>>>          goto out;
>>>  
>>>      if ( v->vcpu_id == 0 )
>>> +    {
>>>          d->vm_assist = c(vm_assist);
>>>  
>>> +        /*
>>> +         * In the restore case we need to deal with L4 pages which got
>>> +         * initialized with m2p_strict still clear (and which hence lack the
>>> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
>>> +         */
>>> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
>>> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
>>> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
>>> +        {
>>> +            bool_t done = 0;
>>> +
>>> +            spin_lock_recursive(&d->page_alloc_lock);
>>> +
>>> +            for ( i = 0; ; )
>>> +            {
>>> +                struct page_info *page = page_list_remove_head(&d->page_list);
>>> +
>>> +                if ( page_lock(page) )
>>> +                {
>>> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
>>> +                         PGT_l4_page_table )
>>> +                        done = !fill_ro_mpt(page_to_mfn(page));
>>> +
>>> +                    page_unlock(page);
>>> +                }
>>> +
>>> +                page_list_add_tail(page, &d->page_list);
>>> +
>>> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
>>> +                    break;
>>> +            }
>>> +
>>> +            spin_unlock_recursive(&d->page_alloc_lock);
>>> +
>>> +            if ( !done )
>>> +                return -ERESTART;
>> This is a long loop.  It is preemptible, but will incur a time delay
>> proportional to the size of the domain during the VM downtime. 
>>
>> Could you defer the loop until after %cr3 has set been set up, and only
>> enter the loop if the kernel l4 table is missing the RO mappings?  That
>> way, domains migrated with migration v2 will skip the loop entirely.
> Well, first of all this would be the result only as long as you or
> someone else don't re-think and possibly move pinning ahead of
> context load again.

A second set_context() will unconditionally hit the loop though.

>
> Deferring until after CR3 got set up is - afaict - not an option, as
> it would defeat the purpose of m2p-strict mode as much as doing
> the fixup e.g. in the #PF handler. This mode enabled needs to
> strictly mean "L4s start with the slot filled, and user-mode uses
> clear it", as documented.

I just meant a little further down this function, i.e. gate the loop on
whether v->arch.guest_table has inappropriate m2p strictness.

~Andrew
Jan Beulich Jan. 13, 2016, 3:36 p.m. UTC | #4
>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote:
> On 12/01/16 15:19, Jan Beulich wrote:
>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote:
>>> On 12/01/16 10:08, Jan Beulich wrote:
>>>> This went unnoticed until a backport of this to an older Xen got used,
>>>> causing migration of guests enabling this VM assist to fail, because
>>>> page table pinning there preceeds vCPU context loading, and hence L4
>>>> tables get initialized for the wrong mode. Fix this by post-processing
>>>> L4 tables when setting the intended VM assist flags for the guest.
>>>>
>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest
>>>> context restored first, but afaict the logic here is not the only thing
>>>> depending on that.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>>
>>>> --- a/xen/arch/x86/domain.c
>>>> +++ b/xen/arch/x86/domain.c
>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>>>>          goto out;
>>>>  
>>>>      if ( v->vcpu_id == 0 )
>>>> +    {
>>>>          d->vm_assist = c(vm_assist);
>>>>  
>>>> +        /*
>>>> +         * In the restore case we need to deal with L4 pages which got
>>>> +         * initialized with m2p_strict still clear (and which hence lack 
> the
>>>> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
>>>> +         */
>>>> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
>>>> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
>>>> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
>>>> +        {
>>>> +            bool_t done = 0;
>>>> +
>>>> +            spin_lock_recursive(&d->page_alloc_lock);
>>>> +
>>>> +            for ( i = 0; ; )
>>>> +            {
>>>> +                struct page_info *page = page_list_remove_head(&d->page_list);
>>>> +
>>>> +                if ( page_lock(page) )
>>>> +                {
>>>> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
>>>> +                         PGT_l4_page_table )
>>>> +                        done = !fill_ro_mpt(page_to_mfn(page));
>>>> +
>>>> +                    page_unlock(page);
>>>> +                }
>>>> +
>>>> +                page_list_add_tail(page, &d->page_list);
>>>> +
>>>> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
>>>> +                    break;
>>>> +            }
>>>> +
>>>> +            spin_unlock_recursive(&d->page_alloc_lock);
>>>> +
>>>> +            if ( !done )
>>>> +                return -ERESTART;
>>> This is a long loop.  It is preemptible, but will incur a time delay
>>> proportional to the size of the domain during the VM downtime. 
>>>
>>> Could you defer the loop until after %cr3 has set been set up, and only
>>> enter the loop if the kernel l4 table is missing the RO mappings?  That
>>> way, domains migrated with migration v2 will skip the loop entirely.
>> Well, first of all this would be the result only as long as you or
>> someone else don't re-think and possibly move pinning ahead of
>> context load again.
> 
> A second set_context() will unconditionally hit the loop though.

Right - another argument against making any change to what is
in the patch right now.

>> Deferring until after CR3 got set up is - afaict - not an option, as
>> it would defeat the purpose of m2p-strict mode as much as doing
>> the fixup e.g. in the #PF handler. This mode enabled needs to
>> strictly mean "L4s start with the slot filled, and user-mode uses
>> clear it", as documented.
> 
> I just meant a little further down this function, i.e. gate the loop on
> whether v->arch.guest_table has inappropriate m2p strictness.

I don't follow: Are you proposing to look at v->arch.guest_table's
respective L4 entry? If so - what condition and what action are
you thinking of? If not - what else?

Jan
Andrew Cooper Jan. 13, 2016, 4 p.m. UTC | #5
On 13/01/16 15:36, Jan Beulich wrote:
>>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote:
>> On 12/01/16 15:19, Jan Beulich wrote:
>>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote:
>>>> On 12/01/16 10:08, Jan Beulich wrote:
>>>>> This went unnoticed until a backport of this to an older Xen got used,
>>>>> causing migration of guests enabling this VM assist to fail, because
>>>>> page table pinning there preceeds vCPU context loading, and hence L4
>>>>> tables get initialized for the wrong mode. Fix this by post-processing
>>>>> L4 tables when setting the intended VM assist flags for the guest.
>>>>>
>>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest
>>>>> context restored first, but afaict the logic here is not the only thing
>>>>> depending on that.
>>>>>
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>>>
>>>>> --- a/xen/arch/x86/domain.c
>>>>> +++ b/xen/arch/x86/domain.c
>>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>>>>>          goto out;
>>>>>  
>>>>>      if ( v->vcpu_id == 0 )
>>>>> +    {
>>>>>          d->vm_assist = c(vm_assist);
>>>>>  
>>>>> +        /*
>>>>> +         * In the restore case we need to deal with L4 pages which got
>>>>> +         * initialized with m2p_strict still clear (and which hence lack 
>> the
>>>>> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
>>>>> +         */
>>>>> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
>>>>> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
>>>>> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
>>>>> +        {
>>>>> +            bool_t done = 0;
>>>>> +
>>>>> +            spin_lock_recursive(&d->page_alloc_lock);
>>>>> +
>>>>> +            for ( i = 0; ; )
>>>>> +            {
>>>>> +                struct page_info *page = page_list_remove_head(&d->page_list);
>>>>> +
>>>>> +                if ( page_lock(page) )
>>>>> +                {
>>>>> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
>>>>> +                         PGT_l4_page_table )
>>>>> +                        done = !fill_ro_mpt(page_to_mfn(page));
>>>>> +
>>>>> +                    page_unlock(page);
>>>>> +                }
>>>>> +
>>>>> +                page_list_add_tail(page, &d->page_list);
>>>>> +
>>>>> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
>>>>> +                    break;
>>>>> +            }
>>>>> +
>>>>> +            spin_unlock_recursive(&d->page_alloc_lock);
>>>>> +
>>>>> +            if ( !done )
>>>>> +                return -ERESTART;
>>>> This is a long loop.  It is preemptible, but will incur a time delay
>>>> proportional to the size of the domain during the VM downtime. 
>>>>
>>>> Could you defer the loop until after %cr3 has set been set up, and only
>>>> enter the loop if the kernel l4 table is missing the RO mappings?  That
>>>> way, domains migrated with migration v2 will skip the loop entirely.
>>> Well, first of all this would be the result only as long as you or
>>> someone else don't re-think and possibly move pinning ahead of
>>> context load again.
>> A second set_context() will unconditionally hit the loop though.
> Right - another argument against making any change to what is
> in the patch right now.

If there are any L4 pages, the current code will unconditionally search
the pagelist on every entry to the function, even when it has already
fixed up the strictness.

A toolstack can enter this functions multiple times for the same vcpu,
by resetting the vcpu state inbetween.  How much do we care about this
usage?

~Andrew
Jan Beulich Jan. 13, 2016, 4:15 p.m. UTC | #6
>>> On 13.01.16 at 17:00, <andrew.cooper3@citrix.com> wrote:
> On 13/01/16 15:36, Jan Beulich wrote:
>>>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote:
>>> On 12/01/16 15:19, Jan Beulich wrote:
>>>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote:
>>>>> On 12/01/16 10:08, Jan Beulich wrote:
>>>>>> This went unnoticed until a backport of this to an older Xen got used,
>>>>>> causing migration of guests enabling this VM assist to fail, because
>>>>>> page table pinning there preceeds vCPU context loading, and hence L4
>>>>>> tables get initialized for the wrong mode. Fix this by post-processing
>>>>>> L4 tables when setting the intended VM assist flags for the guest.
>>>>>>
>>>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest
>>>>>> context restored first, but afaict the logic here is not the only thing
>>>>>> depending on that.
>>>>>>
>>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>>>>
>>>>>> --- a/xen/arch/x86/domain.c
>>>>>> +++ b/xen/arch/x86/domain.c
>>>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>>>>>>          goto out;
>>>>>>  
>>>>>>      if ( v->vcpu_id == 0 )
>>>>>> +    {
>>>>>>          d->vm_assist = c(vm_assist);
>>>>>>  
>>>>>> +        /*
>>>>>> +         * In the restore case we need to deal with L4 pages which got
>>>>>> +         * initialized with m2p_strict still clear (and which hence lack 
>>> the
>>>>>> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
>>>>>> +         */
>>>>>> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
>>>>>> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
>>>>>> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
>>>>>> +        {
>>>>>> +            bool_t done = 0;
>>>>>> +
>>>>>> +            spin_lock_recursive(&d->page_alloc_lock);
>>>>>> +
>>>>>> +            for ( i = 0; ; )
>>>>>> +            {
>>>>>> +                struct page_info *page = page_list_remove_head(&d->page_list);
>>>>>> +
>>>>>> +                if ( page_lock(page) )
>>>>>> +                {
>>>>>> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
>>>>>> +                         PGT_l4_page_table )
>>>>>> +                        done = !fill_ro_mpt(page_to_mfn(page));
>>>>>> +
>>>>>> +                    page_unlock(page);
>>>>>> +                }
>>>>>> +
>>>>>> +                page_list_add_tail(page, &d->page_list);
>>>>>> +
>>>>>> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
>>>>>> +                    break;
>>>>>> +            }
>>>>>> +
>>>>>> +            spin_unlock_recursive(&d->page_alloc_lock);
>>>>>> +
>>>>>> +            if ( !done )
>>>>>> +                return -ERESTART;
>>>>> This is a long loop.  It is preemptible, but will incur a time delay
>>>>> proportional to the size of the domain during the VM downtime. 
>>>>>
>>>>> Could you defer the loop until after %cr3 has set been set up, and only
>>>>> enter the loop if the kernel l4 table is missing the RO mappings?  That
>>>>> way, domains migrated with migration v2 will skip the loop entirely.
>>>> Well, first of all this would be the result only as long as you or
>>>> someone else don't re-think and possibly move pinning ahead of
>>>> context load again.
>>> A second set_context() will unconditionally hit the loop though.
>> Right - another argument against making any change to what is
>> in the patch right now.
> 
> If there are any L4 pages, the current code will unconditionally search
> the pagelist on every entry to the function, even when it has already
> fixed up the strictness.
> 
> A toolstack can enter this functions multiple times for the same vcpu,
> by resetting the vcpu state inbetween.  How much do we care about this
> usage?

If we cared at all, we'd need to insert another similar piece of
code in the reset path (moving L4s back to m2p-relaxed mode).

Jan
Jan Beulich Feb. 1, 2016, 1:20 p.m. UTC | #7
Ping? (I'd really like to get this resolved, so we don't need to
indefinitely run with non-upstream behavior in our distros.)

Thanks, Jan

>>> On 13.01.16 at 17:15, <JBeulich@suse.com> wrote:
>>>> On 13.01.16 at 17:00, <andrew.cooper3@citrix.com> wrote:
>> On 13/01/16 15:36, Jan Beulich wrote:
>>>>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote:
>>>> On 12/01/16 15:19, Jan Beulich wrote:
>>>>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote:
>>>>>> On 12/01/16 10:08, Jan Beulich wrote:
>>>>>>> This went unnoticed until a backport of this to an older Xen got used,
>>>>>>> causing migration of guests enabling this VM assist to fail, because
>>>>>>> page table pinning there preceeds vCPU context loading, and hence L4
>>>>>>> tables get initialized for the wrong mode. Fix this by post-processing
>>>>>>> L4 tables when setting the intended VM assist flags for the guest.
>>>>>>>
>>>>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest
>>>>>>> context restored first, but afaict the logic here is not the only thing
>>>>>>> depending on that.
>>>>>>>
>>>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>>>>>
>>>>>>> --- a/xen/arch/x86/domain.c
>>>>>>> +++ b/xen/arch/x86/domain.c
>>>>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>>>>>>>          goto out;
>>>>>>>  
>>>>>>>      if ( v->vcpu_id == 0 )
>>>>>>> +    {
>>>>>>>          d->vm_assist = c(vm_assist);
>>>>>>>  
>>>>>>> +        /*
>>>>>>> +         * In the restore case we need to deal with L4 pages which got
>>>>>>> +         * initialized with m2p_strict still clear (and which hence lack 
>>>> the
>>>>>>> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
>>>>>>> +         */
>>>>>>> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
>>>>>>> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
>>>>>>> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
>>>>>>> +        {
>>>>>>> +            bool_t done = 0;
>>>>>>> +
>>>>>>> +            spin_lock_recursive(&d->page_alloc_lock);
>>>>>>> +
>>>>>>> +            for ( i = 0; ; )
>>>>>>> +            {
>>>>>>> +                struct page_info *page = page_list_remove_head(&d->page_list);
>>>>>>> +
>>>>>>> +                if ( page_lock(page) )
>>>>>>> +                {
>>>>>>> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
>>>>>>> +                         PGT_l4_page_table )
>>>>>>> +                        done = !fill_ro_mpt(page_to_mfn(page));
>>>>>>> +
>>>>>>> +                    page_unlock(page);
>>>>>>> +                }
>>>>>>> +
>>>>>>> +                page_list_add_tail(page, &d->page_list);
>>>>>>> +
>>>>>>> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
>>>>>>> +                    break;
>>>>>>> +            }
>>>>>>> +
>>>>>>> +            spin_unlock_recursive(&d->page_alloc_lock);
>>>>>>> +
>>>>>>> +            if ( !done )
>>>>>>> +                return -ERESTART;
>>>>>> This is a long loop.  It is preemptible, but will incur a time delay
>>>>>> proportional to the size of the domain during the VM downtime. 
>>>>>>
>>>>>> Could you defer the loop until after %cr3 has set been set up, and only
>>>>>> enter the loop if the kernel l4 table is missing the RO mappings?  That
>>>>>> way, domains migrated with migration v2 will skip the loop entirely.
>>>>> Well, first of all this would be the result only as long as you or
>>>>> someone else don't re-think and possibly move pinning ahead of
>>>>> context load again.
>>>> A second set_context() will unconditionally hit the loop though.
>>> Right - another argument against making any change to what is
>>> in the patch right now.
>> 
>> If there are any L4 pages, the current code will unconditionally search
>> the pagelist on every entry to the function, even when it has already
>> fixed up the strictness.
>> 
>> A toolstack can enter this functions multiple times for the same vcpu,
>> by resetting the vcpu state inbetween.  How much do we care about this
>> usage?
> 
> If we cared at all, we'd need to insert another similar piece of
> code in the reset path (moving L4s back to m2p-relaxed mode).
> 
> Jan
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org 
> http://lists.xen.org/xen-devel
Andrew Cooper Feb. 1, 2016, 2:07 p.m. UTC | #8
On 01/02/16 13:20, Jan Beulich wrote:
> Ping? (I'd really like to get this resolved, so we don't need to
> indefinitely run with non-upstream behavior in our distros.)
>
> Thanks, Jan

My remaining issue is whether this loop gets executed by default.

I realise that there is a difference between legacy and v2 migration,
and that v2 migration by default worked.  If that means we managed to
skip this loop in its entirety for v2, then I am far less concerned
about the overhead.

~Andrew


>
>>>> On 13.01.16 at 17:15, <JBeulich@suse.com> wrote:
>>>>> On 13.01.16 at 17:00, <andrew.cooper3@citrix.com> wrote:
>>> On 13/01/16 15:36, Jan Beulich wrote:
>>>>>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote:
>>>>> On 12/01/16 15:19, Jan Beulich wrote:
>>>>>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote:
>>>>>>> On 12/01/16 10:08, Jan Beulich wrote:
>>>>>>>> This went unnoticed until a backport of this to an older Xen got used,
>>>>>>>> causing migration of guests enabling this VM assist to fail, because
>>>>>>>> page table pinning there preceeds vCPU context loading, and hence L4
>>>>>>>> tables get initialized for the wrong mode. Fix this by post-processing
>>>>>>>> L4 tables when setting the intended VM assist flags for the guest.
>>>>>>>>
>>>>>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest
>>>>>>>> context restored first, but afaict the logic here is not the only thing
>>>>>>>> depending on that.
>>>>>>>>
>>>>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>>>>>>
>>>>>>>> --- a/xen/arch/x86/domain.c
>>>>>>>> +++ b/xen/arch/x86/domain.c
>>>>>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest(
>>>>>>>>          goto out;
>>>>>>>>  
>>>>>>>>      if ( v->vcpu_id == 0 )
>>>>>>>> +    {
>>>>>>>>          d->vm_assist = c(vm_assist);
>>>>>>>>  
>>>>>>>> +        /*
>>>>>>>> +         * In the restore case we need to deal with L4 pages which got
>>>>>>>> +         * initialized with m2p_strict still clear (and which hence lack 
>>>>> the
>>>>>>>> +         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
>>>>>>>> +         */
>>>>>>>> +        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
>>>>>>>> +             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
>>>>>>>> +             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
>>>>>>>> +        {
>>>>>>>> +            bool_t done = 0;
>>>>>>>> +
>>>>>>>> +            spin_lock_recursive(&d->page_alloc_lock);
>>>>>>>> +
>>>>>>>> +            for ( i = 0; ; )
>>>>>>>> +            {
>>>>>>>> +                struct page_info *page = page_list_remove_head(&d->page_list);
>>>>>>>> +
>>>>>>>> +                if ( page_lock(page) )
>>>>>>>> +                {
>>>>>>>> +                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
>>>>>>>> +                         PGT_l4_page_table )
>>>>>>>> +                        done = !fill_ro_mpt(page_to_mfn(page));
>>>>>>>> +
>>>>>>>> +                    page_unlock(page);
>>>>>>>> +                }
>>>>>>>> +
>>>>>>>> +                page_list_add_tail(page, &d->page_list);
>>>>>>>> +
>>>>>>>> +                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
>>>>>>>> +                    break;
>>>>>>>> +            }
>>>>>>>> +
>>>>>>>> +            spin_unlock_recursive(&d->page_alloc_lock);
>>>>>>>> +
>>>>>>>> +            if ( !done )
>>>>>>>> +                return -ERESTART;
>>>>>>> This is a long loop.  It is preemptible, but will incur a time delay
>>>>>>> proportional to the size of the domain during the VM downtime. 
>>>>>>>
>>>>>>> Could you defer the loop until after %cr3 has set been set up, and only
>>>>>>> enter the loop if the kernel l4 table is missing the RO mappings?  That
>>>>>>> way, domains migrated with migration v2 will skip the loop entirely.
>>>>>> Well, first of all this would be the result only as long as you or
>>>>>> someone else don't re-think and possibly move pinning ahead of
>>>>>> context load again.
>>>>> A second set_context() will unconditionally hit the loop though.
>>>> Right - another argument against making any change to what is
>>>> in the patch right now.
>>> If there are any L4 pages, the current code will unconditionally search
>>> the pagelist on every entry to the function, even when it has already
>>> fixed up the strictness.
>>>
>>> A toolstack can enter this functions multiple times for the same vcpu,
>>> by resetting the vcpu state inbetween.  How much do we care about this
>>> usage?
>> If we cared at all, we'd need to insert another similar piece of
>> code in the reset path (moving L4s back to m2p-relaxed mode).
>>
>> Jan
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org 
>> http://lists.xen.org/xen-devel 
>
Jan Beulich Feb. 1, 2016, 4:28 p.m. UTC | #9
>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote:
> On 01/02/16 13:20, Jan Beulich wrote:
>> Ping? (I'd really like to get this resolved, so we don't need to
>> indefinitely run with non-upstream behavior in our distros.)
> 
> My remaining issue is whether this loop gets executed by default.
> 
> I realise that there is a difference between legacy and v2 migration,
> and that v2 migration by default worked.  If that means we managed to
> skip this loop in its entirety for v2, then I am far less concerned
> about the overhead.

But had been there before: Of course we could skip the loop if
the bit in d->vm_assist doesn't change. But as expressed before,
with you having already indicated that perhaps it would be better
to have v2 migration do the relevant operations in the other (v1)
order, the moment that was actually done the benefit of avoiding
the loop would be gone.

To be clear - if rendering the code dead (which is what you ask
for) until v2 migration happens to get changed is the only way to
get this code in, I will submit a v2 with that extra conditional.

Jan
Andrew Cooper Feb. 1, 2016, 4:34 p.m. UTC | #10
On 01/02/16 16:28, Jan Beulich wrote:
>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote:
>> On 01/02/16 13:20, Jan Beulich wrote:
>>> Ping? (I'd really like to get this resolved, so we don't need to
>>> indefinitely run with non-upstream behavior in our distros.)
>> My remaining issue is whether this loop gets executed by default.
>>
>> I realise that there is a difference between legacy and v2 migration,
>> and that v2 migration by default worked.  If that means we managed to
>> skip this loop in its entirety for v2, then I am far less concerned
>> about the overhead.
> But had been there before: Of course we could skip the loop if
> the bit in d->vm_assist doesn't change. But as expressed before,
> with you having already indicated that perhaps it would be better
> to have v2 migration do the relevant operations in the other (v1)
> order, the moment that was actually done the benefit of avoiding
> the loop would be gone.
>
> To be clear - if rendering the code dead (which is what you ask
> for) until v2 migration happens to get changed is the only way to
> get this code in, I will submit a v2 with that extra conditional.

Migration v2 currently loads vcpu context before pinning the pagetables,
which means that the vm_assist should get set up properly, before L4
tables are processed.

It was my understanding that this is the correct way around, and
m2p-strict mode only broke when you backported it to migration v1 systems?

~Andrew
Jan Beulich Feb. 1, 2016, 4:51 p.m. UTC | #11
>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote:
> On 01/02/16 16:28, Jan Beulich wrote:
>>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote:
>>> On 01/02/16 13:20, Jan Beulich wrote:
>>>> Ping? (I'd really like to get this resolved, so we don't need to
>>>> indefinitely run with non-upstream behavior in our distros.)
>>> My remaining issue is whether this loop gets executed by default.
>>>
>>> I realise that there is a difference between legacy and v2 migration,
>>> and that v2 migration by default worked.  If that means we managed to
>>> skip this loop in its entirety for v2, then I am far less concerned
>>> about the overhead.
>> But had been there before: Of course we could skip the loop if
>> the bit in d->vm_assist doesn't change. But as expressed before,
>> with you having already indicated that perhaps it would be better
>> to have v2 migration do the relevant operations in the other (v1)
>> order, the moment that was actually done the benefit of avoiding
>> the loop would be gone.
>>
>> To be clear - if rendering the code dead (which is what you ask
>> for) until v2 migration happens to get changed is the only way to
>> get this code in, I will submit a v2 with that extra conditional.
> 
> Migration v2 currently loads vcpu context before pinning the pagetables,
> which means that the vm_assist should get set up properly, before L4
> tables are processed.
> 
> It was my understanding that this is the correct way around, and
> m2p-strict mode only broke when you backported it to migration v1 systems?

Yes, but when we discussed this you said "in hindsight, it would have
been more sensible to swap pin and vCPU context, but I guess that
would break m2p-strict in the same way", and whether one is more
"correct" than the other is pretty questionable.

Jan
Andrew Cooper Feb. 1, 2016, 5:31 p.m. UTC | #12
On 01/02/16 16:51, Jan Beulich wrote:
>>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote:
>> On 01/02/16 16:28, Jan Beulich wrote:
>>>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote:
>>>> On 01/02/16 13:20, Jan Beulich wrote:
>>>>> Ping? (I'd really like to get this resolved, so we don't need to
>>>>> indefinitely run with non-upstream behavior in our distros.)
>>>> My remaining issue is whether this loop gets executed by default.
>>>>
>>>> I realise that there is a difference between legacy and v2 migration,
>>>> and that v2 migration by default worked.  If that means we managed to
>>>> skip this loop in its entirety for v2, then I am far less concerned
>>>> about the overhead.
>>> But had been there before: Of course we could skip the loop if
>>> the bit in d->vm_assist doesn't change. But as expressed before,
>>> with you having already indicated that perhaps it would be better
>>> to have v2 migration do the relevant operations in the other (v1)
>>> order, the moment that was actually done the benefit of avoiding
>>> the loop would be gone.
>>>
>>> To be clear - if rendering the code dead (which is what you ask
>>> for) until v2 migration happens to get changed is the only way to
>>> get this code in, I will submit a v2 with that extra conditional.
>> Migration v2 currently loads vcpu context before pinning the pagetables,
>> which means that the vm_assist should get set up properly, before L4
>> tables are processed.
>>
>> It was my understanding that this is the correct way around, and
>> m2p-strict mode only broke when you backported it to migration v1 systems?
> Yes, but when we discussed this you said "in hindsight, it would have
> been more sensible to swap pin and vCPU context, but I guess that
> would break m2p-strict in the same way", and whether one is more
> "correct" than the other is pretty questionable.

Right, but as it currently stands, migration v2 gets things the correct
way around?

Does that mean that, at the moment, this loop ends up getting skipped?

If it does, then the patch is fine.

The suggestion to swap the actions around was only to work around the
apparent error caused by continutions while loading vcpu0's state,
caused by auditing and pinning all the pagetables hanging off %cr3/%cr1.

If however there is a legitimate reason, such as this, to keep the
current order of operations, then it would be counterproductive to make
any changes to migration v2.

~Andrew
Jan Beulich Feb. 2, 2016, 10:21 a.m. UTC | #13
>>> On 01.02.16 at 18:31, <andrew.cooper3@citrix.com> wrote:
> On 01/02/16 16:51, Jan Beulich wrote:
>>>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote:
>>> On 01/02/16 16:28, Jan Beulich wrote:
>>>>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote:
>>>>> On 01/02/16 13:20, Jan Beulich wrote:
>>>>>> Ping? (I'd really like to get this resolved, so we don't need to
>>>>>> indefinitely run with non-upstream behavior in our distros.)
>>>>> My remaining issue is whether this loop gets executed by default.
>>>>>
>>>>> I realise that there is a difference between legacy and v2 migration,
>>>>> and that v2 migration by default worked.  If that means we managed to
>>>>> skip this loop in its entirety for v2, then I am far less concerned
>>>>> about the overhead.
>>>> But had been there before: Of course we could skip the loop if
>>>> the bit in d->vm_assist doesn't change. But as expressed before,
>>>> with you having already indicated that perhaps it would be better
>>>> to have v2 migration do the relevant operations in the other (v1)
>>>> order, the moment that was actually done the benefit of avoiding
>>>> the loop would be gone.
>>>>
>>>> To be clear - if rendering the code dead (which is what you ask
>>>> for) until v2 migration happens to get changed is the only way to
>>>> get this code in, I will submit a v2 with that extra conditional.
>>> Migration v2 currently loads vcpu context before pinning the pagetables,
>>> which means that the vm_assist should get set up properly, before L4
>>> tables are processed.
>>>
>>> It was my understanding that this is the correct way around, and
>>> m2p-strict mode only broke when you backported it to migration v1 systems?
>> Yes, but when we discussed this you said "in hindsight, it would have
>> been more sensible to swap pin and vCPU context, but I guess that
>> would break m2p-strict in the same way", and whether one is more
>> "correct" than the other is pretty questionable.
> 
> Right, but as it currently stands, migration v2 gets things the correct
> way around?

Yes.

> Does that mean that, at the moment, this loop ends up getting skipped?

No.

> If it does, then the patch is fine.
> 
> The suggestion to swap the actions around was only to work around the
> apparent error caused by continutions while loading vcpu0's state,
> caused by auditing and pinning all the pagetables hanging off %cr3/%cr1.
> 
> If however there is a legitimate reason, such as this, to keep the
> current order of operations, then it would be counterproductive to make
> any changes to migration v2.

Okay. I'll make the adjustment then.

Jan
Jan Beulich Feb. 2, 2016, 2:08 p.m. UTC | #14
>>> On 01.02.16 at 18:31, <andrew.cooper3@citrix.com> wrote:
> On 01/02/16 16:51, Jan Beulich wrote:
>>>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote:
>>> On 01/02/16 16:28, Jan Beulich wrote:
>>>>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote:
>>>>> On 01/02/16 13:20, Jan Beulich wrote:
>>>>>> Ping? (I'd really like to get this resolved, so we don't need to
>>>>>> indefinitely run with non-upstream behavior in our distros.)
>>>>> My remaining issue is whether this loop gets executed by default.
>>>>>
>>>>> I realise that there is a difference between legacy and v2 migration,
>>>>> and that v2 migration by default worked.  If that means we managed to
>>>>> skip this loop in its entirety for v2, then I am far less concerned
>>>>> about the overhead.
>>>> But had been there before: Of course we could skip the loop if
>>>> the bit in d->vm_assist doesn't change. But as expressed before,
>>>> with you having already indicated that perhaps it would be better
>>>> to have v2 migration do the relevant operations in the other (v1)
>>>> order, the moment that was actually done the benefit of avoiding
>>>> the loop would be gone.
>>>>
>>>> To be clear - if rendering the code dead (which is what you ask
>>>> for) until v2 migration happens to get changed is the only way to
>>>> get this code in, I will submit a v2 with that extra conditional.
>>> Migration v2 currently loads vcpu context before pinning the pagetables,
>>> which means that the vm_assist should get set up properly, before L4
>>> tables are processed.
>>>
>>> It was my understanding that this is the correct way around, and
>>> m2p-strict mode only broke when you backported it to migration v1 systems?
>> Yes, but when we discussed this you said "in hindsight, it would have
>> been more sensible to swap pin and vCPU context, but I guess that
>> would break m2p-strict in the same way", and whether one is more
>> "correct" than the other is pretty questionable.
> 
> Right, but as it currently stands, migration v2 gets things the correct
> way around?
> 
> Does that mean that, at the moment, this loop ends up getting skipped?
> 
> If it does, then the patch is fine.

Actually while adding the extra logic it occurred to me that the
loop indeed would already get skipped, due to
d->arch.pv_domain.nr_l4_pages being zero if page table
pinning didn't happen yet. Except that due to preemption later
in the function the loop would have got entered once the first
L4 page appeared, and then it would perhaps execute multiple
times. I.e. the extra check is necessary regardless of your
desire to avoid the loop entirely.

Jan

Patch
diff mbox

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1067,8 +1067,48 @@  int arch_set_info_guest(
         goto out;
 
     if ( v->vcpu_id == 0 )
+    {
         d->vm_assist = c(vm_assist);
 
+        /*
+         * In the restore case we need to deal with L4 pages which got
+         * initialized with m2p_strict still clear (and which hence lack the
+         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
+         */
+        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
+             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
+             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
+        {
+            bool_t done = 0;
+
+            spin_lock_recursive(&d->page_alloc_lock);
+
+            for ( i = 0; ; )
+            {
+                struct page_info *page = page_list_remove_head(&d->page_list);
+
+                if ( page_lock(page) )
+                {
+                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
+                         PGT_l4_page_table )
+                        done = !fill_ro_mpt(page_to_mfn(page));
+
+                    page_unlock(page);
+                }
+
+                page_list_add_tail(page, &d->page_list);
+
+                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
+                    break;
+            }
+
+            spin_unlock_recursive(&d->page_alloc_lock);
+
+            if ( !done )
+                return -ERESTART;
+        }
+    }
+
     rc = put_old_guest_table(current);
     if ( rc )
         return rc;
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1463,13 +1463,20 @@  void init_guest_l4_table(l4_pgentry_t l4
         l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
 }
 
-void fill_ro_mpt(unsigned long mfn)
+bool_t fill_ro_mpt(unsigned long mfn)
 {
     l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
+    bool_t ret = 0;
 
-    l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
-        idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+    if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
+    {
+        l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
+            idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+        ret = 1;
+    }
     unmap_domain_page(l4tab);
+
+    return ret;
 }
 
 void zap_ro_mpt(unsigned long mfn)
@@ -1527,10 +1534,15 @@  static int alloc_l4_table(struct page_in
         adjust_guest_l4e(pl4e[i], d);
     }
 
-    init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
+    if ( rc >= 0 )
+    {
+        init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
+        atomic_inc(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
     unmap_domain_page(pl4e);
 
-    return rc > 0 ? 0 : rc;
+    return rc;
 }
 
 static void free_l1_table(struct page_info *page)
@@ -1648,7 +1660,13 @@  static int free_l4_table(struct page_inf
 
     unmap_domain_page(pl4e);
 
-    return rc > 0 ? 0 : rc;
+    if ( rc >= 0 )
+    {
+        atomic_dec(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
+
+    return rc;
 }
 
 int page_lock(struct page_info *page)
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -248,6 +248,8 @@  struct pv_domain
 {
     l1_pgentry_t **gdt_ldt_l1tab;
 
+    atomic_t nr_l4_pages;
+
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
 };
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -322,7 +322,7 @@  int free_page_type(struct page_info *pag
 
 void init_guest_l4_table(l4_pgentry_t[], const struct domain *,
                          bool_t zap_ro_mpt);
-void fill_ro_mpt(unsigned long mfn);
+bool_t fill_ro_mpt(unsigned long mfn);
 void zap_ro_mpt(unsigned long mfn);
 
 int is_iomem_page(unsigned long mfn);