Message ID | 5694DEAA02000078000C5D7A@prv-mh.provo.novell.com (mailing list archive) |
---|---|
State | New, archived |
On 12/01/16 10:08, Jan Beulich wrote: > This went unnoticed until a backport of this to an older Xen got used, > causing migration of guests enabling this VM assist to fail, because > page table pinning there preceeds vCPU context loading, and hence L4 > tables get initialized for the wrong mode. Fix this by post-processing > L4 tables when setting the intended VM assist flags for the guest. > > Note that this leaves in place a dependency on vCPU 0 getting its guest > context restored first, but afaict the logic here is not the only thing > depending on that. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -1067,8 +1067,48 @@ int arch_set_info_guest( > goto out; > > if ( v->vcpu_id == 0 ) > + { > d->vm_assist = c(vm_assist); > > + /* > + * In the restore case we need to deal with L4 pages which got > + * initialized with m2p_strict still clear (and which hence lack the > + * correct initial RO_MPT_VIRT_{START,END} L4 entry). > + */ > + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && > + is_pv_domain(d) && !is_pv_32bit_domain(d) && > + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) > + { > + bool_t done = 0; > + > + spin_lock_recursive(&d->page_alloc_lock); > + > + for ( i = 0; ; ) > + { > + struct page_info *page = page_list_remove_head(&d->page_list); > + > + if ( page_lock(page) ) > + { > + if ( (page->u.inuse.type_info & PGT_type_mask) == > + PGT_l4_page_table ) > + done = !fill_ro_mpt(page_to_mfn(page)); > + > + page_unlock(page); > + } > + > + page_list_add_tail(page, &d->page_list); > + > + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) > + break; > + } > + > + spin_unlock_recursive(&d->page_alloc_lock); > + > + if ( !done ) > + return -ERESTART; This is a long loop. It is preemptible, but will incur a time delay proportional to the size of the domain during the VM downtime. Could you defer the loop until after %cr3 has set been set up, and only enter the loop if the kernel l4 table is missing the RO mappings? That way, domains migrated with migration v2 will skip the loop entirely. > + } > + } > + > rc = put_old_guest_table(current); > if ( rc ) > return rc; > --- a/xen/arch/x86/mm.c > +++ b/xen/arch/x86/mm.c > @@ -1463,13 +1463,20 @@ void init_guest_l4_table(l4_pgentry_t l4 > l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty(); > } > > -void fill_ro_mpt(unsigned long mfn) > +bool_t fill_ro_mpt(unsigned long mfn) > { > l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn)); > + bool_t ret = 0; > > - l4tab[l4_table_offset(RO_MPT_VIRT_START)] = > - idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]; > + if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) ) > + { > + l4tab[l4_table_offset(RO_MPT_VIRT_START)] = > + idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]; > + ret = 1; This is a behavioural change. Previously, the old value was clobbered. It appears that you are now using this return value to indicate when the entire pagelist has been walked, but it it relies on the slots being zero, which is a fragile assumption. ~Andrew > + } > unmap_domain_page(l4tab); > + > + return ret; > } > > void zap_ro_mpt(unsigned long mfn) > @@ -1527,10 +1534,15 @@ static int alloc_l4_table(struct page_in > adjust_guest_l4e(pl4e[i], d); > } > > - init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict)); > + if ( rc >= 0 ) > + { > + init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict)); > + atomic_inc(&d->arch.pv_domain.nr_l4_pages); > + rc = 0; > + } > unmap_domain_page(pl4e); > > - return rc > 0 ? 
0 : rc; > + return rc; > } > > static void free_l1_table(struct page_info *page) > @@ -1648,7 +1660,13 @@ static int free_l4_table(struct page_inf > > unmap_domain_page(pl4e); > > - return rc > 0 ? 0 : rc; > + if ( rc >= 0 ) > + { > + atomic_dec(&d->arch.pv_domain.nr_l4_pages); > + rc = 0; > + } > + > + return rc; > } > > int page_lock(struct page_info *page) > --- a/xen/include/asm-x86/domain.h > +++ b/xen/include/asm-x86/domain.h > @@ -248,6 +248,8 @@ struct pv_domain > { > l1_pgentry_t **gdt_ldt_l1tab; > > + atomic_t nr_l4_pages; > + > /* map_domain_page() mapping cache. */ > struct mapcache_domain mapcache; > }; > --- a/xen/include/asm-x86/mm.h > +++ b/xen/include/asm-x86/mm.h > @@ -322,7 +322,7 @@ int free_page_type(struct page_info *pag > > void init_guest_l4_table(l4_pgentry_t[], const struct domain *, > bool_t zap_ro_mpt); > -void fill_ro_mpt(unsigned long mfn); > +bool_t fill_ro_mpt(unsigned long mfn); > void zap_ro_mpt(unsigned long mfn); > > int is_iomem_page(unsigned long mfn); > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
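A minimal sketch of the gating Andrew suggests here, assuming a hypothetical helper l4_lacks_ro_mpt() built only from accessors the patch already uses (map_domain_page(), l4e_get_intpte(), pagetable_get_pfn()); this illustrates the idea and is not a proposed hunk:

/* Hypothetical helper: does this L4 still lack the r/o MPT entry? */
static bool_t l4_lacks_ro_mpt(unsigned long mfn)
{
    l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
    bool_t lacking = !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]);

    unmap_domain_page(l4tab);

    return lacking;
}

    /*
     * Later in arch_set_info_guest(), once v->arch.guest_table is known:
     * enter the fixup loop only if the kernel L4 misses the mapping.
     */
    if ( VM_ASSIST(d, m2p_strict) && is_pv_domain(d) &&
         !is_pv_32bit_domain(d) &&
         l4_lacks_ro_mpt(pagetable_get_pfn(v->arch.guest_table)) )
    {
        /* ... walk d->page_list and fill_ro_mpt() each L4, as in the patch ... */
    }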
>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote: > On 12/01/16 10:08, Jan Beulich wrote: >> This went unnoticed until a backport of this to an older Xen got used, >> causing migration of guests enabling this VM assist to fail, because >> page table pinning there preceeds vCPU context loading, and hence L4 >> tables get initialized for the wrong mode. Fix this by post-processing >> L4 tables when setting the intended VM assist flags for the guest. >> >> Note that this leaves in place a dependency on vCPU 0 getting its guest >> context restored first, but afaict the logic here is not the only thing >> depending on that. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> >> --- a/xen/arch/x86/domain.c >> +++ b/xen/arch/x86/domain.c >> @@ -1067,8 +1067,48 @@ int arch_set_info_guest( >> goto out; >> >> if ( v->vcpu_id == 0 ) >> + { >> d->vm_assist = c(vm_assist); >> >> + /* >> + * In the restore case we need to deal with L4 pages which got >> + * initialized with m2p_strict still clear (and which hence lack the >> + * correct initial RO_MPT_VIRT_{START,END} L4 entry). >> + */ >> + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && >> + is_pv_domain(d) && !is_pv_32bit_domain(d) && >> + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) >> + { >> + bool_t done = 0; >> + >> + spin_lock_recursive(&d->page_alloc_lock); >> + >> + for ( i = 0; ; ) >> + { >> + struct page_info *page = page_list_remove_head(&d->page_list); >> + >> + if ( page_lock(page) ) >> + { >> + if ( (page->u.inuse.type_info & PGT_type_mask) == >> + PGT_l4_page_table ) >> + done = !fill_ro_mpt(page_to_mfn(page)); >> + >> + page_unlock(page); >> + } >> + >> + page_list_add_tail(page, &d->page_list); >> + >> + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) >> + break; >> + } >> + >> + spin_unlock_recursive(&d->page_alloc_lock); >> + >> + if ( !done ) >> + return -ERESTART; > > This is a long loop. It is preemptible, but will incur a time delay > proportional to the size of the domain during the VM downtime. > > Could you defer the loop until after %cr3 has set been set up, and only > enter the loop if the kernel l4 table is missing the RO mappings? That > way, domains migrated with migration v2 will skip the loop entirely. Well, first of all this would be the result only as long as you or someone else don't re-think and possibly move pinning ahead of context load again. Deferring until after CR3 got set up is - afaict - not an option, as it would defeat the purpose of m2p-strict mode as much as doing the fixup e.g. in the #PF handler. This mode enabled needs to strictly mean "L4s start with the slot filled, and user-mode uses clear it", as documented. There's a much simpler way we could avoid the loop being entered: Check the previous setting of the flag. However, I intentionally did not go that route in this initial version as I didn't want to add more special casing than needed, plus to make sure the new code isn't effectively dead. 
>> --- a/xen/arch/x86/mm.c >> +++ b/xen/arch/x86/mm.c >> @@ -1463,13 +1463,20 @@ void init_guest_l4_table(l4_pgentry_t l4 >> l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty(); >> } >> >> -void fill_ro_mpt(unsigned long mfn) >> +bool_t fill_ro_mpt(unsigned long mfn) >> { >> l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn)); >> + bool_t ret = 0; >> >> - l4tab[l4_table_offset(RO_MPT_VIRT_START)] = >> - idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]; >> + if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) ) >> + { >> + l4tab[l4_table_offset(RO_MPT_VIRT_START)] = >> + idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]; >> + ret = 1; > > This is a behavioural change. Previously, the old value was clobbered. > > It appears that you are now using this return value to indicate when the > entire pagelist has been walked, but it it relies on the slots being > zero, which is a fragile assumption. There are only two values possible in this slot - zero or the one referring to the _shared across domains_ sub-tree for the r/o MPT. I.e. the change of behavior is only an apparent one, and I don't see this being fragile either. Jan
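To make the invariant Jan relies on explicit, purely as an illustrative sketch and not part of the patch, fill_ro_mpt() could assert that a non-empty slot already holds the shared r/o MPT entry from the idle page table; that is what makes the clobber-versus-preserve difference only apparent:

    l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
    l4_pgentry_t ro_slot = l4tab[l4_table_offset(RO_MPT_VIRT_START)];

    /* The slot is either empty or already the shared r/o MPT sub-tree. */
    ASSERT(!l4e_get_intpte(ro_slot) ||
           l4e_get_intpte(ro_slot) ==
           l4e_get_intpte(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]));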
On 12/01/16 15:19, Jan Beulich wrote: >>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote: >> On 12/01/16 10:08, Jan Beulich wrote: >>> This went unnoticed until a backport of this to an older Xen got used, >>> causing migration of guests enabling this VM assist to fail, because >>> page table pinning there preceeds vCPU context loading, and hence L4 >>> tables get initialized for the wrong mode. Fix this by post-processing >>> L4 tables when setting the intended VM assist flags for the guest. >>> >>> Note that this leaves in place a dependency on vCPU 0 getting its guest >>> context restored first, but afaict the logic here is not the only thing >>> depending on that. >>> >>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>> >>> --- a/xen/arch/x86/domain.c >>> +++ b/xen/arch/x86/domain.c >>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest( >>> goto out; >>> >>> if ( v->vcpu_id == 0 ) >>> + { >>> d->vm_assist = c(vm_assist); >>> >>> + /* >>> + * In the restore case we need to deal with L4 pages which got >>> + * initialized with m2p_strict still clear (and which hence lack the >>> + * correct initial RO_MPT_VIRT_{START,END} L4 entry). >>> + */ >>> + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && >>> + is_pv_domain(d) && !is_pv_32bit_domain(d) && >>> + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) >>> + { >>> + bool_t done = 0; >>> + >>> + spin_lock_recursive(&d->page_alloc_lock); >>> + >>> + for ( i = 0; ; ) >>> + { >>> + struct page_info *page = page_list_remove_head(&d->page_list); >>> + >>> + if ( page_lock(page) ) >>> + { >>> + if ( (page->u.inuse.type_info & PGT_type_mask) == >>> + PGT_l4_page_table ) >>> + done = !fill_ro_mpt(page_to_mfn(page)); >>> + >>> + page_unlock(page); >>> + } >>> + >>> + page_list_add_tail(page, &d->page_list); >>> + >>> + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) >>> + break; >>> + } >>> + >>> + spin_unlock_recursive(&d->page_alloc_lock); >>> + >>> + if ( !done ) >>> + return -ERESTART; >> This is a long loop. It is preemptible, but will incur a time delay >> proportional to the size of the domain during the VM downtime. >> >> Could you defer the loop until after %cr3 has set been set up, and only >> enter the loop if the kernel l4 table is missing the RO mappings? That >> way, domains migrated with migration v2 will skip the loop entirely. > Well, first of all this would be the result only as long as you or > someone else don't re-think and possibly move pinning ahead of > context load again. A second set_context() will unconditionally hit the loop though. > > Deferring until after CR3 got set up is - afaict - not an option, as > it would defeat the purpose of m2p-strict mode as much as doing > the fixup e.g. in the #PF handler. This mode enabled needs to > strictly mean "L4s start with the slot filled, and user-mode uses > clear it", as documented. I just meant a little further down this function, i.e. gate the loop on whether v->arch.guest_table has inappropriate m2p strictness. ~Andrew
>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote: > On 12/01/16 15:19, Jan Beulich wrote: >>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote: >>> On 12/01/16 10:08, Jan Beulich wrote: >>>> This went unnoticed until a backport of this to an older Xen got used, >>>> causing migration of guests enabling this VM assist to fail, because >>>> page table pinning there preceeds vCPU context loading, and hence L4 >>>> tables get initialized for the wrong mode. Fix this by post-processing >>>> L4 tables when setting the intended VM assist flags for the guest. >>>> >>>> Note that this leaves in place a dependency on vCPU 0 getting its guest >>>> context restored first, but afaict the logic here is not the only thing >>>> depending on that. >>>> >>>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>>> >>>> --- a/xen/arch/x86/domain.c >>>> +++ b/xen/arch/x86/domain.c >>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest( >>>> goto out; >>>> >>>> if ( v->vcpu_id == 0 ) >>>> + { >>>> d->vm_assist = c(vm_assist); >>>> >>>> + /* >>>> + * In the restore case we need to deal with L4 pages which got >>>> + * initialized with m2p_strict still clear (and which hence lack > the >>>> + * correct initial RO_MPT_VIRT_{START,END} L4 entry). >>>> + */ >>>> + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && >>>> + is_pv_domain(d) && !is_pv_32bit_domain(d) && >>>> + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) >>>> + { >>>> + bool_t done = 0; >>>> + >>>> + spin_lock_recursive(&d->page_alloc_lock); >>>> + >>>> + for ( i = 0; ; ) >>>> + { >>>> + struct page_info *page = page_list_remove_head(&d->page_list); >>>> + >>>> + if ( page_lock(page) ) >>>> + { >>>> + if ( (page->u.inuse.type_info & PGT_type_mask) == >>>> + PGT_l4_page_table ) >>>> + done = !fill_ro_mpt(page_to_mfn(page)); >>>> + >>>> + page_unlock(page); >>>> + } >>>> + >>>> + page_list_add_tail(page, &d->page_list); >>>> + >>>> + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) >>>> + break; >>>> + } >>>> + >>>> + spin_unlock_recursive(&d->page_alloc_lock); >>>> + >>>> + if ( !done ) >>>> + return -ERESTART; >>> This is a long loop. It is preemptible, but will incur a time delay >>> proportional to the size of the domain during the VM downtime. >>> >>> Could you defer the loop until after %cr3 has set been set up, and only >>> enter the loop if the kernel l4 table is missing the RO mappings? That >>> way, domains migrated with migration v2 will skip the loop entirely. >> Well, first of all this would be the result only as long as you or >> someone else don't re-think and possibly move pinning ahead of >> context load again. > > A second set_context() will unconditionally hit the loop though. Right - another argument against making any change to what is in the patch right now. >> Deferring until after CR3 got set up is - afaict - not an option, as >> it would defeat the purpose of m2p-strict mode as much as doing >> the fixup e.g. in the #PF handler. This mode enabled needs to >> strictly mean "L4s start with the slot filled, and user-mode uses >> clear it", as documented. > > I just meant a little further down this function, i.e. gate the loop on > whether v->arch.guest_table has inappropriate m2p strictness. I don't follow: Are you proposing to look at v->arch.guest_table's respective L4 entry? If so - what condition and what action are you thinking of? If not - what else? Jan
On 13/01/16 15:36, Jan Beulich wrote: >>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote: >> On 12/01/16 15:19, Jan Beulich wrote: >>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote: >>>> On 12/01/16 10:08, Jan Beulich wrote: >>>>> This went unnoticed until a backport of this to an older Xen got used, >>>>> causing migration of guests enabling this VM assist to fail, because >>>>> page table pinning there preceeds vCPU context loading, and hence L4 >>>>> tables get initialized for the wrong mode. Fix this by post-processing >>>>> L4 tables when setting the intended VM assist flags for the guest. >>>>> >>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest >>>>> context restored first, but afaict the logic here is not the only thing >>>>> depending on that. >>>>> >>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>>>> >>>>> --- a/xen/arch/x86/domain.c >>>>> +++ b/xen/arch/x86/domain.c >>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest( >>>>> goto out; >>>>> >>>>> if ( v->vcpu_id == 0 ) >>>>> + { >>>>> d->vm_assist = c(vm_assist); >>>>> >>>>> + /* >>>>> + * In the restore case we need to deal with L4 pages which got >>>>> + * initialized with m2p_strict still clear (and which hence lack >> the >>>>> + * correct initial RO_MPT_VIRT_{START,END} L4 entry). >>>>> + */ >>>>> + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && >>>>> + is_pv_domain(d) && !is_pv_32bit_domain(d) && >>>>> + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) >>>>> + { >>>>> + bool_t done = 0; >>>>> + >>>>> + spin_lock_recursive(&d->page_alloc_lock); >>>>> + >>>>> + for ( i = 0; ; ) >>>>> + { >>>>> + struct page_info *page = page_list_remove_head(&d->page_list); >>>>> + >>>>> + if ( page_lock(page) ) >>>>> + { >>>>> + if ( (page->u.inuse.type_info & PGT_type_mask) == >>>>> + PGT_l4_page_table ) >>>>> + done = !fill_ro_mpt(page_to_mfn(page)); >>>>> + >>>>> + page_unlock(page); >>>>> + } >>>>> + >>>>> + page_list_add_tail(page, &d->page_list); >>>>> + >>>>> + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) >>>>> + break; >>>>> + } >>>>> + >>>>> + spin_unlock_recursive(&d->page_alloc_lock); >>>>> + >>>>> + if ( !done ) >>>>> + return -ERESTART; >>>> This is a long loop. It is preemptible, but will incur a time delay >>>> proportional to the size of the domain during the VM downtime. >>>> >>>> Could you defer the loop until after %cr3 has set been set up, and only >>>> enter the loop if the kernel l4 table is missing the RO mappings? That >>>> way, domains migrated with migration v2 will skip the loop entirely. >>> Well, first of all this would be the result only as long as you or >>> someone else don't re-think and possibly move pinning ahead of >>> context load again. >> A second set_context() will unconditionally hit the loop though. > Right - another argument against making any change to what is > in the patch right now. If there are any L4 pages, the current code will unconditionally search the pagelist on every entry to the function, even when it has already fixed up the strictness. A toolstack can enter this functions multiple times for the same vcpu, by resetting the vcpu state inbetween. How much do we care about this usage? ~Andrew
>>> On 13.01.16 at 17:00, <andrew.cooper3@citrix.com> wrote: > On 13/01/16 15:36, Jan Beulich wrote: >>>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote: >>> On 12/01/16 15:19, Jan Beulich wrote: >>>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote: >>>>> On 12/01/16 10:08, Jan Beulich wrote: >>>>>> This went unnoticed until a backport of this to an older Xen got used, >>>>>> causing migration of guests enabling this VM assist to fail, because >>>>>> page table pinning there preceeds vCPU context loading, and hence L4 >>>>>> tables get initialized for the wrong mode. Fix this by post-processing >>>>>> L4 tables when setting the intended VM assist flags for the guest. >>>>>> >>>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest >>>>>> context restored first, but afaict the logic here is not the only thing >>>>>> depending on that. >>>>>> >>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>>>>> >>>>>> --- a/xen/arch/x86/domain.c >>>>>> +++ b/xen/arch/x86/domain.c >>>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest( >>>>>> goto out; >>>>>> >>>>>> if ( v->vcpu_id == 0 ) >>>>>> + { >>>>>> d->vm_assist = c(vm_assist); >>>>>> >>>>>> + /* >>>>>> + * In the restore case we need to deal with L4 pages which got >>>>>> + * initialized with m2p_strict still clear (and which hence lack >>> the >>>>>> + * correct initial RO_MPT_VIRT_{START,END} L4 entry). >>>>>> + */ >>>>>> + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && >>>>>> + is_pv_domain(d) && !is_pv_32bit_domain(d) && >>>>>> + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) >>>>>> + { >>>>>> + bool_t done = 0; >>>>>> + >>>>>> + spin_lock_recursive(&d->page_alloc_lock); >>>>>> + >>>>>> + for ( i = 0; ; ) >>>>>> + { >>>>>> + struct page_info *page = page_list_remove_head(&d->page_list); >>>>>> + >>>>>> + if ( page_lock(page) ) >>>>>> + { >>>>>> + if ( (page->u.inuse.type_info & PGT_type_mask) == >>>>>> + PGT_l4_page_table ) >>>>>> + done = !fill_ro_mpt(page_to_mfn(page)); >>>>>> + >>>>>> + page_unlock(page); >>>>>> + } >>>>>> + >>>>>> + page_list_add_tail(page, &d->page_list); >>>>>> + >>>>>> + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) >>>>>> + break; >>>>>> + } >>>>>> + >>>>>> + spin_unlock_recursive(&d->page_alloc_lock); >>>>>> + >>>>>> + if ( !done ) >>>>>> + return -ERESTART; >>>>> This is a long loop. It is preemptible, but will incur a time delay >>>>> proportional to the size of the domain during the VM downtime. >>>>> >>>>> Could you defer the loop until after %cr3 has set been set up, and only >>>>> enter the loop if the kernel l4 table is missing the RO mappings? That >>>>> way, domains migrated with migration v2 will skip the loop entirely. >>>> Well, first of all this would be the result only as long as you or >>>> someone else don't re-think and possibly move pinning ahead of >>>> context load again. >>> A second set_context() will unconditionally hit the loop though. >> Right - another argument against making any change to what is >> in the patch right now. > > If there are any L4 pages, the current code will unconditionally search > the pagelist on every entry to the function, even when it has already > fixed up the strictness. > > A toolstack can enter this functions multiple times for the same vcpu, > by resetting the vcpu state inbetween. How much do we care about this > usage? If we cared at all, we'd need to insert another similar piece of code in the reset path (moving L4s back to m2p-relaxed mode). Jan
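As a rough illustration only, the "similar piece of code in the reset path" mentioned above would presumably mirror the walk in the patch, with zap_ro_mpt() undoing what fill_ro_mpt() did; a sketch of the per-page body under that assumption:

    if ( page_lock(page) )
    {
        /* Return L4 tables to m2p-relaxed mode on vcpu reset. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
            zap_ro_mpt(page_to_mfn(page));

        page_unlock(page);
    }

Note that zap_ro_mpt() is declared void, so such a walk could not use the early-exit that fill_ro_mpt()'s new return value provides.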
Ping? (I'd really like to get this resolved, so we don't need to indefinitely run with non-upstream behavior in our distros.) Thanks, Jan >>> On 13.01.16 at 17:15, <JBeulich@suse.com> wrote: >>>> On 13.01.16 at 17:00, <andrew.cooper3@citrix.com> wrote: >> On 13/01/16 15:36, Jan Beulich wrote: >>>>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote: >>>> On 12/01/16 15:19, Jan Beulich wrote: >>>>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote: >>>>>> On 12/01/16 10:08, Jan Beulich wrote: >>>>>>> This went unnoticed until a backport of this to an older Xen got used, >>>>>>> causing migration of guests enabling this VM assist to fail, because >>>>>>> page table pinning there preceeds vCPU context loading, and hence L4 >>>>>>> tables get initialized for the wrong mode. Fix this by post-processing >>>>>>> L4 tables when setting the intended VM assist flags for the guest. >>>>>>> >>>>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest >>>>>>> context restored first, but afaict the logic here is not the only thing >>>>>>> depending on that. >>>>>>> >>>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>>>>>> >>>>>>> --- a/xen/arch/x86/domain.c >>>>>>> +++ b/xen/arch/x86/domain.c >>>>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest( >>>>>>> goto out; >>>>>>> >>>>>>> if ( v->vcpu_id == 0 ) >>>>>>> + { >>>>>>> d->vm_assist = c(vm_assist); >>>>>>> >>>>>>> + /* >>>>>>> + * In the restore case we need to deal with L4 pages which got >>>>>>> + * initialized with m2p_strict still clear (and which hence lack >>>> the >>>>>>> + * correct initial RO_MPT_VIRT_{START,END} L4 entry). >>>>>>> + */ >>>>>>> + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && >>>>>>> + is_pv_domain(d) && !is_pv_32bit_domain(d) && >>>>>>> + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) >>>>>>> + { >>>>>>> + bool_t done = 0; >>>>>>> + >>>>>>> + spin_lock_recursive(&d->page_alloc_lock); >>>>>>> + >>>>>>> + for ( i = 0; ; ) >>>>>>> + { >>>>>>> + struct page_info *page = page_list_remove_head(&d->page_list); >>>>>>> + >>>>>>> + if ( page_lock(page) ) >>>>>>> + { >>>>>>> + if ( (page->u.inuse.type_info & PGT_type_mask) == >>>>>>> + PGT_l4_page_table ) >>>>>>> + done = !fill_ro_mpt(page_to_mfn(page)); >>>>>>> + >>>>>>> + page_unlock(page); >>>>>>> + } >>>>>>> + >>>>>>> + page_list_add_tail(page, &d->page_list); >>>>>>> + >>>>>>> + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) >>>>>>> + break; >>>>>>> + } >>>>>>> + >>>>>>> + spin_unlock_recursive(&d->page_alloc_lock); >>>>>>> + >>>>>>> + if ( !done ) >>>>>>> + return -ERESTART; >>>>>> This is a long loop. It is preemptible, but will incur a time delay >>>>>> proportional to the size of the domain during the VM downtime. >>>>>> >>>>>> Could you defer the loop until after %cr3 has set been set up, and only >>>>>> enter the loop if the kernel l4 table is missing the RO mappings? That >>>>>> way, domains migrated with migration v2 will skip the loop entirely. >>>>> Well, first of all this would be the result only as long as you or >>>>> someone else don't re-think and possibly move pinning ahead of >>>>> context load again. >>>> A second set_context() will unconditionally hit the loop though. >>> Right - another argument against making any change to what is >>> in the patch right now. >> >> If there are any L4 pages, the current code will unconditionally search >> the pagelist on every entry to the function, even when it has already >> fixed up the strictness. 
>> >> A toolstack can enter this functions multiple times for the same vcpu, >> by resetting the vcpu state inbetween. How much do we care about this >> usage? > > If we cared at all, we'd need to insert another similar piece of > code in the reset path (moving L4s back to m2p-relaxed mode). > > Jan > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
On 01/02/16 13:20, Jan Beulich wrote: > Ping? (I'd really like to get this resolved, so we don't need to > indefinitely run with non-upstream behavior in our distros.) > > Thanks, Jan My remaining issue is whether this loop gets executed by default. I realise that there is a difference between legacy and v2 migration, and that v2 migration by default worked. If that means we managed to skip this loop in its entirety for v2, then I am far less concerned about the overhead. ~Andrew > >>>> On 13.01.16 at 17:15, <JBeulich@suse.com> wrote: >>>>> On 13.01.16 at 17:00, <andrew.cooper3@citrix.com> wrote: >>> On 13/01/16 15:36, Jan Beulich wrote: >>>>>>> On 13.01.16 at 16:25, <andrew.cooper3@citrix.com> wrote: >>>>> On 12/01/16 15:19, Jan Beulich wrote: >>>>>>>>> On 12.01.16 at 12:55, <andrew.cooper3@citrix.com> wrote: >>>>>>> On 12/01/16 10:08, Jan Beulich wrote: >>>>>>>> This went unnoticed until a backport of this to an older Xen got used, >>>>>>>> causing migration of guests enabling this VM assist to fail, because >>>>>>>> page table pinning there preceeds vCPU context loading, and hence L4 >>>>>>>> tables get initialized for the wrong mode. Fix this by post-processing >>>>>>>> L4 tables when setting the intended VM assist flags for the guest. >>>>>>>> >>>>>>>> Note that this leaves in place a dependency on vCPU 0 getting its guest >>>>>>>> context restored first, but afaict the logic here is not the only thing >>>>>>>> depending on that. >>>>>>>> >>>>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>>>>>>> >>>>>>>> --- a/xen/arch/x86/domain.c >>>>>>>> +++ b/xen/arch/x86/domain.c >>>>>>>> @@ -1067,8 +1067,48 @@ int arch_set_info_guest( >>>>>>>> goto out; >>>>>>>> >>>>>>>> if ( v->vcpu_id == 0 ) >>>>>>>> + { >>>>>>>> d->vm_assist = c(vm_assist); >>>>>>>> >>>>>>>> + /* >>>>>>>> + * In the restore case we need to deal with L4 pages which got >>>>>>>> + * initialized with m2p_strict still clear (and which hence lack >>>>> the >>>>>>>> + * correct initial RO_MPT_VIRT_{START,END} L4 entry). >>>>>>>> + */ >>>>>>>> + if ( d != current->domain && VM_ASSIST(d, m2p_strict) && >>>>>>>> + is_pv_domain(d) && !is_pv_32bit_domain(d) && >>>>>>>> + atomic_read(&d->arch.pv_domain.nr_l4_pages) ) >>>>>>>> + { >>>>>>>> + bool_t done = 0; >>>>>>>> + >>>>>>>> + spin_lock_recursive(&d->page_alloc_lock); >>>>>>>> + >>>>>>>> + for ( i = 0; ; ) >>>>>>>> + { >>>>>>>> + struct page_info *page = page_list_remove_head(&d->page_list); >>>>>>>> + >>>>>>>> + if ( page_lock(page) ) >>>>>>>> + { >>>>>>>> + if ( (page->u.inuse.type_info & PGT_type_mask) == >>>>>>>> + PGT_l4_page_table ) >>>>>>>> + done = !fill_ro_mpt(page_to_mfn(page)); >>>>>>>> + >>>>>>>> + page_unlock(page); >>>>>>>> + } >>>>>>>> + >>>>>>>> + page_list_add_tail(page, &d->page_list); >>>>>>>> + >>>>>>>> + if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) >>>>>>>> + break; >>>>>>>> + } >>>>>>>> + >>>>>>>> + spin_unlock_recursive(&d->page_alloc_lock); >>>>>>>> + >>>>>>>> + if ( !done ) >>>>>>>> + return -ERESTART; >>>>>>> This is a long loop. It is preemptible, but will incur a time delay >>>>>>> proportional to the size of the domain during the VM downtime. >>>>>>> >>>>>>> Could you defer the loop until after %cr3 has set been set up, and only >>>>>>> enter the loop if the kernel l4 table is missing the RO mappings? That >>>>>>> way, domains migrated with migration v2 will skip the loop entirely. 
>>>>>> Well, first of all this would be the result only as long as you or >>>>>> someone else don't re-think and possibly move pinning ahead of >>>>>> context load again. >>>>> A second set_context() will unconditionally hit the loop though. >>>> Right - another argument against making any change to what is >>>> in the patch right now. >>> If there are any L4 pages, the current code will unconditionally search >>> the pagelist on every entry to the function, even when it has already >>> fixed up the strictness. >>> >>> A toolstack can enter this functions multiple times for the same vcpu, >>> by resetting the vcpu state inbetween. How much do we care about this >>> usage? >> If we cared at all, we'd need to insert another similar piece of >> code in the reset path (moving L4s back to m2p-relaxed mode). >> >> Jan >> >> >> _______________________________________________ >> Xen-devel mailing list >> Xen-devel@lists.xen.org >> http://lists.xen.org/xen-devel >
>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote: > On 01/02/16 13:20, Jan Beulich wrote: >> Ping? (I'd really like to get this resolved, so we don't need to >> indefinitely run with non-upstream behavior in our distros.) > > My remaining issue is whether this loop gets executed by default. > > I realise that there is a difference between legacy and v2 migration, > and that v2 migration by default worked. If that means we managed to > skip this loop in its entirety for v2, then I am far less concerned > about the overhead. But had been there before: Of course we could skip the loop if the bit in d->vm_assist doesn't change. But as expressed before, with you having already indicated that perhaps it would be better to have v2 migration do the relevant operations in the other (v1) order, the moment that was actually done the benefit of avoiding the loop would be gone. To be clear - if rendering the code dead (which is what you ask for) until v2 migration happens to get changed is the only way to get this code in, I will submit a v2 with that extra conditional. Jan
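The "extra conditional" offered here would roughly amount to sampling the old flag before d->vm_assist is overwritten and entering the walk only on a clear-to-set transition. A sketch under that assumption, not the actual v2 patch:

    if ( v->vcpu_id == 0 )
    {
        /* Sample the old setting before overwriting d->vm_assist. */
        bool_t m2p_strict_was_set = VM_ASSIST(d, m2p_strict);

        d->vm_assist = c(vm_assist);

        /* Only fix up L4s when restore actually turns m2p-strict on. */
        if ( d != current->domain && !m2p_strict_was_set &&
             VM_ASSIST(d, m2p_strict) && is_pv_domain(d) &&
             !is_pv_32bit_domain(d) &&
             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
        {
            /* ... L4 fixup loop from the patch ... */
        }
    }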
On 01/02/16 16:28, Jan Beulich wrote: >>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote: >> On 01/02/16 13:20, Jan Beulich wrote: >>> Ping? (I'd really like to get this resolved, so we don't need to >>> indefinitely run with non-upstream behavior in our distros.) >> My remaining issue is whether this loop gets executed by default. >> >> I realise that there is a difference between legacy and v2 migration, >> and that v2 migration by default worked. If that means we managed to >> skip this loop in its entirety for v2, then I am far less concerned >> about the overhead. > But had been there before: Of course we could skip the loop if > the bit in d->vm_assist doesn't change. But as expressed before, > with you having already indicated that perhaps it would be better > to have v2 migration do the relevant operations in the other (v1) > order, the moment that was actually done the benefit of avoiding > the loop would be gone. > > To be clear - if rendering the code dead (which is what you ask > for) until v2 migration happens to get changed is the only way to > get this code in, I will submit a v2 with that extra conditional. Migration v2 currently loads vcpu context before pinning the pagetables, which means that the vm_assist should get set up properly, before L4 tables are processed. It was my understanding that this is the correct way around, and m2p-strict mode only broke when you backported it to migration v1 systems? ~Andrew
>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote: > On 01/02/16 16:28, Jan Beulich wrote: >>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote: >>> On 01/02/16 13:20, Jan Beulich wrote: >>>> Ping? (I'd really like to get this resolved, so we don't need to >>>> indefinitely run with non-upstream behavior in our distros.) >>> My remaining issue is whether this loop gets executed by default. >>> >>> I realise that there is a difference between legacy and v2 migration, >>> and that v2 migration by default worked. If that means we managed to >>> skip this loop in its entirety for v2, then I am far less concerned >>> about the overhead. >> But had been there before: Of course we could skip the loop if >> the bit in d->vm_assist doesn't change. But as expressed before, >> with you having already indicated that perhaps it would be better >> to have v2 migration do the relevant operations in the other (v1) >> order, the moment that was actually done the benefit of avoiding >> the loop would be gone. >> >> To be clear - if rendering the code dead (which is what you ask >> for) until v2 migration happens to get changed is the only way to >> get this code in, I will submit a v2 with that extra conditional. > > Migration v2 currently loads vcpu context before pinning the pagetables, > which means that the vm_assist should get set up properly, before L4 > tables are processed. > > It was my understanding that this is the correct way around, and > m2p-strict mode only broke when you backported it to migration v1 systems? Yes, but when we discussed this you said "in hindsight, it would have been more sensible to swap pin and vCPU context, but I guess that would break m2p-strict in the same way", and whether one is more "correct" than the other is pretty questionable. Jan
On 01/02/16 16:51, Jan Beulich wrote: >>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote: >> On 01/02/16 16:28, Jan Beulich wrote: >>>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote: >>>> On 01/02/16 13:20, Jan Beulich wrote: >>>>> Ping? (I'd really like to get this resolved, so we don't need to >>>>> indefinitely run with non-upstream behavior in our distros.) >>>> My remaining issue is whether this loop gets executed by default. >>>> >>>> I realise that there is a difference between legacy and v2 migration, >>>> and that v2 migration by default worked. If that means we managed to >>>> skip this loop in its entirety for v2, then I am far less concerned >>>> about the overhead. >>> But had been there before: Of course we could skip the loop if >>> the bit in d->vm_assist doesn't change. But as expressed before, >>> with you having already indicated that perhaps it would be better >>> to have v2 migration do the relevant operations in the other (v1) >>> order, the moment that was actually done the benefit of avoiding >>> the loop would be gone. >>> >>> To be clear - if rendering the code dead (which is what you ask >>> for) until v2 migration happens to get changed is the only way to >>> get this code in, I will submit a v2 with that extra conditional. >> Migration v2 currently loads vcpu context before pinning the pagetables, >> which means that the vm_assist should get set up properly, before L4 >> tables are processed. >> >> It was my understanding that this is the correct way around, and >> m2p-strict mode only broke when you backported it to migration v1 systems? > Yes, but when we discussed this you said "in hindsight, it would have > been more sensible to swap pin and vCPU context, but I guess that > would break m2p-strict in the same way", and whether one is more > "correct" than the other is pretty questionable. Right, but as it currently stands, migration v2 gets things the correct way around? Does that mean that, at the moment, this loop ends up getting skipped? If it does, then the patch is fine. The suggestion to swap the actions around was only to work around the apparent error caused by continutions while loading vcpu0's state, caused by auditing and pinning all the pagetables hanging off %cr3/%cr1. If however there is a legitimate reason, such as this, to keep the current order of operations, then it would be counterproductive to make any changes to migration v2. ~Andrew
>>> On 01.02.16 at 18:31, <andrew.cooper3@citrix.com> wrote: > On 01/02/16 16:51, Jan Beulich wrote: >>>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote: >>> On 01/02/16 16:28, Jan Beulich wrote: >>>>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote: >>>>> On 01/02/16 13:20, Jan Beulich wrote: >>>>>> Ping? (I'd really like to get this resolved, so we don't need to >>>>>> indefinitely run with non-upstream behavior in our distros.) >>>>> My remaining issue is whether this loop gets executed by default. >>>>> >>>>> I realise that there is a difference between legacy and v2 migration, >>>>> and that v2 migration by default worked. If that means we managed to >>>>> skip this loop in its entirety for v2, then I am far less concerned >>>>> about the overhead. >>>> But had been there before: Of course we could skip the loop if >>>> the bit in d->vm_assist doesn't change. But as expressed before, >>>> with you having already indicated that perhaps it would be better >>>> to have v2 migration do the relevant operations in the other (v1) >>>> order, the moment that was actually done the benefit of avoiding >>>> the loop would be gone. >>>> >>>> To be clear - if rendering the code dead (which is what you ask >>>> for) until v2 migration happens to get changed is the only way to >>>> get this code in, I will submit a v2 with that extra conditional. >>> Migration v2 currently loads vcpu context before pinning the pagetables, >>> which means that the vm_assist should get set up properly, before L4 >>> tables are processed. >>> >>> It was my understanding that this is the correct way around, and >>> m2p-strict mode only broke when you backported it to migration v1 systems? >> Yes, but when we discussed this you said "in hindsight, it would have >> been more sensible to swap pin and vCPU context, but I guess that >> would break m2p-strict in the same way", and whether one is more >> "correct" than the other is pretty questionable. > > Right, but as it currently stands, migration v2 gets things the correct > way around? Yes. > Does that mean that, at the moment, this loop ends up getting skipped? No. > If it does, then the patch is fine. > > The suggestion to swap the actions around was only to work around the > apparent error caused by continutions while loading vcpu0's state, > caused by auditing and pinning all the pagetables hanging off %cr3/%cr1. > > If however there is a legitimate reason, such as this, to keep the > current order of operations, then it would be counterproductive to make > any changes to migration v2. Okay. I'll make the adjustment then. Jan
>>> On 01.02.16 at 18:31, <andrew.cooper3@citrix.com> wrote: > On 01/02/16 16:51, Jan Beulich wrote: >>>>> On 01.02.16 at 17:34, <andrew.cooper3@citrix.com> wrote: >>> On 01/02/16 16:28, Jan Beulich wrote: >>>>>>> On 01.02.16 at 15:07, <andrew.cooper3@citrix.com> wrote: >>>>> On 01/02/16 13:20, Jan Beulich wrote: >>>>>> Ping? (I'd really like to get this resolved, so we don't need to >>>>>> indefinitely run with non-upstream behavior in our distros.) >>>>> My remaining issue is whether this loop gets executed by default. >>>>> >>>>> I realise that there is a difference between legacy and v2 migration, >>>>> and that v2 migration by default worked. If that means we managed to >>>>> skip this loop in its entirety for v2, then I am far less concerned >>>>> about the overhead. >>>> But had been there before: Of course we could skip the loop if >>>> the bit in d->vm_assist doesn't change. But as expressed before, >>>> with you having already indicated that perhaps it would be better >>>> to have v2 migration do the relevant operations in the other (v1) >>>> order, the moment that was actually done the benefit of avoiding >>>> the loop would be gone. >>>> >>>> To be clear - if rendering the code dead (which is what you ask >>>> for) until v2 migration happens to get changed is the only way to >>>> get this code in, I will submit a v2 with that extra conditional. >>> Migration v2 currently loads vcpu context before pinning the pagetables, >>> which means that the vm_assist should get set up properly, before L4 >>> tables are processed. >>> >>> It was my understanding that this is the correct way around, and >>> m2p-strict mode only broke when you backported it to migration v1 systems? >> Yes, but when we discussed this you said "in hindsight, it would have >> been more sensible to swap pin and vCPU context, but I guess that >> would break m2p-strict in the same way", and whether one is more >> "correct" than the other is pretty questionable. > > Right, but as it currently stands, migration v2 gets things the correct > way around? > > Does that mean that, at the moment, this loop ends up getting skipped? > > If it does, then the patch is fine. Actually while adding the extra logic it occurred to me that the loop indeed would already get skipped, due to d->arch.pv_domain.nr_l4_pages being zero if page table pinning didn't happen yet. Except that due to preemption later in the function the loop would have got entered once the first L4 page appeared, and then it would perhaps execute multiple times. I.e. the extra check is necessary regardless of your desire to avoid the loop entirely. Jan
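For context on the re-entry described here: an -ERESTART from arch_set_info_guest() is turned into a hypercall continuation at the domctl layer, so the whole function, including the new loop, runs again once L4 pages may already have been pinned. From memory, the XEN_DOMCTL_setvcpucontext handler follows roughly this pattern; treat the exact variable names as an assumption rather than a quote of the source:

        ret = arch_set_info_guest(v, c);

        if ( ret == -ERESTART )
            ret = hypercall_create_continuation(
                __HYPERVISOR_domctl, "h", u_domctl);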
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1067,8 +1067,48 @@ int arch_set_info_guest(
         goto out;
 
     if ( v->vcpu_id == 0 )
+    {
         d->vm_assist = c(vm_assist);
 
+        /*
+         * In the restore case we need to deal with L4 pages which got
+         * initialized with m2p_strict still clear (and which hence lack the
+         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
+         */
+        if ( d != current->domain && VM_ASSIST(d, m2p_strict) &&
+             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
+             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
+        {
+            bool_t done = 0;
+
+            spin_lock_recursive(&d->page_alloc_lock);
+
+            for ( i = 0; ; )
+            {
+                struct page_info *page = page_list_remove_head(&d->page_list);
+
+                if ( page_lock(page) )
+                {
+                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
+                         PGT_l4_page_table )
+                        done = !fill_ro_mpt(page_to_mfn(page));
+
+                    page_unlock(page);
+                }
+
+                page_list_add_tail(page, &d->page_list);
+
+                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
+                    break;
+            }
+
+            spin_unlock_recursive(&d->page_alloc_lock);
+
+            if ( !done )
+                return -ERESTART;
+        }
+    }
+
     rc = put_old_guest_table(current);
     if ( rc )
         return rc;
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1463,13 +1463,20 @@ void init_guest_l4_table(l4_pgentry_t l4
         l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
 }
 
-void fill_ro_mpt(unsigned long mfn)
+bool_t fill_ro_mpt(unsigned long mfn)
 {
     l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
+    bool_t ret = 0;
 
-    l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
-        idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+    if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
+    {
+        l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
+            idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+        ret = 1;
+    }
     unmap_domain_page(l4tab);
+
+    return ret;
 }
 
 void zap_ro_mpt(unsigned long mfn)
@@ -1527,10 +1534,15 @@ static int alloc_l4_table(struct page_in
             adjust_guest_l4e(pl4e[i], d);
     }
 
-    init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
+    if ( rc >= 0 )
+    {
+        init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
+        atomic_inc(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
     unmap_domain_page(pl4e);
 
-    return rc > 0 ? 0 : rc;
+    return rc;
 }
 
 static void free_l1_table(struct page_info *page)
@@ -1648,7 +1660,13 @@ static int free_l4_table(struct page_inf
 
     unmap_domain_page(pl4e);
 
-    return rc > 0 ? 0 : rc;
+    if ( rc >= 0 )
+    {
+        atomic_dec(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
+
+    return rc;
 }
 
 int page_lock(struct page_info *page)
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -248,6 +248,8 @@ struct pv_domain
 {
     l1_pgentry_t **gdt_ldt_l1tab;
 
+    atomic_t nr_l4_pages;
+
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
 };
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -322,7 +322,7 @@ int free_page_type(struct page_info *pag
 
 void init_guest_l4_table(l4_pgentry_t[], const struct domain *,
                          bool_t zap_ro_mpt);
-void fill_ro_mpt(unsigned long mfn);
+bool_t fill_ro_mpt(unsigned long mfn);
 void zap_ro_mpt(unsigned long mfn);
 
 int is_iomem_page(unsigned long mfn);