Message ID | e47d51c6-1b4b-2f72-6e12-070df66c30e2@suse.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | x86/PoD: defer nested P2M flushes | expand |
On 11.10.2021 10:17, Jan Beulich wrote: > With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> > write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock > order violation when the PoD lock is held around it. Hence such flushing > needs to be deferred. Steal the approach from p2m_change_type_range(). > > Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> > ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. > > Reported-by: Elliott Mitchell <ehem+xen@m5p.com> > Signed-off-by: Jan Beulich <jbeulich@suse.com> Thoughts? Thanks, Jan > --- a/xen/arch/x86/mm/p2m-pod.c > +++ b/xen/arch/x86/mm/p2m-pod.c > @@ -24,6 +24,7 @@ > #include <xen/mm.h> > #include <xen/sched.h> > #include <xen/trace.h> > +#include <asm/hvm/nestedhvm.h> > #include <asm/page.h> > #include <asm/paging.h> > #include <asm/p2m.h> > @@ -494,6 +495,13 @@ p2m_pod_offline_or_broken_replace(struct > static int > p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn); > > +static void pod_unlock_and_flush(struct p2m_domain *p2m) > +{ > + pod_unlock(p2m); > + p2m->defer_nested_flush = false; > + if ( nestedhvm_enabled(p2m->domain) ) > + p2m_flush_nestedp2m(p2m->domain); > +} > > /* > * This function is needed for two reasons: > @@ -514,6 +522,7 @@ p2m_pod_decrease_reservation(struct doma > > gfn_lock(p2m, gfn, order); > pod_lock(p2m); > + p2m->defer_nested_flush = true; > > /* > * If we don't have any outstanding PoD entries, let things take their > @@ -665,7 +674,7 @@ out_entry_check: > } > > out_unlock: > - pod_unlock(p2m); > + pod_unlock_and_flush(p2m); > gfn_unlock(p2m, gfn, order); > return ret; > } > @@ -1144,8 +1153,10 @@ p2m_pod_demand_populate(struct p2m_domai > * won't start until we're done. 
> */ > if ( unlikely(d->is_dying) ) > - goto out_fail; > - > + { > + pod_unlock(p2m); > + return false; > + } > > /* > * Because PoD does not have cache list for 1GB pages, it has to remap > @@ -1167,6 +1178,8 @@ p2m_pod_demand_populate(struct p2m_domai > p2m_populate_on_demand, p2m->default_access); > } > > + p2m->defer_nested_flush = true; > + > /* Only reclaim if we're in actual need of more cache. */ > if ( p2m->pod.entry_count > p2m->pod.count ) > pod_eager_reclaim(p2m); > @@ -1229,8 +1242,9 @@ p2m_pod_demand_populate(struct p2m_domai > __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); > } > > - pod_unlock(p2m); > + pod_unlock_and_flush(p2m); > return true; > + > out_of_memory: > pod_unlock(p2m); > > @@ -1239,12 +1253,14 @@ out_of_memory: > p2m->pod.entry_count, current->domain->domain_id); > domain_crash(d); > return false; > + > out_fail: > - pod_unlock(p2m); > + pod_unlock_and_flush(p2m); > return false; > + > remap_and_retry: > BUG_ON(order != PAGE_ORDER_2M); > - pod_unlock(p2m); > + pod_unlock_and_flush(p2m); > > /* > * Remap this 2-meg region in singleton chunks. See the comment on the > >
On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: > With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> > write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock > order violation when the PoD lock is held around it. Hence such flushing > needs to be deferred. Steal the approach from p2m_change_type_range(). > > Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> > ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. > > Reported-by: Elliott Mitchell <ehem+xen@m5p.com> > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/mm/p2m-pod.c > +++ b/xen/arch/x86/mm/p2m-pod.c > @@ -24,6 +24,7 @@ > #include <xen/mm.h> > #include <xen/sched.h> > #include <xen/trace.h> > +#include <asm/hvm/nestedhvm.h> > #include <asm/page.h> > #include <asm/paging.h> > #include <asm/p2m.h> > @@ -494,6 +495,13 @@ p2m_pod_offline_or_broken_replace(struct > static int > p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn); > > +static void pod_unlock_and_flush(struct p2m_domain *p2m) > +{ > + pod_unlock(p2m); > + p2m->defer_nested_flush = false; > + if ( nestedhvm_enabled(p2m->domain) ) > + p2m_flush_nestedp2m(p2m->domain); > +} > > /* > * This function is needed for two reasons: > @@ -514,6 +522,7 @@ p2m_pod_decrease_reservation(struct doma > > gfn_lock(p2m, gfn, order); > pod_lock(p2m); > + p2m->defer_nested_flush = true; > > /* > * If we don't have any outstanding PoD entries, let things take their > @@ -665,7 +674,7 @@ out_entry_check: > } > > out_unlock: > - pod_unlock(p2m); > + pod_unlock_and_flush(p2m); > gfn_unlock(p2m, gfn, order); > return ret; > } > @@ -1144,8 +1153,10 @@ p2m_pod_demand_populate(struct p2m_domai > * won't start until we're done. 
> */ > if ( unlikely(d->is_dying) ) > - goto out_fail; > - > + { > + pod_unlock(p2m); > + return false; > + } > > /* > * Because PoD does not have cache list for 1GB pages, it has to remap > @@ -1167,6 +1178,8 @@ p2m_pod_demand_populate(struct p2m_domai > p2m_populate_on_demand, p2m->default_access); > } > > + p2m->defer_nested_flush = true; > + > /* Only reclaim if we're in actual need of more cache. */ > if ( p2m->pod.entry_count > p2m->pod.count ) > pod_eager_reclaim(p2m); > @@ -1229,8 +1242,9 @@ p2m_pod_demand_populate(struct p2m_domai > __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); > } > > - pod_unlock(p2m); > + pod_unlock_and_flush(p2m); > return true; > + > out_of_memory: > pod_unlock(p2m); Don't you need to set defer_nested_flush = false in the out_of_memory label? (as you don't call pod_unlock_and_flush that would do it) Thanks, Roger.
On 19.10.2021 10:09, Roger Pau Monné wrote: > On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: >> @@ -1229,8 +1242,9 @@ p2m_pod_demand_populate(struct p2m_domai >> __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); >> } >> >> - pod_unlock(p2m); >> + pod_unlock_and_flush(p2m); >> return true; >> + >> out_of_memory: >> pod_unlock(p2m); > > Don't you need to set defer_nested_flush = false in the out_of_memory > label? (as you don't call pod_unlock_and_flush that would do it) Yes of course - thanks for spotting. I had pod_unlock_and_flush() here too initially, and when switching back I forgot to convert rather than just delete that. Jan
On 19.10.2021 10:17, Jan Beulich wrote: > On 19.10.2021 10:09, Roger Pau Monné wrote: >> On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: >>> @@ -1229,8 +1242,9 @@ p2m_pod_demand_populate(struct p2m_domai >>> __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); >>> } >>> >>> - pod_unlock(p2m); >>> + pod_unlock_and_flush(p2m); >>> return true; >>> + >>> out_of_memory: >>> pod_unlock(p2m); >> >> Don't you need to set defer_nested_flush = false in the out_of_memory >> label? (as you don't call pod_unlock_and_flush that would do it) > > Yes of course - thanks for spotting. I had pod_unlock_and_flush() here > too initially, and when switching back I forgot to convert rather than > just delete that. Oh, wait, that was on purpose: There's no point clearing the flag when the next thing we do is invoke domain_crash(). If it wasn't that way, I don't think I could avoid using pod_unlock_and_flush() here as well. Jan
On Tue, Oct 19, 2021 at 10:19:57AM +0200, Jan Beulich wrote: > On 19.10.2021 10:17, Jan Beulich wrote: > > On 19.10.2021 10:09, Roger Pau Monné wrote: > >> On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: > >>> @@ -1229,8 +1242,9 @@ p2m_pod_demand_populate(struct p2m_domai > >>> __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); > >>> } > >>> > >>> - pod_unlock(p2m); > >>> + pod_unlock_and_flush(p2m); > >>> return true; > >>> + > >>> out_of_memory: > >>> pod_unlock(p2m); > >> > >> Don't you need to set defer_nested_flush = false in the out_of_memory > >> label? (as you don't call pod_unlock_and_flush that would do it) > > > > Yes of course - thanks for spotting. I had pod_unlock_and_flush() here > > too initially, and when switching back I forgot to convert rather than > > just delete that. > > Oh, wait, that was on purpose: There's no point clearing the flag > when the next thing we do is invoke domain_crash(). If it wasn't > that way, I don't think I could avoid using pod_unlock_and_flush() > here as well. Oh, I see. We would need to be careful if that domain crash is ever removed. Thanks, Roger.
On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: > With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> > write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock > order violation when the PoD lock is held around it. Hence such flushing > needs to be deferred. Steal the approach from p2m_change_type_range(). > > Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> > ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. I'm slightly worried by this path because it doesn't seem to acknowledge defer_nested_flush. Maybe the flag should be checked by p2m_flush_nestedp2m instead of leaving it to the callers? Thanks, Roger.
On 19.10.2021 12:39, Roger Pau Monné wrote: > On Tue, Oct 19, 2021 at 10:19:57AM +0200, Jan Beulich wrote: >> On 19.10.2021 10:17, Jan Beulich wrote: >>> On 19.10.2021 10:09, Roger Pau Monné wrote: >>>> On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: >>>>> @@ -1229,8 +1242,9 @@ p2m_pod_demand_populate(struct p2m_domai >>>>> __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); >>>>> } >>>>> >>>>> - pod_unlock(p2m); >>>>> + pod_unlock_and_flush(p2m); >>>>> return true; >>>>> + >>>>> out_of_memory: >>>>> pod_unlock(p2m); >>>> >>>> Don't you need to set defer_nested_flush = false in the out_of_memory >>>> label? (as you don't call pod_unlock_and_flush that would do it) >>> >>> Yes of course - thanks for spotting. I had pod_unlock_and_flush() here >>> too initially, and when switching back I forgot to convert rather than >>> just delete that. >> >> Oh, wait, that was on purpose: There's no point clearing the flag >> when the next thing we do is invoke domain_crash(). If it wasn't >> that way, I don't think I could avoid using pod_unlock_and_flush() >> here as well. > > Oh, I see. We would need to be careful if that domain crash is ever > removed. Well, I can change the call there as well. Doing so would seem preferable over adding a respective comment there. I didn't do so originally because as it stands this would be meaningless code churn. Let me know what you think. Jan
On 19.10.2021 12:41, Roger Pau Monné wrote: > On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: >> With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> >> write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock >> order violation when the PoD lock is held around it. Hence such flushing >> needs to be deferred. Steal the approach from p2m_change_type_range(). >> >> Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> >> ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. > > I'm slightly worried by this path because it doesn't seem to > acknowledge defer_nested_flush. Oh, yes. Iirc I did look at that logic, write down the remark, and then forgot to add the conditional to ept_sync_domain_prepare(). The interactions with the real (host) flush are kind of ugly there, though - we then further depend on the ->defer_flush / ->need_flush logic, which is EPT-only. But I think I've convinced myself that this ought to work out. > Maybe the flag should be checked by > p2m_flush_nestedp2m instead of leaving it to the callers? I'm not sure this would be a good idea, as there are more callers. Jan
On Tue, Oct 19, 2021 at 01:58:38PM +0200, Jan Beulich wrote: > On 19.10.2021 12:41, Roger Pau Monné wrote: > > On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: > >> With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> > >> write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock > >> order violation when the PoD lock is held around it. Hence such flushing > >> needs to be deferred. Steal the approach from p2m_change_type_range(). > >> > >> Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> > >> ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. > > > > I'm slightly worried by this path because it doesn't seem to > > acknowledge defer_nested_flush. > > Oh, yes. Iirc I did look at that logic, write down the remark, and > then forgot to add the conditional to ept_sync_domain_prepare(). > The interactions with the real (host) flush are kind of ugly there, > though - we then further depend on the ->defer_flush / ->need_flush > logic, which is EPT-only. But I think I've convinced myself that > this ought to work out. > > > Maybe the flag should be checked by > > p2m_flush_nestedp2m instead of leaving it to the callers? > > I'm not sure this would be a good idea, as there are more callers. We should maybe add an ASSERT to p2m_flush_nestedp2m to make sure it's not called with defer_nested_flush being set then, or else it's possible for new callers of p2m_flush_nestedp2m to forget checking defer_nested_flush. Thanks, Roger.
On 19.10.2021 14:59, Roger Pau Monné wrote: > On Tue, Oct 19, 2021 at 01:58:38PM +0200, Jan Beulich wrote: >> On 19.10.2021 12:41, Roger Pau Monné wrote: >>> On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: >>>> With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> >>>> write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock >>>> order violation when the PoD lock is held around it. Hence such flushing >>>> needs to be deferred. Steal the approach from p2m_change_type_range(). >>>> >>>> Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> >>>> ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. >>> >>> I'm slightly worried by this path because it doesn't seem to >>> acknowledge defer_nested_flush. >> >> Oh, yes. Iirc I did look at that logic, write down the remark, and >> then forgot to add the conditional to ept_sync_domain_prepare(). >> The interactions with the real (host) flush are kind of ugly there, >> though - we then further depend on the ->defer_flush / ->need_flush >> logic, which is EPT-only. But I think I've convinced myself that >> this ought to work out. >> >>> Maybe the flag should be checked by >>> p2m_flush_nestedp2m instead of leaving it to the callers? >> >> I'm not sure this would be a good idea, as there are more callers. > > We should maybe add an ASSERT to p2m_flush_nestedp2m to make sure it's > not called with defer_nested_flush being set then, or else it's > possible for new callers of p2m_flush_nestedp2m to forget checking > defer_nested_flush. I'm afraid we can't do that, or at least not this easily: The flag assumes the p2m lock to be held when it gets set. Hence callers not holding the lock (hvm_set_efer(), nvmx_handle_invept()) may trigger such an assert just because another CPU set the flag. Jan
On Tue, Oct 19, 2021 at 03:14:25PM +0200, Jan Beulich wrote: > On 19.10.2021 14:59, Roger Pau Monné wrote: > > On Tue, Oct 19, 2021 at 01:58:38PM +0200, Jan Beulich wrote: > >> On 19.10.2021 12:41, Roger Pau Monné wrote: > >>> On Mon, Oct 11, 2021 at 10:17:08AM +0200, Jan Beulich wrote: > >>>> With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> > >>>> write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock > >>>> order violation when the PoD lock is held around it. Hence such flushing > >>>> needs to be deferred. Steal the approach from p2m_change_type_range(). > >>>> > >>>> Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> > >>>> ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. > >>> > >>> I'm slightly worried by this path because it doesn't seem to > >>> acknowledge defer_nested_flush. > >> > >> Oh, yes. Iirc I did look at that logic, write down the remark, and > >> then forgot to add the conditional to ept_sync_domain_prepare(). > >> The interactions with the real (host) flush are kind of ugly there, > >> though - we then further depend on the ->defer_flush / ->need_flush > >> logic, which is EPT-only. But I think I've convinced myself that > >> this ought to work out. > >> > >>> Maybe the flag should be checked by > >>> p2m_flush_nestedp2m instead of leaving it to the callers? > >> > >> I'm not sure this would be a good idea, as there are more callers. > > > > We should maybe add an ASSERT to p2m_flush_nestedp2m to make sure it's > > not called with defer_nested_flush being set then, or else it's > > possible for new callers of p2m_flush_nestedp2m to forget checking > > defer_nested_flush. > > I'm afraid we can't do that, or at least not this easily: The flag > assumes the p2m lock to be held when it gets set. Hence callers not > holding the lock (hvm_set_efer(), nvmx_handle_invept()) may trigger > such an assert just because another CPU set the flag. Hm, indeed. 
Forcing those to take the p2m lock might be too much overhead for the sake of correctness. Thanks, Roger.
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -24,6 +24,7 @@
 #include <xen/mm.h>
 #include <xen/sched.h>
 #include <xen/trace.h>
+#include <asm/hvm/nestedhvm.h>
 #include <asm/page.h>
 #include <asm/paging.h>
 #include <asm/p2m.h>
@@ -494,6 +495,13 @@ p2m_pod_offline_or_broken_replace(struct
 static int
 p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn);
 
+static void pod_unlock_and_flush(struct p2m_domain *p2m)
+{
+    pod_unlock(p2m);
+    p2m->defer_nested_flush = false;
+    if ( nestedhvm_enabled(p2m->domain) )
+        p2m_flush_nestedp2m(p2m->domain);
+}
 
 /*
  * This function is needed for two reasons:
@@ -514,6 +522,7 @@ p2m_pod_decrease_reservation(struct doma
 
     gfn_lock(p2m, gfn, order);
     pod_lock(p2m);
+    p2m->defer_nested_flush = true;
 
     /*
      * If we don't have any outstanding PoD entries, let things take their
@@ -665,7 +674,7 @@ out_entry_check:
     }
 
 out_unlock:
-    pod_unlock(p2m);
+    pod_unlock_and_flush(p2m);
     gfn_unlock(p2m, gfn, order);
     return ret;
 }
@@ -1144,8 +1153,10 @@ p2m_pod_demand_populate(struct p2m_domai
      * won't start until we're done.
      */
     if ( unlikely(d->is_dying) )
-        goto out_fail;
-
+    {
+        pod_unlock(p2m);
+        return false;
+    }
 
     /*
      * Because PoD does not have cache list for 1GB pages, it has to remap
@@ -1167,6 +1178,8 @@ p2m_pod_demand_populate(struct p2m_domai
                       p2m_populate_on_demand, p2m->default_access);
     }
 
+    p2m->defer_nested_flush = true;
+
     /* Only reclaim if we're in actual need of more cache. */
     if ( p2m->pod.entry_count > p2m->pod.count )
         pod_eager_reclaim(p2m);
@@ -1229,8 +1242,9 @@ p2m_pod_demand_populate(struct p2m_domai
         __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
     }
 
-    pod_unlock(p2m);
+    pod_unlock_and_flush(p2m);
     return true;
+
 out_of_memory:
     pod_unlock(p2m);
 
@@ -1239,12 +1253,14 @@ out_of_memory:
            p2m->pod.entry_count, current->domain->domain_id);
     domain_crash(d);
     return false;
+
 out_fail:
-    pod_unlock(p2m);
+    pod_unlock_and_flush(p2m);
     return false;
+
 remap_and_retry:
     BUG_ON(order != PAGE_ORDER_2M);
-    pod_unlock(p2m);
+    pod_unlock_and_flush(p2m);
 
     /*
      * Remap this 2-meg region in singleton chunks. See the comment on the
With NPT or shadow in use, the p2m_set_entry() -> p2m_pt_set_entry() -> write_p2m_entry() -> p2m_flush_nestedp2m() call sequence triggers a lock order violation when the PoD lock is held around it. Hence such flushing needs to be deferred. Steal the approach from p2m_change_type_range(). Similarly for EPT I think ept_set_entry() -> ept_sync_domain() -> ept_sync_domain_prepare() -> p2m_flush_nestedp2m() is affected. Reported-by: Elliott Mitchell <ehem+xen@m5p.com> Signed-off-by: Jan Beulich <jbeulich@suse.com>