Message ID | 20210201232703.29275-3-andrew.cooper3@citrix.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | acquire_resource size and external IPT monitoring | expand |
On 02.02.2021 00:26, Andrew Cooper wrote: > --- a/xen/common/domain.c > +++ b/xen/common/domain.c > @@ -132,6 +132,56 @@ static void vcpu_info_reset(struct vcpu *v) > v->vcpu_info_mfn = INVALID_MFN; > } > > +static void vmtrace_free_buffer(struct vcpu *v) > +{ > + const struct domain *d = v->domain; > + struct page_info *pg = v->vmtrace.pg; > + unsigned int i; > + > + if ( !pg ) > + return; > + > + v->vmtrace.pg = NULL; > + > + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) > + { > + put_page_alloc_ref(&pg[i]); > + put_page_and_type(&pg[i]); > + } > +} > + > +static int vmtrace_alloc_buffer(struct vcpu *v) > +{ > + struct domain *d = v->domain; > + struct page_info *pg; > + unsigned int i; > + > + if ( !d->vmtrace_size ) > + return 0; > + > + pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size), > + MEMF_no_refcount); > + if ( !pg ) > + return -ENOMEM; > + > + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) > + if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) ) > + goto refcnt_err; > + > + /* > + * We must only let vmtrace_free_buffer() take any action in the success > + * case when we've taken all the refs it intends to drop. > + */ > + v->vmtrace.pg = pg; > + return 0; > + > + refcnt_err: > + while ( i-- ) > + put_page_and_type(&pg[i]); > + > + return -ENODATA; Would you mind at least logging how many pages may be leaked here? I also don't understand why you don't call put_page_alloc_ref() in the loop - that's fine to do prior to the put_page_and_type(), and will at least limit the leak. The buffer size here typically isn't insignificant, and it may be helpful to not unnecessarily defer the freeing to relinquish_resources() (assuming we will make that one also traverse the list of "extra" pages, but I understand that's not going to happen for 4.15 anymore anyway). Additionally, while I understand you're not in favor of the comments we have on all three similar code paths, I think replicating their comments here would help easily spotting (by grep-ing e.g. for "fishy") all instances that will need adjusting once we have figured a better overall solution. Jan
On 02/02/2021 09:04, Jan Beulich wrote: > On 02.02.2021 00:26, Andrew Cooper wrote: >> --- a/xen/common/domain.c >> +++ b/xen/common/domain.c >> @@ -132,6 +132,56 @@ static void vcpu_info_reset(struct vcpu *v) >> v->vcpu_info_mfn = INVALID_MFN; >> } >> >> +static void vmtrace_free_buffer(struct vcpu *v) >> +{ >> + const struct domain *d = v->domain; >> + struct page_info *pg = v->vmtrace.pg; >> + unsigned int i; >> + >> + if ( !pg ) >> + return; >> + >> + v->vmtrace.pg = NULL; >> + >> + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) >> + { >> + put_page_alloc_ref(&pg[i]); >> + put_page_and_type(&pg[i]); >> + } >> +} >> + >> +static int vmtrace_alloc_buffer(struct vcpu *v) >> +{ >> + struct domain *d = v->domain; >> + struct page_info *pg; >> + unsigned int i; >> + >> + if ( !d->vmtrace_size ) >> + return 0; >> + >> + pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size), >> + MEMF_no_refcount); >> + if ( !pg ) >> + return -ENOMEM; >> + >> + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) >> + if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) ) >> + goto refcnt_err; >> + >> + /* >> + * We must only let vmtrace_free_buffer() take any action in the success >> + * case when we've taken all the refs it intends to drop. >> + */ >> + v->vmtrace.pg = pg; >> + return 0; >> + >> + refcnt_err: >> + while ( i-- ) >> + put_page_and_type(&pg[i]); >> + >> + return -ENODATA; > Would you mind at least logging how many pages may be leaked > here? I also don't understand why you don't call > put_page_alloc_ref() in the loop - that's fine to do prior to > the put_page_and_type(), and will at least limit the leak. > The buffer size here typically isn't insignificant, and it > may be helpful to not unnecessarily defer the freeing to > relinquish_resources() (assuming we will make that one also > traverse the list of "extra" pages, but I understand that's > not going to happen for 4.15 anymore anyway). > > Additionally, while I understand you're not in favor of the > comments we have on all three similar code paths, I think > replicating their comments here would help easily spotting > (by grep-ing e.g. for "fishy") all instances that will need > adjusting once we have figured a better overall solution. How is: for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) ) /* * The domain can't possibly know about this page yet, so failure * here is a clear indication of something fishy going on. */ goto refcnt_err; /* * We must only let vmtrace_free_buffer() take any action in the success * case when we've taken all the refs it intends to drop. */ v->vmtrace.pg = pg; return 0; refcnt_err: /* * We can theoretically reach this point if someone has taken 2^43 refs on * the frames in the time the above loop takes to execute, or someone has * made a blind decrease reservation hypercall and managed to pick the * right mfn. Free the memory we safely can, and leak the rest. */ while ( i-- ) { put_page_alloc_ref(&pg[i]); put_page_and_type(&pg[i]); } return -ENODATA; this? ~Andrew
On 03.02.2021 17:04, Andrew Cooper wrote: > On 02/02/2021 09:04, Jan Beulich wrote: >> On 02.02.2021 00:26, Andrew Cooper wrote: >>> --- a/xen/common/domain.c >>> +++ b/xen/common/domain.c >>> @@ -132,6 +132,56 @@ static void vcpu_info_reset(struct vcpu *v) >>> v->vcpu_info_mfn = INVALID_MFN; >>> } >>> >>> +static void vmtrace_free_buffer(struct vcpu *v) >>> +{ >>> + const struct domain *d = v->domain; >>> + struct page_info *pg = v->vmtrace.pg; >>> + unsigned int i; >>> + >>> + if ( !pg ) >>> + return; >>> + >>> + v->vmtrace.pg = NULL; >>> + >>> + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) >>> + { >>> + put_page_alloc_ref(&pg[i]); >>> + put_page_and_type(&pg[i]); >>> + } >>> +} >>> + >>> +static int vmtrace_alloc_buffer(struct vcpu *v) >>> +{ >>> + struct domain *d = v->domain; >>> + struct page_info *pg; >>> + unsigned int i; >>> + >>> + if ( !d->vmtrace_size ) >>> + return 0; >>> + >>> + pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size), >>> + MEMF_no_refcount); >>> + if ( !pg ) >>> + return -ENOMEM; >>> + >>> + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) >>> + if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) ) >>> + goto refcnt_err; >>> + >>> + /* >>> + * We must only let vmtrace_free_buffer() take any action in the success >>> + * case when we've taken all the refs it intends to drop. >>> + */ >>> + v->vmtrace.pg = pg; >>> + return 0; >>> + >>> + refcnt_err: >>> + while ( i-- ) >>> + put_page_and_type(&pg[i]); >>> + >>> + return -ENODATA; >> Would you mind at least logging how many pages may be leaked >> here? I also don't understand why you don't call >> put_page_alloc_ref() in the loop - that's fine to do prior to >> the put_page_and_type(), and will at least limit the leak. >> The buffer size here typically isn't insignificant, and it >> may be helpful to not unnecessarily defer the freeing to >> relinquish_resources() (assuming we will make that one also >> traverse the list of "extra" pages, but I understand that's >> not going to happen for 4.15 anymore anyway). >> >> Additionally, while I understand you're not in favor of the >> comments we have on all three similar code paths, I think >> replicating their comments here would help easily spotting >> (by grep-ing e.g. for "fishy") all instances that will need >> adjusting once we have figured a better overall solution. > > How is: > > for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) > if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) ) > /* > * The domain can't possibly know about this page yet, so > failure > * here is a clear indication of something fishy going on. > */ > goto refcnt_err; > > /* > * We must only let vmtrace_free_buffer() take any action in the success > * case when we've taken all the refs it intends to drop. > */ > v->vmtrace.pg = pg; > return 0; > > refcnt_err: > /* > * We can theoretically reach this point if someone has taken 2^43 > refs on > * the frames in the time the above loop takes to execute, or > someone has > * made a blind decrease reservation hypercall and managed to pick the > * right mfn. Free the memory we safely can, and leak the rest. > */ > while ( i-- ) > { > put_page_alloc_ref(&pg[i]); > put_page_and_type(&pg[i]); > } > > return -ENODATA; > > this? Much better, thanks. Remains the question of logging the suspected leak of perhaps many pages. But either way Acked-by: Jan Beulich <jbeulich@suse.com> Jan
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index b9ba04633e..6c7ee25f3b 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -660,6 +660,29 @@ int arch_sanitise_domain_config(struct xen_domctl_createdomain *config) return -EINVAL; } + if ( config->vmtrace_size ) + { + unsigned int size = config->vmtrace_size; + + ASSERT(vmtrace_available); /* Checked by common code. */ + + /* + * For now, vmtrace is restricted to HVM guests, and using a + * power-of-2 buffer between 4k and 64M in size. + */ + if ( !hvm ) + { + dprintk(XENLOG_INFO, "vmtrace not supported for PV\n"); + return -EINVAL; + } + + if ( size < PAGE_SIZE || size > MB(64) || (size & (size - 1)) ) + { + dprintk(XENLOG_INFO, "Unsupported vmtrace size: %#x\n", size); + return -EINVAL; + } + } + return 0; } diff --git a/xen/common/domain.c b/xen/common/domain.c index d1e94d88cf..b6f8d2f536 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -132,6 +132,56 @@ static void vcpu_info_reset(struct vcpu *v) v->vcpu_info_mfn = INVALID_MFN; } +static void vmtrace_free_buffer(struct vcpu *v) +{ + const struct domain *d = v->domain; + struct page_info *pg = v->vmtrace.pg; + unsigned int i; + + if ( !pg ) + return; + + v->vmtrace.pg = NULL; + + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) + { + put_page_alloc_ref(&pg[i]); + put_page_and_type(&pg[i]); + } +} + +static int vmtrace_alloc_buffer(struct vcpu *v) +{ + struct domain *d = v->domain; + struct page_info *pg; + unsigned int i; + + if ( !d->vmtrace_size ) + return 0; + + pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size), + MEMF_no_refcount); + if ( !pg ) + return -ENOMEM; + + for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ ) + if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) ) + goto refcnt_err; + + /* + * We must only let vmtrace_free_buffer() take any action in the success + * case when we've taken all the refs it intends to drop. + */ + v->vmtrace.pg = pg; + return 0; + + refcnt_err: + while ( i-- ) + put_page_and_type(&pg[i]); + + return -ENODATA; +} + /* * Release resources held by a vcpu. There may or may not be live references * to the vcpu, and it may or may not be fully constructed. @@ -140,6 +190,8 @@ static void vcpu_info_reset(struct vcpu *v) */ static int vcpu_teardown(struct vcpu *v) { + vmtrace_free_buffer(v); + return 0; } @@ -201,6 +253,9 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id) if ( sched_init_vcpu(v) != 0 ) goto fail_wq; + if ( vmtrace_alloc_buffer(v) != 0 ) + goto fail_wq; + if ( arch_vcpu_create(v) != 0 ) goto fail_sched; @@ -449,6 +504,12 @@ static int sanitise_domain_config(struct xen_domctl_createdomain *config) } } + if ( config->vmtrace_size && !vmtrace_available ) + { + dprintk(XENLOG_INFO, "vmtrace requested but not available\n"); + return -EINVAL; + } + return arch_sanitise_domain_config(config); } @@ -474,7 +535,10 @@ struct domain *domain_create(domid_t domid, ASSERT(is_system_domain(d) ? config == NULL : config != NULL); if ( config ) + { d->options = config->flags; + d->vmtrace_size = config->vmtrace_size; + } /* Sort out our idea of is_control_domain(). */ d->is_privileged = is_priv; diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h index 666aeb71bf..88a5b1ef5d 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -95,6 +95,9 @@ struct xen_domctl_createdomain { int32_t max_grant_frames; int32_t max_maptrack_frames; + /* Per-vCPU buffer size in bytes. 0 to disable. */ + uint32_t vmtrace_size; + struct xen_arch_domainconfig arch; }; diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 06dba1a397..bc78a09a53 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -272,6 +272,10 @@ struct vcpu /* vPCI per-vCPU area, used to store data for long running operations. */ struct vpci_vcpu vpci; + struct { + struct page_info *pg; /* One contiguous allocation of d->vmtrace_size */ + } vmtrace; + struct arch_vcpu arch; #ifdef CONFIG_IOREQ_SERVER @@ -547,6 +551,8 @@ struct domain unsigned int guest_request_sync : 1; } monitor; + unsigned int vmtrace_size; /* Buffer size in bytes, or 0 to disable. */ + #ifdef CONFIG_ARGO /* Argo interdomain communication support */ struct argo_domain *argo;