diff mbox series

[v9,02/11] xen/domain: Add vmtrace_size domain creation parameter

Message ID 20210201232703.29275-3-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Headers show
Series acquire_resource size and external IPT monitoring | expand

Commit Message

Andrew Cooper Feb. 1, 2021, 11:26 p.m. UTC
From: Michał Leszczyński <michal.leszczynski@cert.pl>

To use vmtrace, buffers of a suitable size need allocating, and different
tasks will want different sizes.

Add a domain creation parameter, and audit it appropriately in the
{arch_,}sanitise_domain_config() functions.

For now, the x86 specific auditing is tuned to Processor Trace running in
Single Output mode, which requires a single contiguous range of memory.

The size is given an arbitrary limit of 64M which is expected to be enough for
anticipated usecases, but not large enough to get into long-running-hypercall
problems.

Signed-off-by: Michał Leszczyński <michal.leszczynski@cert.pl>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Anthony PERARD <anthony.perard@citrix.com>
CC: Michał Leszczyński <michal.leszczynski@cert.pl>
CC: Tamas K Lengyel <tamas@tklengyel.com>

When support for later generations of IPT get added, we can in principle start
to use ToTP which is a scatter list of smaller trace regions to use, if we
need to massively up the buffer size available.

v9:
 * Drop misleading comments in vmtrace_alloc_buffer().  Memory still gets
   leaked in theoretical corner cases, but the pattern needs fixing across the
   board when we figure out a solution.

v8:
 * Rename vmtrace_frames to vmtrace_size.  Reposition to fill a hole.
 * Rename vmtrace.buf to vmtrace.pg.
 * Rework the refcounting logic and comment it *very* clearly.

v7:
 * Major chop&change within the series.
 * Use the name 'vmtrace' consistently.
 * Use the (new) common vcpu_teardown() functionality, rather than leaving a
   latent memory leak on ARM.
---
 xen/arch/x86/domain.c       | 23 ++++++++++++++++
 xen/common/domain.c         | 64 +++++++++++++++++++++++++++++++++++++++++++++
 xen/include/public/domctl.h |  3 +++
 xen/include/xen/sched.h     |  6 +++++
 4 files changed, 96 insertions(+)

Comments

Jan Beulich Feb. 2, 2021, 9:04 a.m. UTC | #1
On 02.02.2021 00:26, Andrew Cooper wrote:
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -132,6 +132,56 @@ static void vcpu_info_reset(struct vcpu *v)
>      v->vcpu_info_mfn = INVALID_MFN;
>  }
>  
> +static void vmtrace_free_buffer(struct vcpu *v)
> +{
> +    const struct domain *d = v->domain;
> +    struct page_info *pg = v->vmtrace.pg;
> +    unsigned int i;
> +
> +    if ( !pg )
> +        return;
> +
> +    v->vmtrace.pg = NULL;
> +
> +    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
> +    {
> +        put_page_alloc_ref(&pg[i]);
> +        put_page_and_type(&pg[i]);
> +    }
> +}
> +
> +static int vmtrace_alloc_buffer(struct vcpu *v)
> +{
> +    struct domain *d = v->domain;
> +    struct page_info *pg;
> +    unsigned int i;
> +
> +    if ( !d->vmtrace_size )
> +        return 0;
> +
> +    pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size),
> +                             MEMF_no_refcount);
> +    if ( !pg )
> +        return -ENOMEM;
> +
> +    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
> +        if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
> +            goto refcnt_err;
> +
> +    /*
> +     * We must only let vmtrace_free_buffer() take any action in the success
> +     * case when we've taken all the refs it intends to drop.
> +     */
> +    v->vmtrace.pg = pg;
> +    return 0;
> +
> + refcnt_err:
> +    while ( i-- )
> +        put_page_and_type(&pg[i]);
> +
> +    return -ENODATA;

Would you mind at least logging how many pages may be leaked
here? I also don't understand why you don't call
put_page_alloc_ref() in the loop - that's fine to do prior to
the put_page_and_type(), and will at least limit the leak.
The buffer size here typically isn't insignificant, and it
may be helpful to not unnecessarily defer the freeing to
relinquish_resources() (assuming we will make that one also
traverse the list of "extra" pages, but I understand that's
not going to happen for 4.15 anymore anyway).

Additionally, while I understand you're not in favor of the
comments we have on all three similar code paths, I think
replicating their comments here would help easily spotting
(by grep-ing e.g. for "fishy") all instances that will need
adjusting once we have figured a better overall solution.

Jan
Andrew Cooper Feb. 3, 2021, 4:04 p.m. UTC | #2
On 02/02/2021 09:04, Jan Beulich wrote:
> On 02.02.2021 00:26, Andrew Cooper wrote:
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -132,6 +132,56 @@ static void vcpu_info_reset(struct vcpu *v)
>>      v->vcpu_info_mfn = INVALID_MFN;
>>  }
>>  
>> +static void vmtrace_free_buffer(struct vcpu *v)
>> +{
>> +    const struct domain *d = v->domain;
>> +    struct page_info *pg = v->vmtrace.pg;
>> +    unsigned int i;
>> +
>> +    if ( !pg )
>> +        return;
>> +
>> +    v->vmtrace.pg = NULL;
>> +
>> +    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
>> +    {
>> +        put_page_alloc_ref(&pg[i]);
>> +        put_page_and_type(&pg[i]);
>> +    }
>> +}
>> +
>> +static int vmtrace_alloc_buffer(struct vcpu *v)
>> +{
>> +    struct domain *d = v->domain;
>> +    struct page_info *pg;
>> +    unsigned int i;
>> +
>> +    if ( !d->vmtrace_size )
>> +        return 0;
>> +
>> +    pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size),
>> +                             MEMF_no_refcount);
>> +    if ( !pg )
>> +        return -ENOMEM;
>> +
>> +    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
>> +        if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
>> +            goto refcnt_err;
>> +
>> +    /*
>> +     * We must only let vmtrace_free_buffer() take any action in the success
>> +     * case when we've taken all the refs it intends to drop.
>> +     */
>> +    v->vmtrace.pg = pg;
>> +    return 0;
>> +
>> + refcnt_err:
>> +    while ( i-- )
>> +        put_page_and_type(&pg[i]);
>> +
>> +    return -ENODATA;
> Would you mind at least logging how many pages may be leaked
> here? I also don't understand why you don't call
> put_page_alloc_ref() in the loop - that's fine to do prior to
> the put_page_and_type(), and will at least limit the leak.
> The buffer size here typically isn't insignificant, and it
> may be helpful to not unnecessarily defer the freeing to
> relinquish_resources() (assuming we will make that one also
> traverse the list of "extra" pages, but I understand that's
> not going to happen for 4.15 anymore anyway).
>
> Additionally, while I understand you're not in favor of the
> comments we have on all three similar code paths, I think
> replicating their comments here would help easily spotting
> (by grep-ing e.g. for "fishy") all instances that will need
> adjusting once we have figured a better overall solution.

How is:

    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
        if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
            /*
             * The domain can't possibly know about this page yet, so
failure
             * here is a clear indication of something fishy going on.
             */
            goto refcnt_err;

    /*
     * We must only let vmtrace_free_buffer() take any action in the success
     * case when we've taken all the refs it intends to drop.
     */
    v->vmtrace.pg = pg;
    return 0;

 refcnt_err:
    /*
     * We can theoretically reach this point if someone has taken 2^43
refs on
     * the frames in the time the above loop takes to execute, or
someone has
     * made a blind decrease reservation hypercall and managed to pick the
     * right mfn.  Free the memory we safely can, and leak the rest.
     */
    while ( i-- )
    {
        put_page_alloc_ref(&pg[i]);
        put_page_and_type(&pg[i]);
    }

    return -ENODATA;

this?

~Andrew
Jan Beulich Feb. 4, 2021, 11:11 a.m. UTC | #3
On 03.02.2021 17:04, Andrew Cooper wrote:
> On 02/02/2021 09:04, Jan Beulich wrote:
>> On 02.02.2021 00:26, Andrew Cooper wrote:
>>> --- a/xen/common/domain.c
>>> +++ b/xen/common/domain.c
>>> @@ -132,6 +132,56 @@ static void vcpu_info_reset(struct vcpu *v)
>>>      v->vcpu_info_mfn = INVALID_MFN;
>>>  }
>>>  
>>> +static void vmtrace_free_buffer(struct vcpu *v)
>>> +{
>>> +    const struct domain *d = v->domain;
>>> +    struct page_info *pg = v->vmtrace.pg;
>>> +    unsigned int i;
>>> +
>>> +    if ( !pg )
>>> +        return;
>>> +
>>> +    v->vmtrace.pg = NULL;
>>> +
>>> +    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
>>> +    {
>>> +        put_page_alloc_ref(&pg[i]);
>>> +        put_page_and_type(&pg[i]);
>>> +    }
>>> +}
>>> +
>>> +static int vmtrace_alloc_buffer(struct vcpu *v)
>>> +{
>>> +    struct domain *d = v->domain;
>>> +    struct page_info *pg;
>>> +    unsigned int i;
>>> +
>>> +    if ( !d->vmtrace_size )
>>> +        return 0;
>>> +
>>> +    pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size),
>>> +                             MEMF_no_refcount);
>>> +    if ( !pg )
>>> +        return -ENOMEM;
>>> +
>>> +    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
>>> +        if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
>>> +            goto refcnt_err;
>>> +
>>> +    /*
>>> +     * We must only let vmtrace_free_buffer() take any action in the success
>>> +     * case when we've taken all the refs it intends to drop.
>>> +     */
>>> +    v->vmtrace.pg = pg;
>>> +    return 0;
>>> +
>>> + refcnt_err:
>>> +    while ( i-- )
>>> +        put_page_and_type(&pg[i]);
>>> +
>>> +    return -ENODATA;
>> Would you mind at least logging how many pages may be leaked
>> here? I also don't understand why you don't call
>> put_page_alloc_ref() in the loop - that's fine to do prior to
>> the put_page_and_type(), and will at least limit the leak.
>> The buffer size here typically isn't insignificant, and it
>> may be helpful to not unnecessarily defer the freeing to
>> relinquish_resources() (assuming we will make that one also
>> traverse the list of "extra" pages, but I understand that's
>> not going to happen for 4.15 anymore anyway).
>>
>> Additionally, while I understand you're not in favor of the
>> comments we have on all three similar code paths, I think
>> replicating their comments here would help easily spotting
>> (by grep-ing e.g. for "fishy") all instances that will need
>> adjusting once we have figured a better overall solution.
> 
> How is:
> 
>     for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
>         if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
>             /*
>              * The domain can't possibly know about this page yet, so
> failure
>              * here is a clear indication of something fishy going on.
>              */
>             goto refcnt_err;
> 
>     /*
>      * We must only let vmtrace_free_buffer() take any action in the success
>      * case when we've taken all the refs it intends to drop.
>      */
>     v->vmtrace.pg = pg;
>     return 0;
> 
>  refcnt_err:
>     /*
>      * We can theoretically reach this point if someone has taken 2^43
> refs on
>      * the frames in the time the above loop takes to execute, or
> someone has
>      * made a blind decrease reservation hypercall and managed to pick the
>      * right mfn.  Free the memory we safely can, and leak the rest.
>      */
>     while ( i-- )
>     {
>         put_page_alloc_ref(&pg[i]);
>         put_page_and_type(&pg[i]);
>     }
> 
>     return -ENODATA;
> 
> this?

Much better, thanks. Remains the question of logging the
suspected leak of perhaps many pages. But either way
Acked-by: Jan Beulich <jbeulich@suse.com>

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index b9ba04633e..6c7ee25f3b 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -660,6 +660,29 @@  int arch_sanitise_domain_config(struct xen_domctl_createdomain *config)
         return -EINVAL;
     }
 
+    if ( config->vmtrace_size )
+    {
+        unsigned int size = config->vmtrace_size;
+
+        ASSERT(vmtrace_available); /* Checked by common code. */
+
+        /*
+         * For now, vmtrace is restricted to HVM guests, and using a
+         * power-of-2 buffer between 4k and 64M in size.
+         */
+        if ( !hvm )
+        {
+            dprintk(XENLOG_INFO, "vmtrace not supported for PV\n");
+            return -EINVAL;
+        }
+
+        if ( size < PAGE_SIZE || size > MB(64) || (size & (size - 1)) )
+        {
+            dprintk(XENLOG_INFO, "Unsupported vmtrace size: %#x\n", size);
+            return -EINVAL;
+        }
+    }
+
     return 0;
 }
 
diff --git a/xen/common/domain.c b/xen/common/domain.c
index d1e94d88cf..b6f8d2f536 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -132,6 +132,56 @@  static void vcpu_info_reset(struct vcpu *v)
     v->vcpu_info_mfn = INVALID_MFN;
 }
 
+static void vmtrace_free_buffer(struct vcpu *v)
+{
+    const struct domain *d = v->domain;
+    struct page_info *pg = v->vmtrace.pg;
+    unsigned int i;
+
+    if ( !pg )
+        return;
+
+    v->vmtrace.pg = NULL;
+
+    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
+    {
+        put_page_alloc_ref(&pg[i]);
+        put_page_and_type(&pg[i]);
+    }
+}
+
+static int vmtrace_alloc_buffer(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct page_info *pg;
+    unsigned int i;
+
+    if ( !d->vmtrace_size )
+        return 0;
+
+    pg = alloc_domheap_pages(d, get_order_from_bytes(d->vmtrace_size),
+                             MEMF_no_refcount);
+    if ( !pg )
+        return -ENOMEM;
+
+    for ( i = 0; i < (d->vmtrace_size >> PAGE_SHIFT); i++ )
+        if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
+            goto refcnt_err;
+
+    /*
+     * We must only let vmtrace_free_buffer() take any action in the success
+     * case when we've taken all the refs it intends to drop.
+     */
+    v->vmtrace.pg = pg;
+    return 0;
+
+ refcnt_err:
+    while ( i-- )
+        put_page_and_type(&pg[i]);
+
+    return -ENODATA;
+}
+
 /*
  * Release resources held by a vcpu.  There may or may not be live references
  * to the vcpu, and it may or may not be fully constructed.
@@ -140,6 +190,8 @@  static void vcpu_info_reset(struct vcpu *v)
  */
 static int vcpu_teardown(struct vcpu *v)
 {
+    vmtrace_free_buffer(v);
+
     return 0;
 }
 
@@ -201,6 +253,9 @@  struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
     if ( sched_init_vcpu(v) != 0 )
         goto fail_wq;
 
+    if ( vmtrace_alloc_buffer(v) != 0 )
+        goto fail_wq;
+
     if ( arch_vcpu_create(v) != 0 )
         goto fail_sched;
 
@@ -449,6 +504,12 @@  static int sanitise_domain_config(struct xen_domctl_createdomain *config)
         }
     }
 
+    if ( config->vmtrace_size && !vmtrace_available )
+    {
+        dprintk(XENLOG_INFO, "vmtrace requested but not available\n");
+        return -EINVAL;
+    }
+
     return arch_sanitise_domain_config(config);
 }
 
@@ -474,7 +535,10 @@  struct domain *domain_create(domid_t domid,
     ASSERT(is_system_domain(d) ? config == NULL : config != NULL);
 
     if ( config )
+    {
         d->options = config->flags;
+        d->vmtrace_size = config->vmtrace_size;
+    }
 
     /* Sort out our idea of is_control_domain(). */
     d->is_privileged = is_priv;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 666aeb71bf..88a5b1ef5d 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -95,6 +95,9 @@  struct xen_domctl_createdomain {
     int32_t max_grant_frames;
     int32_t max_maptrack_frames;
 
+    /* Per-vCPU buffer size in bytes.  0 to disable. */
+    uint32_t vmtrace_size;
+
     struct xen_arch_domainconfig arch;
 };
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 06dba1a397..bc78a09a53 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -272,6 +272,10 @@  struct vcpu
     /* vPCI per-vCPU area, used to store data for long running operations. */
     struct vpci_vcpu vpci;
 
+    struct {
+        struct page_info *pg; /* One contiguous allocation of d->vmtrace_size */
+    } vmtrace;
+
     struct arch_vcpu arch;
 
 #ifdef CONFIG_IOREQ_SERVER
@@ -547,6 +551,8 @@  struct domain
         unsigned int guest_request_sync          : 1;
     } monitor;
 
+    unsigned int vmtrace_size; /* Buffer size in bytes, or 0 to disable. */
+
 #ifdef CONFIG_ARGO
     /* Argo interdomain communication support */
     struct argo_domain *argo;