diff mbox series

[v7,06/10] xen/domctl: Add XEN_DOMCTL_vmtrace_op

Message ID 20210121212718.2441-7-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Headers show
Series Implement support for external IPT monitoring | expand

Commit Message

Andrew Cooper Jan. 21, 2021, 9:27 p.m. UTC
From: Michał Leszczyński <michal.leszczynski@cert.pl>

Implement an interface to configure and control tracing operations.  Reuse the
existing SETDEBUGGING flask vector rather than inventing a new one.

Userspace using this interface is going to need platform specific knowledge
anyway to interpret the contents of the trace buffer.  While some operations
(e.g. enable/disable) can reasonably be generic, others cannot.  Provide an
explicitly-platform specific pair of get/set operations to reduce API churn as
new options get added/enabled.

For the VMX specific Processor Trace implementation, tolerate reading and
modifying a safe subset of bits in CTL, STATUS and OUTPUT_MASK.  This permits
userspace to control the content which gets logged, but prevents modification
of details such as the position/size of the output buffer.

Signed-off-by: Michał Leszczyński <michal.leszczynski@cert.pl>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Jun Nakajima <jun.nakajima@intel.com>
CC: Kevin Tian <kevin.tian@intel.com>
CC: Michał Leszczyński <michal.leszczynski@cert.pl>
CC: Tamas K Lengyel <tamas@tklengyel.com>

v7:
 * Major chop&change within the series.
---
 xen/arch/x86/domctl.c         |  55 +++++++++++++++
 xen/arch/x86/hvm/vmx/vmx.c    | 151 ++++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/hvm.h |  63 ++++++++++++++++++
 xen/include/public/domctl.h   |  35 ++++++++++
 xen/xsm/flask/hooks.c         |   1 +
 5 files changed, 305 insertions(+)

Comments

Jan Beulich Jan. 26, 2021, 2:18 p.m. UTC | #1
On 21.01.2021 22:27, Andrew Cooper wrote:
> --- a/xen/arch/x86/domctl.c
> +++ b/xen/arch/x86/domctl.c
> @@ -155,6 +155,55 @@ void arch_get_domain_info(const struct domain *d,
>      info->arch_config.emulation_flags = d->arch.emulation_flags;
>  }
>  
> +static int do_vmtrace_op(struct domain *d, struct xen_domctl_vmtrace_op *op,
> +                         XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
> +{
> +    struct vcpu *v;
> +    int rc;
> +
> +    if ( !d->vmtrace_frames || d == current->domain /* No vcpu_pause() */ )
> +        return -EINVAL;
> +
> +    ASSERT(is_hvm_domain(d)); /* Restricted by domain creation logic. */
> +
> +    v = domain_vcpu(d, op->vcpu);
> +    if ( !v )
> +        return -ENOENT;
> +
> +    vcpu_pause(v);
> +    switch ( op->cmd )
> +    {
> +    case XEN_DOMCTL_vmtrace_enable:
> +    case XEN_DOMCTL_vmtrace_disable:
> +    case XEN_DOMCTL_vmtrace_reset_and_enable:
> +        rc = hvm_vmtrace_control(
> +            v, op->cmd != XEN_DOMCTL_vmtrace_disable,
> +            op->cmd == XEN_DOMCTL_vmtrace_reset_and_enable);
> +        break;
> +
> +    case XEN_DOMCTL_vmtrace_output_position:
> +        rc = hvm_vmtrace_output_position(v, &op->value);
> +        if ( rc >= 0 )
> +            rc = 0;

So vmtrace_output_position() effectively returns a boolean, and
there is no other caller of it afaics. I understand the hook and
function return int to allow for error indicators. But what's
the purpose of returning ipt_active when the only caller doesn't
care?

> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2261,6 +2261,153 @@ static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
>      return true;
>  }
>  
> +static int vmtrace_get_option(struct vcpu *v, uint64_t key, uint64_t *output)
> +{
> +    const struct vcpu_msrs *msrs = v->arch.msrs;
> +
> +    /*
> +     * We only let vmtrace agents see and modify a subset of bits in
> +     * MSR_RTIT_CTL.  These all pertain to date emitted into the trace

s/date/data/ ?

> +     * buffer(s).  Must not include controls pertaining to the
> +     * structure/position of the trace buffer(s).
> +     */
> +#define RTIT_CTL_MASK                                                   \
> +    (RTIT_CTL_TRACE_EN | RTIT_CTL_OS | RTIT_CTL_USR | RTIT_CTL_TSC_EN | \
> +     RTIT_CTL_DIS_RETC | RTIT_CTL_BRANCH_EN)
> +
> +    /*
> +     * Status bits restricted to the first-gen subset (i.e. no further CPUID
> +     * requirements.)
> +     */
> +#define RTIT_STATUS_MASK \
> +    (RTIT_STATUS_FILTER_EN | RTIT_STATUS_CONTEXT_EN | RTIT_STATUS_TRIGGER_EN | \
> +     RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED)

The placement of these two #define-s kind of suggests they're
intended for this function only, but the next one (at least)
also uses them. May I suggest to move these ahead of this
function?

> +static int vmtrace_set_option(struct vcpu *v, uint64_t key, uint64_t value)
> +{
> +    struct vcpu_msrs *msrs = v->arch.msrs;
> +    bool new_en, old_en = msrs->rtit.ctl & RTIT_CTL_TRACE_EN;
> +
> +    switch ( key )
> +    {
> +    case MSR_RTIT_OUTPUT_MASK:
> +        /*
> +         * MSR_RTIT_OUTPUT_MASK, when using Single Output mode, has a limit
> +         * field in the lower 32 bits, and an offset in the upper 32 bits.
> +         *
> +         * Limit is fixed by the vmtrace buffer size and must not be
> +         * controlled by userspace, while offset must be within the limit.
> +         *
> +         * Drop writes to the limit field to simply userspace wanting to reset
> +         * the offset by writing 0.
> +         */
> +        if ( (value >> 32) > msrs->rtit.output_limit )
> +            return -EINVAL;
> +        msrs->rtit.output_offset = value >> 32;
> +        break;
> +
> +    case MSR_RTIT_CTL:
> +        if ( value & ~RTIT_CTL_MASK )
> +            return -EINVAL;
> +
> +        msrs->rtit.ctl &= ~RTIT_CTL_MASK;
> +        msrs->rtit.ctl |= (value & RTIT_CTL_MASK);
> +        break;
> +
> +    case MSR_RTIT_STATUS:
> +        if ( value & ~RTIT_STATUS_MASK )
> +            return -EINVAL;
> +
> +        msrs->rtit.status &= ~RTIT_STATUS_MASK;
> +        msrs->rtit.status |= (value & RTIT_STATUS_MASK);
> +        break;
> +
> +    default:
> +        return -EINVAL;
> +    }
> +
> +    new_en = msrs->rtit.ctl & RTIT_CTL_TRACE_EN;
> +
> +    /* ctl.trace_en changed => update MSR load/save lists appropriately. */
> +    if ( !old_en && new_en )
> +    {
> +        if ( vmx_add_guest_msr(v, MSR_RTIT_CTL, msrs->rtit.ctl) ||
> +             vmx_add_host_load_msr(v, MSR_RTIT_CTL, 0) )
> +        {
> +            /*
> +             * The only failure cases here are failing the
> +             * singleton-per-domain memory allocation, or exceeding the space
> +             * in the allocation.  We could unwind in principle, but there is
> +             * nothing userspace can usefully do to continue using this VM.
> +             */
> +            domain_crash(v->domain);
> +            return -ENXIO;

I don't think I fully agree with the 2nd half of the last
sentence, but well, so be it then for the time being at least.
Why could userspace not decide to continue running this VM
with ipt disabled?

> +static int vmtrace_control(struct vcpu *v, bool enable, bool reset)
> +{
> +    struct vcpu_msrs *msrs = v->arch.msrs;
> +    uint64_t new_ctl;
> +    int rc;
> +
> +    if ( v->arch.hvm.vmx.ipt_active == enable )
> +        return -EINVAL;

Why is XEN_DOMCTL_vmtrace_reset_and_enable not permitted
when ipt_active is true? And, considering ...

> +    if ( reset )
> +    {
> +        msrs->rtit.status = 0;
> +        msrs->rtit.output_offset = 0;
> +    }
> +
> +    new_ctl = msrs->rtit.ctl & ~RTIT_CTL_TRACE_EN;
> +    if ( enable )
> +        new_ctl |= RTIT_CTL_TRACE_EN;
> +
> +    rc = vmtrace_set_option(v, MSR_RTIT_CTL, new_ctl);

... this is just a wrapper around a function directly
reachable via XEN_DOMCTL_vmtrace_set_option, why any
restriction at all?

> +    if ( rc )
> +        return rc;
> +
> +    v->arch.hvm.vmx.ipt_active = enable;

Shouldn't this be done in vmtrace_set_option(), to also
cover the other path leading there?

Jan
Andrew Cooper Jan. 29, 2021, 11:01 p.m. UTC | #2
On 26/01/2021 14:18, Jan Beulich wrote:
>> +static int vmtrace_set_option(struct vcpu *v, uint64_t key, uint64_t value)
>> +{
>> +    struct vcpu_msrs *msrs = v->arch.msrs;
>> +    bool new_en, old_en = msrs->rtit.ctl & RTIT_CTL_TRACE_EN;
>> +
>> +    switch ( key )
>> +    {
>> +    case MSR_RTIT_OUTPUT_MASK:
>> +        /*
>> +         * MSR_RTIT_OUTPUT_MASK, when using Single Output mode, has a limit
>> +         * field in the lower 32 bits, and an offset in the upper 32 bits.
>> +         *
>> +         * Limit is fixed by the vmtrace buffer size and must not be
>> +         * controlled by userspace, while offset must be within the limit.
>> +         *
>> +         * Drop writes to the limit field to simply userspace wanting to reset
>> +         * the offset by writing 0.
>> +         */
>> +        if ( (value >> 32) > msrs->rtit.output_limit )
>> +            return -EINVAL;
>> +        msrs->rtit.output_offset = value >> 32;
>> +        break;
>> +
>> +    case MSR_RTIT_CTL:
>> +        if ( value & ~RTIT_CTL_MASK )
>> +            return -EINVAL;
>> +
>> +        msrs->rtit.ctl &= ~RTIT_CTL_MASK;
>> +        msrs->rtit.ctl |= (value & RTIT_CTL_MASK);
>> +        break;
>> +
>> +    case MSR_RTIT_STATUS:
>> +        if ( value & ~RTIT_STATUS_MASK )
>> +            return -EINVAL;
>> +
>> +        msrs->rtit.status &= ~RTIT_STATUS_MASK;
>> +        msrs->rtit.status |= (value & RTIT_STATUS_MASK);
>> +        break;
>> +
>> +    default:
>> +        return -EINVAL;
>> +    }
>> +
>> +    new_en = msrs->rtit.ctl & RTIT_CTL_TRACE_EN;
>> +
>> +    /* ctl.trace_en changed => update MSR load/save lists appropriately. */
>> +    if ( !old_en && new_en )
>> +    {
>> +        if ( vmx_add_guest_msr(v, MSR_RTIT_CTL, msrs->rtit.ctl) ||
>> +             vmx_add_host_load_msr(v, MSR_RTIT_CTL, 0) )
>> +        {
>> +            /*
>> +             * The only failure cases here are failing the
>> +             * singleton-per-domain memory allocation, or exceeding the space
>> +             * in the allocation.  We could unwind in principle, but there is
>> +             * nothing userspace can usefully do to continue using this VM.
>> +             */
>> +            domain_crash(v->domain);
>> +            return -ENXIO;
> I don't think I fully agree with the 2nd half of the last
> sentence, but well, so be it then for the time being at least.
> Why could userspace not decide to continue running this VM
> with ipt disabled?

Technically speaking, it could.  That wouldn't malfunction.

However, it would be exceedingly poor behaviour.

One major limitation IPT has is that it can't pause on a full ring (or at
least, not in any shipping hardware yet, and this series works back to
Broadwell).  You can't just leave IPT enabled and let the VM run,
because the buffer will wrap and corrupt itself.

The driving usecase for adding IPT is introspection based.  Frequent
breaks, combined with massive trace buffers, is the best effort attempt
not to lose data.

IPT is a niche usecase - it does come with a substantial frequency hit,
and lots of userspace complexity to do anything interesting with. 
Anyone who turns it on to begin with has a usecase which totally depends
on it working.

>> +static int vmtrace_control(struct vcpu *v, bool enable, bool reset)
>> +{
>> +    struct vcpu_msrs *msrs = v->arch.msrs;
>> +    uint64_t new_ctl;
>> +    int rc;
>> +
>> +    if ( v->arch.hvm.vmx.ipt_active == enable )
>> +        return -EINVAL;
> Why is XEN_DOMCTL_vmtrace_reset_and_enable not permitted
> when ipt_active is true?

Because absolutely nothing good can come of userspace and Xen getting
out of sync with their combined idea of whether IPT is active or not.

And I really don't feel like doing an ipt_pause reference count, because
there cannot plausibly be more than one entity handling the data.

>  And, considering ...
>
>> +    if ( reset )
>> +    {
>> +        msrs->rtit.status = 0;
>> +        msrs->rtit.output_offset = 0;
>> +    }
>> +
>> +    new_ctl = msrs->rtit.ctl & ~RTIT_CTL_TRACE_EN;
>> +    if ( enable )
>> +        new_ctl |= RTIT_CTL_TRACE_EN;
>> +
>> +    rc = vmtrace_set_option(v, MSR_RTIT_CTL, new_ctl);
> ... this is just a wrapper around a function directly
> reachable via XEN_DOMCTL_vmtrace_set_option, why any
> restriction at all?

This partial alias is a consequence of the split between the platform
neutral, and platform specific parts of the interface.

It is by no means certain that such an alias would exist on other
platforms, and passing TRACE_EN to set_option() falls firmly in the
"don't do that" category IMO.

~Andrew
diff mbox series

Patch

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index b28cfe9817..aa6dfe8eed 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -155,6 +155,55 @@  void arch_get_domain_info(const struct domain *d,
     info->arch_config.emulation_flags = d->arch.emulation_flags;
 }
 
+static int do_vmtrace_op(struct domain *d, struct xen_domctl_vmtrace_op *op,
+                         XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+    struct vcpu *v;
+    int rc;
+
+    if ( !d->vmtrace_frames || d == current->domain /* No vcpu_pause() */ )
+        return -EINVAL;
+
+    ASSERT(is_hvm_domain(d)); /* Restricted by domain creation logic. */
+
+    v = domain_vcpu(d, op->vcpu);
+    if ( !v )
+        return -ENOENT;
+
+    vcpu_pause(v);
+    switch ( op->cmd )
+    {
+    case XEN_DOMCTL_vmtrace_enable:
+    case XEN_DOMCTL_vmtrace_disable:
+    case XEN_DOMCTL_vmtrace_reset_and_enable:
+        rc = hvm_vmtrace_control(
+            v, op->cmd != XEN_DOMCTL_vmtrace_disable,
+            op->cmd == XEN_DOMCTL_vmtrace_reset_and_enable);
+        break;
+
+    case XEN_DOMCTL_vmtrace_output_position:
+        rc = hvm_vmtrace_output_position(v, &op->value);
+        if ( rc >= 0 )
+            rc = 0;
+        break;
+
+    case XEN_DOMCTL_vmtrace_get_option:
+        rc = hvm_vmtrace_get_option(v, op->key, &op->value);
+        break;
+
+    case XEN_DOMCTL_vmtrace_set_option:
+        rc = hvm_vmtrace_set_option(v, op->key, op->value);
+        break;
+
+    default:
+        rc = -EOPNOTSUPP;
+        break;
+    }
+    vcpu_unpause(v);
+
+    return rc;
+}
+
 #define MAX_IOPORTS 0x10000
 
 long arch_do_domctl(
@@ -1320,6 +1369,12 @@  long arch_do_domctl(
         domain_unpause(d);
         break;
 
+    case XEN_DOMCTL_vmtrace_op:
+        ret = do_vmtrace_op(d, &domctl->u.vmtrace_op, u_domctl);
+        if ( !ret )
+            copyback = true;
+        break;
+
     default:
         ret = iommu_do_domctl(domctl, d, u_domctl);
         break;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 93121fbf27..d4e7b50b8a 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2261,6 +2261,153 @@  static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
     return true;
 }
 
+static int vmtrace_get_option(struct vcpu *v, uint64_t key, uint64_t *output)
+{
+    const struct vcpu_msrs *msrs = v->arch.msrs;
+
+    /*
+     * We only let vmtrace agents see and modify a subset of bits in
+     * MSR_RTIT_CTL.  These all pertain to data emitted into the trace
+     * buffer(s).  Must not include controls pertaining to the
+     * structure/position of the trace buffer(s).
+     */
+#define RTIT_CTL_MASK                                                   \
+    (RTIT_CTL_TRACE_EN | RTIT_CTL_OS | RTIT_CTL_USR | RTIT_CTL_TSC_EN | \
+     RTIT_CTL_DIS_RETC | RTIT_CTL_BRANCH_EN)
+
+    /*
+     * Status bits restricted to the first-gen subset (i.e. no further CPUID
+     * requirements.)
+     */
+#define RTIT_STATUS_MASK \
+    (RTIT_STATUS_FILTER_EN | RTIT_STATUS_CONTEXT_EN | RTIT_STATUS_TRIGGER_EN | \
+     RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED)
+
+    switch ( key )
+    {
+    case MSR_RTIT_OUTPUT_MASK:
+        *output = msrs->rtit.output_mask;
+        break;
+
+    case MSR_RTIT_CTL:
+        *output = msrs->rtit.ctl & RTIT_CTL_MASK;
+        break;
+
+    case MSR_RTIT_STATUS:
+        *output = msrs->rtit.status & RTIT_STATUS_MASK;
+        break;
+
+    default:
+        *output = 0;
+        return -EINVAL;
+    }
+    return 0;
+}
+
+static int vmtrace_set_option(struct vcpu *v, uint64_t key, uint64_t value)
+{
+    struct vcpu_msrs *msrs = v->arch.msrs;
+    bool new_en, old_en = msrs->rtit.ctl & RTIT_CTL_TRACE_EN;
+
+    switch ( key )
+    {
+    case MSR_RTIT_OUTPUT_MASK:
+        /*
+         * MSR_RTIT_OUTPUT_MASK, when using Single Output mode, has a limit
+         * field in the lower 32 bits, and an offset in the upper 32 bits.
+         *
+         * Limit is fixed by the vmtrace buffer size and must not be
+         * controlled by userspace, while offset must be within the limit.
+         *
+         * Drop writes to the limit field to simplify userspace wanting to reset
+         * the offset by writing 0.
+         */
+        if ( (value >> 32) > msrs->rtit.output_limit )
+            return -EINVAL;
+        msrs->rtit.output_offset = value >> 32;
+        break;
+
+    case MSR_RTIT_CTL:
+        if ( value & ~RTIT_CTL_MASK )
+            return -EINVAL;
+
+        msrs->rtit.ctl &= ~RTIT_CTL_MASK;
+        msrs->rtit.ctl |= (value & RTIT_CTL_MASK);
+        break;
+
+    case MSR_RTIT_STATUS:
+        if ( value & ~RTIT_STATUS_MASK )
+            return -EINVAL;
+
+        msrs->rtit.status &= ~RTIT_STATUS_MASK;
+        msrs->rtit.status |= (value & RTIT_STATUS_MASK);
+        break;
+
+    default:
+        return -EINVAL;
+    }
+
+    new_en = msrs->rtit.ctl & RTIT_CTL_TRACE_EN;
+
+    /* ctl.trace_en changed => update MSR load/save lists appropriately. */
+    if ( !old_en && new_en )
+    {
+        if ( vmx_add_guest_msr(v, MSR_RTIT_CTL, msrs->rtit.ctl) ||
+             vmx_add_host_load_msr(v, MSR_RTIT_CTL, 0) )
+        {
+            /*
+             * The only failure cases here are failing the
+             * singleton-per-domain memory allocation, or exceeding the space
+             * in the allocation.  We could unwind in principle, but there is
+             * nothing userspace can usefully do to continue using this VM.
+             */
+            domain_crash(v->domain);
+            return -ENXIO;
+        }
+    }
+    else if ( old_en && !new_en )
+    {
+        vmx_del_msr(v, MSR_RTIT_CTL, VMX_MSR_GUEST);
+        vmx_del_msr(v, MSR_RTIT_CTL, VMX_MSR_HOST);
+    }
+
+    return 0;
+}
+
+static int vmtrace_control(struct vcpu *v, bool enable, bool reset)
+{
+    struct vcpu_msrs *msrs = v->arch.msrs;
+    uint64_t new_ctl;
+    int rc;
+
+    if ( v->arch.hvm.vmx.ipt_active == enable )
+        return -EINVAL;
+
+    if ( reset )
+    {
+        msrs->rtit.status = 0;
+        msrs->rtit.output_offset = 0;
+    }
+
+    new_ctl = msrs->rtit.ctl & ~RTIT_CTL_TRACE_EN;
+    if ( enable )
+        new_ctl |= RTIT_CTL_TRACE_EN;
+
+    rc = vmtrace_set_option(v, MSR_RTIT_CTL, new_ctl);
+    if ( rc )
+        return rc;
+
+    v->arch.hvm.vmx.ipt_active = enable;
+
+    return 0;
+}
+
+static int vmtrace_output_position(struct vcpu *v, uint64_t *pos)
+{
+    *pos = v->arch.msrs->rtit.output_offset;
+    return v->arch.hvm.vmx.ipt_active;
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -2316,6 +2463,10 @@  static struct hvm_function_table __initdata vmx_function_table = {
     .altp2m_vcpu_update_vmfunc_ve = vmx_vcpu_update_vmfunc_ve,
     .altp2m_vcpu_emulate_ve = vmx_vcpu_emulate_ve,
     .altp2m_vcpu_emulate_vmfunc = vmx_vcpu_emulate_vmfunc,
+    .vmtrace_control = vmtrace_control,
+    .vmtrace_output_position = vmtrace_output_position,
+    .vmtrace_set_option = vmtrace_set_option,
+    .vmtrace_get_option = vmtrace_get_option,
     .tsc_scaling = {
         .max_ratio = VMX_TSC_MULTIPLIER_MAX,
     },
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 334bd573b9..960ec03917 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -214,6 +214,12 @@  struct hvm_function_table {
     bool_t (*altp2m_vcpu_emulate_ve)(struct vcpu *v);
     int (*altp2m_vcpu_emulate_vmfunc)(const struct cpu_user_regs *regs);
 
+    /* vmtrace */
+    int (*vmtrace_control)(struct vcpu *v, bool enable, bool reset);
+    int (*vmtrace_output_position)(struct vcpu *v, uint64_t *pos);
+    int (*vmtrace_set_option)(struct vcpu *v, uint64_t key, uint64_t value);
+    int (*vmtrace_get_option)(struct vcpu *v, uint64_t key, uint64_t *value);
+
     /*
      * Parameters and callbacks for hardware-assisted TSC scaling,
      * which are valid only when the hardware feature is available.
@@ -655,6 +661,41 @@  static inline bool altp2m_vcpu_emulate_ve(struct vcpu *v)
     return false;
 }
 
+static inline int hvm_vmtrace_control(struct vcpu *v, bool enable, bool reset)
+{
+    if ( hvm_funcs.vmtrace_control )
+        return hvm_funcs.vmtrace_control(v, enable, reset);
+
+    return -EOPNOTSUPP;
+}
+
+/* Returns -errno, or a boolean of whether tracing is currently active. */
+static inline int hvm_vmtrace_output_position(struct vcpu *v, uint64_t *pos)
+{
+    if ( hvm_funcs.vmtrace_output_position )
+        return hvm_funcs.vmtrace_output_position(v, pos);
+
+    return -EOPNOTSUPP;
+}
+
+static inline int hvm_vmtrace_set_option(
+    struct vcpu *v, uint64_t key, uint64_t value)
+{
+    if ( hvm_funcs.vmtrace_set_option )
+        return hvm_funcs.vmtrace_set_option(v, key, value);
+
+    return -EOPNOTSUPP;
+}
+
+static inline int hvm_vmtrace_get_option(
+    struct vcpu *v, uint64_t key, uint64_t *value)
+{
+    if ( hvm_funcs.vmtrace_get_option )
+        return hvm_funcs.vmtrace_get_option(v, key, value);
+
+    return -EOPNOTSUPP;
+}
+
 /*
  * This must be defined as a macro instead of an inline function,
  * because it uses 'struct vcpu' and 'struct domain' which have
@@ -751,6 +792,28 @@  static inline bool hvm_has_set_descriptor_access_exiting(void)
     return false;
 }
 
+static inline int hvm_vmtrace_control(struct vcpu *v, bool enable, bool reset)
+{
+    return -EOPNOTSUPP;
+}
+
+static inline int hvm_vmtrace_output_position(struct vcpu *v, uint64_t *pos)
+{
+    return -EOPNOTSUPP;
+}
+
+static inline int hvm_vmtrace_set_option(
+    struct vcpu *v, uint64_t key, uint64_t value)
+{
+    return -EOPNOTSUPP;
+}
+
+static inline int hvm_vmtrace_get_option(
+    struct vcpu *v, uint64_t key, uint64_t *value)
+{
+    return -EOPNOTSUPP;
+}
+
 #define is_viridian_domain(d) ((void)(d), false)
 #define is_viridian_vcpu(v) ((void)(v), false)
 #define has_viridian_time_ref_count(d) ((void)(d), false)
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 1585678d50..218593e548 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -1133,6 +1133,39 @@  struct xen_domctl_vuart_op {
                                  */
 };
 
+/* XEN_DOMCTL_vmtrace_op: Perform VM tracing operations. */
+struct xen_domctl_vmtrace_op {
+    uint32_t cmd;           /* IN */
+    uint32_t vcpu;          /* IN */
+    uint64_aligned_t key;   /* IN     - @cmd specific data. */
+    uint64_aligned_t value; /* IN/OUT - @cmd specific data. */
+
+    /*
+     * General enable/disable of tracing.
+     *
+     * XEN_DOMCTL_vmtrace_reset_and_enable is provided as optimisation for
+     * common usecases, which want to reset status and position information
+     * when turning tracing back on.
+     */
+#define XEN_DOMCTL_vmtrace_enable             1
+#define XEN_DOMCTL_vmtrace_disable            2
+#define XEN_DOMCTL_vmtrace_reset_and_enable   3
+
+    /* Obtain the current output position within the buffer.  Fills @value. */
+#define XEN_DOMCTL_vmtrace_output_position    4
+
+    /*
+     * Get/Set platform specific configuration.
+     *
+     * For Intel Processor Trace, @key/@value are interpreted as MSR
+     * reads/writes to MSR_RTIT_*, filtered to a safe subset.
+     */
+#define XEN_DOMCTL_vmtrace_get_option         5
+#define XEN_DOMCTL_vmtrace_set_option         6
+};
+typedef struct xen_domctl_vmtrace_op xen_domctl_vmtrace_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vmtrace_op_t);
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -1217,6 +1250,7 @@  struct xen_domctl {
 #define XEN_DOMCTL_vuart_op                      81
 #define XEN_DOMCTL_get_cpu_policy                82
 #define XEN_DOMCTL_set_cpu_policy                83
+#define XEN_DOMCTL_vmtrace_op                    84
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -1277,6 +1311,7 @@  struct xen_domctl {
         struct xen_domctl_monitor_op        monitor_op;
         struct xen_domctl_psr_alloc         psr_alloc;
         struct xen_domctl_vuart_op          vuart_op;
+        struct xen_domctl_vmtrace_op        vmtrace_op;
         uint8_t                             pad[128];
     } u;
 };
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index 19b0d9e3eb..3eba495ab3 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -703,6 +703,7 @@  static int flask_domctl(struct domain *d, int cmd)
         return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__VM_EVENT);
 
     case XEN_DOMCTL_debug_op:
+    case XEN_DOMCTL_vmtrace_op:
     case XEN_DOMCTL_gdbsx_guestmemio:
     case XEN_DOMCTL_gdbsx_pausevcpu:
     case XEN_DOMCTL_gdbsx_unpausevcpu: