diff mbox

[v7,06/13] KVM: x86: Add Intel Processor Trace virtualization mode

Message ID 1525349323-9938-7-git-send-email-luwei.kang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Luwei Kang May 3, 2018, 12:08 p.m. UTC
From: Chao Peng <chao.p.peng@linux.intel.com>

Intel PT virtualization can work in one of 3 possible modes:
a. system-wide: trace both host/guest and output to host buffer;
b. host-only: only trace host and output to host buffer;
c. host-guest: trace host/guest simultaneously and output to their
   respective buffers.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.kang@intel.com>
---
 arch/x86/include/asm/intel_pt.h  |  6 ++++
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/include/asm/vmx.h       |  8 +++++
 arch/x86/kvm/vmx.c               | 68 +++++++++++++++++++++++++++++++++++++---
 4 files changed, 79 insertions(+), 4 deletions(-)

Comments

Alexander Shishkin May 3, 2018, 11:32 a.m. UTC | #1
On Thu, May 03, 2018 at 08:08:36PM +0800, Luwei Kang wrote:
> From: Chao Peng <chao.p.peng@linux.intel.com>
> 
> Intel PT virtualization can work in one of 3 possible modes:
> a. system-wide: trace both host/guest and output to host buffer;
> b. host-only: only trace host and output to host buffer;
> c. host-guest: trace host/guest simultaneously and output to their
>    respective buffers.

You also need to explain what this patch is doing, how and why. I think
I figured it out from reading the rest of the patch, but it should really
be mentioned in the description.

> @@ -5,6 +5,12 @@
>  #define PT_CPUID_LEAVES		2
>  #define PT_CPUID_REGS_NUM	4 /* number of regsters (eax, ebx, ecx, edx) */
>  
> +enum pt_mode {
> +	PT_MODE_SYSTEM = 0,
> +	PT_MODE_HOST,
> +	PT_MODE_HOST_GUEST,
> +};
> +
>  enum pt_capabilities {
>  	PT_CAP_max_subleaf = 0,
>  	PT_CAP_cr3_filtering,
> @@ -187,6 +188,10 @@
>  static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
>  module_param(ple_window_max, uint, 0444);
>  
> +/* Default is SYSTEM mode. */
> +static int __read_mostly pt_mode = PT_MODE_SYSTEM;
> +module_param(pt_mode, int, S_IRUGO);

So, it's an explicit module parameter? One apparent problem with this
is that one would need to reload kvm module(s) to be able to use PT,
which is not ideal.

> +
>  extern const ulong vmx_return;
>  
>  struct kvm_vmx {
> @@ -1488,6 +1493,19 @@ static inline bool cpu_has_vmx_vmfunc(void)
>  		SECONDARY_EXEC_ENABLE_VMFUNC;
>  }
>  
> +static inline bool cpu_has_vmx_intel_pt(void)
> +{
> +	u64 vmx_msr;
> +
> +	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
> +	return vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT;

This is an implicit cast. return !!(...) would clarify your intention.

Also, does it make sense to write an accessor to pt_pmu.vmx instead?

> +}
> +
> +static inline bool cpu_has_vmx_pt_use_gpa(void)
> +{
> +	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA;
> +}

I can deduce the meaning of the previous one, but not this one, and there's
no explanation.

> @@ -5780,6 +5810,28 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
>  	return exec_control;
>  }
>  
> +static u32 vmx_vmexit_control(struct vcpu_vmx *vmx)
> +{
> +	u32 vmexit_control = vmcs_config.vmexit_ctrl;
> +
> +	if (pt_mode == PT_MODE_SYSTEM)
> +		vmexit_control &= ~(VM_EXIT_CLEAR_IA32_RTIT_CTL |
> +				    VM_EXIT_PT_CONCEAL_PIP);

Ok, so what we really want to know is: is there an encompassing PT
event on this cpu when we go into VMLAUNCH/VMRESUME, right?
We can find this out from the pt_ctx and avoid the pt_mode entirely.
IOW, instead of having the 3 modes that you describe at the top, you
can use something like the following:

1. Do we have an event in pt_ctx?
 * No -> Set up the context for VMX.
 * Yes -> 2. Is attr.exclude_guest set?
             * No -> Guest trace goes to the host's buffer, do nothing.
	     * Yes -> Set up/switch the context for VMX.

Regards,
--
Alex
Paolo Bonzini May 3, 2018, 11:50 a.m. UTC | #2
On 03/05/2018 13:32, Alexander Shishkin wrote:
>>  
>> +/* Default is SYSTEM mode. */
>> +static int __read_mostly pt_mode = PT_MODE_SYSTEM;
>> +module_param(pt_mode, int, S_IRUGO);
> So, it's an explicit module parameter? One apparent problem with this
> is that one would need to reload kvm module(s) to be able to use PT,
> which is not ideal.

If you want to do tracing system-wide, that by definition must disable
guest tracing, so I think the module parameter is appropriate.  The
question is more, what is the best default.

Paolo
Paolo Bonzini May 3, 2018, 11:52 a.m. UTC | #3
On 03/05/2018 13:32, Alexander Shishkin wrote:
>> +static inline bool cpu_has_vmx_pt_use_gpa(void)
>> +{
>> +	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA;
>> +}
> 
> I can deduce the meaning of the previous one, but not this one, and there's
> no explanation.

Within KVM, GPA always means guest physical address.

>> +	if (pt_mode == PT_MODE_SYSTEM)
>> +		vmexit_control &= ~(VM_EXIT_CLEAR_IA32_RTIT_CTL |
>> +				    VM_EXIT_PT_CONCEAL_PIP);
> 
> Ok, so what we really want to know is: is there an encompassing PT
> event on this cpu when we go into VMLAUNCH/VMRESUME, right?
> We can find this out from the pt_ctx and avoid the pt_mode entirely.
> IOW, instead of having the 3 modes that you describe at the top, you
> can use something like the following:
> 
> 1. Do we have an event in pt_ctx?
>  * No -> Set up the context for VMX.
>  * Yes -> 2. Is attr.exclude_guest set?
>              * No -> Guest trace goes to the host's buffer, do nothing.
> 	     * Yes -> Set up/switch the context for VMX.

Can you explain this more clearly?

Thanks,

Paolo
Alexander Shishkin May 3, 2018, 12:02 p.m. UTC | #4
On Thu, May 03, 2018 at 01:50:39PM +0200, Paolo Bonzini wrote:
> On 03/05/2018 13:32, Alexander Shishkin wrote:
> >>  
> >> +/* Default is SYSTEM mode. */
> >> +static int __read_mostly pt_mode = PT_MODE_SYSTEM;
> >> +module_param(pt_mode, int, S_IRUGO);
> > So, it's an explicit module parameter? One apparent problem with this
> > is that one would need to reload kvm module(s) to be able to use PT,
> > which is not ideal.
> 
> If you want to do tracing system-wide, that by definition must disable
> guest tracing,

Sure.

> so I think the module parameter is appropriate.

I don't see why. System-wide tracing takes place while perf record is
running. When it's done, it's done and we can un-disable the guest
tracing, without requiring the user to kill all their VMs and reload
modules.

Regards,
--
Alex
Alexander Shishkin May 3, 2018, 12:09 p.m. UTC | #5
On Thu, May 03, 2018 at 01:52:16PM +0200, Paolo Bonzini wrote:
> On 03/05/2018 13:32, Alexander Shishkin wrote:
> >> +static inline bool cpu_has_vmx_pt_use_gpa(void)
> >> +{
> >> +	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA;
> >> +}
> > 
> > I can deduce the meaning of the previous one, but not this one, and there's
> > no explanation.
> 
> Within KVM, GPA always means guest physical address.

I see, thanks.

> >> +	if (pt_mode == PT_MODE_SYSTEM)
> >> +		vmexit_control &= ~(VM_EXIT_CLEAR_IA32_RTIT_CTL |
> >> +				    VM_EXIT_PT_CONCEAL_PIP);
> > 
> > Ok, so what we really want to know is: is there an encompassing PT
> > event on this cpu when we go into VMLAUNCH/VMRESUME, right?
> > We can find this out from the pt_ctx and avoid the pt_mode entirely.
> > IOW, instead of having the 3 modes that you describe at the top, you
> > can use something like the following:
> > 
> > 1. Do we have an event in pt_ctx?
> >  * No -> Set up the context for VMX.
> >  * Yes -> 2. Is attr.exclude_guest set?
> >              * No -> Guest trace goes to the host's buffer, do nothing.
> > 	     * Yes -> Set up/switch the context for VMX.
> 
> Can you explain this more clearly?

Let's see; in the intel_pt driver we have a per-cpu PT "context", from which
we can tell if there is a host event that wants to trace the guest. This
should provide enough information to make a decision whether we want to
context switch PT MSRs or not on the spot, instead of having a module
parameter.

Regards,
--
Alex
Paolo Bonzini May 3, 2018, 12:30 p.m. UTC | #6
On 03/05/2018 14:02, Alexander Shishkin wrote:
> I don't see why. System-wide tracing takes place while perf record is
> running. When it's done, it's done and we can un-disable the guest
> tracing, without requiring the user to kill all their VMs and reload
> modules.

Guest tracing can only be enabled at boot time, because the guest's
CPUID changes depending on whether it's enabled.  And likewise if perf
record can do system-wide tracing at any time during the guest's
execution, we need to know it at boot time in order to set the guest CPUID.

Paolo
Paolo Bonzini May 3, 2018, 12:31 p.m. UTC | #7
On 03/05/2018 14:09, Alexander Shishkin wrote:
>>> 1. Do we have an event in pt_ctx?
>>>  * No -> Set up the context for VMX.
>>>  * Yes -> 2. Is attr.exclude_guest set?
>>>              * No -> Guest trace goes to the host's buffer, do nothing.
>>> 	     * Yes -> Set up/switch the context for VMX.
>> Can you explain this more clearly?
> Let's see; in the intel_pt driver we have a per-cpu PT "context", from which
> we can tell if there is a host event that wants to trace the guest. This
> should provide enough information to make a decision whether we want to
> context switch PT MSRs or not on the spot, instead of having a module
> parameter.

I still don't understand how the host event is useful...

Paolo
Alexander Shishkin May 3, 2018, 12:48 p.m. UTC | #8
On Thu, May 03, 2018 at 02:30:25PM +0200, Paolo Bonzini wrote:
> On 03/05/2018 14:02, Alexander Shishkin wrote:
> > I don't see why. System-wide tracing takes place while perf record is
> > running. When it's done, it's done and we can un-disable the guest
> > tracing, without requiring the user to kill all their VMs and reload
> > modules.
> 
> Guest tracing can only be enabled at boot time, because the guest's
> CPUID changes depending on whether it's enabled.  And likewise if perf
> record can do system-wide tracing at any time during the guest's
> execution, we need to know it at boot time in order to set the guest CPUID.

CPUID is immaterial here; the real trick is to disallow the use of PT at
runtime when the host suddenly decides to trace the guest, in such a way
that the guest user is informed that their trace is incomplete due to the
host activity.

Side note: the "system-wide tracing" is a misnomer here, all that matters
is that there's a perf event on the host that wants to trace the guest, it
can very well be a per-task event.

Regards,
--
Alex
Paolo Bonzini May 3, 2018, 12:50 p.m. UTC | #9
On 03/05/2018 14:48, Alexander Shishkin wrote:
>> Guest tracing can only be enabled at boot time, because the guest's
>> CPUID changes depending on whether it's enabled.  And likewise if perf
>> record can do system-wide tracing at any time during the guest's
>> execution, we need to know it at boot time in order to set the guest CPUID.
>
> CPUID is immaterial here; the real trick is to disallow the use of PT at
> runtime when the host suddenly decides to trace the guest, in such a way
> that the guest user is informed that their trace is incomplete due to the
> host activity.

How do you do that?  And you still need the module parameter to decide
whether the host is _allowed_ to cause incomplete traces in the guest.

Paolo

> Side note: the "system-wide tracing" is a misnomer here, all that matters
> is that there's a perf event on the host that wants to trace the guest, it
> can very well be a per-task event.
Alexander Shishkin May 3, 2018, 1:38 p.m. UTC | #10
On Thu, May 03, 2018 at 02:50:12PM +0200, Paolo Bonzini wrote:
> On 03/05/2018 14:48, Alexander Shishkin wrote:
> >> Guest tracing can only be enabled at boot time, because the guest's
> >> CPUID changes depending on whether it's enabled.  And likewise if perf
> >> record can do system-wide tracing at any time during the guest's
> >> execution, we need to know it at boot time in order to set the guest CPUID.
> >
> > CPUID is immaterial here; the real trick is to disallow the use of PT at
> > runtime when the host suddenly decides to trace the guest, in such a way
> > that the guest user is informed that their trace is incomplete due to the
> > host activity.
> 
> How do you do that?

Off the top of my head:
  * you don't;
  * you write something to the PT stream;
  * you signal an error via RTIT_STATUS;
  * guest always prevails: host gets PARTIAL records in case of a conflict.

> And you still need the module parameter to decide
> whether the host is _allowed_ to cause incomplete traces in the guest.

Or rather a parameter to decide who wins in case both host and guest want
to trace the guest. That's arguably better than having different versions of
PT in the guest depending on a module parameter setting.

Regards,
--
Alex
Paolo Bonzini May 3, 2018, 1:48 p.m. UTC | #11
On 03/05/2018 15:38, Alexander Shishkin wrote:
> On Thu, May 03, 2018 at 02:50:12PM +0200, Paolo Bonzini wrote:
>> On 03/05/2018 14:48, Alexander Shishkin wrote:
>>>> Guest tracing can only be enabled at boot time, because the guest's
>>>> CPUID changes depending on whether it's enabled.  And likewise if perf
>>>> record can do system-wide tracing at any time during the guest's
>>>> execution, we need to know it at boot time in order to set the guest CPUID.
>>>
>>> CPUID is immaterial here; the real trick is to disallow the use of PT at
>>> runtime when the host suddenly decides to trace the guest, in such a way
>>> that the guest user is informed that their trace is incomplete due to the
>>> host activity.
>>
>> How do you do that?
> 
> Off the top of my head:
>   * you don't;
>   * you write something to the PT stream;
>   * you signal an error via RTIT_STATUS;
>   * guest always prevails: host gets PARTIAL records in case of a conflict.
> 
>> And you still need the module parameter to decide
>> whether the host is _allowed_ to cause incomplete traces in the guest.
> 
> Or rather a parameter to decide who wins in case both host and guest want
> to trace the guest. That's arguably better than having different versions of
> PT in the guest depending on a module parameter setting.

It's not different versions; it's having PT vs. not having PT at all.  I
don't really see it as a big issue.  The nice thing about this series is
that the interactions between PT code and KVM code are minimal.

Paolo
Alexander Shishkin May 4, 2018, 10:38 a.m. UTC | #12
On Thu, May 03, 2018 at 03:48:12PM +0200, Paolo Bonzini wrote:
> On 03/05/2018 15:38, Alexander Shishkin wrote:
> > On Thu, May 03, 2018 at 02:50:12PM +0200, Paolo Bonzini wrote:
> >> On 03/05/2018 14:48, Alexander Shishkin wrote:
> >>>> Guest tracing can only be enabled at boot time, because the guest's
> >>>> CPUID changes depending on whether it's enabled.  And likewise if perf
> >>>> record can do system-wide tracing at any time during the guest's
> >>>> execution, we need to know it at boot time in order to set the guest CPUID.
> >>>
> >>> CPUID is immaterial here; the real trick is to disallow the use of PT at
> >>> runtime when the host suddenly decides to trace the guest, in such a way
> >>> that the guest user is informed that their trace is incomplete due to the
> >>> host activity.
> >>
> >> How do you do that?
> > 
> > Off the top of my head:
> >   * you don't;
> >   * you write something to the PT stream;
> >   * you signal an error via RTIT_STATUS;
> >   * guest always prevails: host gets PARTIAL records in case of a conflict.
> > 
> >> And you still need the module parameter to decide
> >> whether the host is _allowed_ to cause incomplete traces in the guest.
> > 
> > Or rather a parameter to decide who wins in case both host and guest want
> > to trace the guest. That's arguably better than having different versions of
> > PT in the guest depending on a module parameter setting.
> 
> It's not different versions; it's having PT vs. not having PT at all.  I
> don't really see it as a big issue.  The nice thing about this series is
> that the interactions between PT code and KVM code are minimal.

Unfortunately, it gets it wrong. Like I just said in another email, if you
switch off host's PT, you need to let them know, which this patchset doesn't
do. And when it does, it would be the same amount of interaction with PT
code as what would be required to get the dynamic guest PT right.

Regards,
--
Alex
Peter Zijlstra May 4, 2018, 10:45 a.m. UTC | #13
On Thu, May 03, 2018 at 04:38:23PM +0300, Alexander Shishkin wrote:
> On Thu, May 03, 2018 at 02:50:12PM +0200, Paolo Bonzini wrote:

> > And you still need the module parameter to decide
> > whether the host is _allowed_ to cause incomplete traces in the guest.
> 
> Or rather a parameter to decide who wins in case both host and guest want
> to trace the guest. That's arguably better than having different versions of
> PT in the guest depending on a module parameter setting.

Yes, that sounds like a much better approach.
Paolo Bonzini May 4, 2018, 9:44 p.m. UTC | #14
On 04/05/2018 12:45, Peter Zijlstra wrote:
> On Thu, May 03, 2018 at 04:38:23PM +0300, Alexander Shishkin wrote:
>> On Thu, May 03, 2018 at 02:50:12PM +0200, Paolo Bonzini wrote:
> 
>>> And you still need the module parameter to decide
>>> whether the host is _allowed_ to cause incomplete traces in the guest.
>>
>> Or rather a parameter to decide who wins in case both host and guest want
>> to trace the guest. That's arguably better than having different versions of
>> PT in the guest depending on a module parameter setting.
> 
> Yes, that sounds like a much better approach.

I don't think so.  The possibility that the host would lose tracing data
just because the guest starts using PT seems hideous to me...

Paolo
Paolo Bonzini May 4, 2018, 9:52 p.m. UTC | #15
On 04/05/2018 12:38, Alexander Shishkin wrote:
>>> Or rather a parameter to decide who wins in case both host and guest want
>>> to trace the guest. That's arguably better than having different versions of
>>> PT in the guest depending on a module parameter setting.
>> It's not different versions; it's having PT vs. not having PT at all.  I
>> don't really see it as a big issue.  The nice thing about this series is
>> that the interactions between PT code and KVM code are minimal.
> Unfortunately, it gets it wrong. Like I just said in another email, if you
> switch off host's PT, you need to let them know, which this patchset doesn't
> do. And when it does, it would be the same amount of interaction with PT
> code as what would be required to get the dynamic guest PT right.

Two issues:

1) Is there a fast (10 clock cycles, better if less) way for KVM to know
"PT is enabled on the host", or a callback that KVM can register when
e.g. RTIT_CTL is written?

2) We'd have to write trace records into the guest.  That does not sound
that easy.  Does it entail parsing the ToPA and all that?

Thanks,

Paolo
Peter Zijlstra May 4, 2018, 10:15 p.m. UTC | #16
On Fri, May 04, 2018 at 11:44:09PM +0200, Paolo Bonzini wrote:
> On 04/05/2018 12:45, Peter Zijlstra wrote:
> > On Thu, May 03, 2018 at 04:38:23PM +0300, Alexander Shishkin wrote:
> >> On Thu, May 03, 2018 at 02:50:12PM +0200, Paolo Bonzini wrote:
> > 
> >>> And you still need the module parameter to decide
> >>> whether the host is _allowed_ to cause incomplete traces in the guest.
> >>
> >> Or rather a parameter to decide who wins in case both host and guest want
> >> to trace the guest. That's arguably better than having different versions of
> >> PT in the guest depending on a module parameter setting.
> > 
> > Yes, that sounds like a much better approach.
> 
> I don't think so.  The possibility that the host would lose tracing data
> just because the guest starts using PT seems hideous to me...

Well, either way around is a fairly crap situation, the modparam at
least lets the admin pick which way it goes. But if you want to always let
the host win, that's fine with me too, fewer knobs is better.
Paolo Bonzini May 7, 2018, 10:47 a.m. UTC | #17
On 05/05/2018 00:15, Peter Zijlstra wrote:
> On Fri, May 04, 2018 at 11:44:09PM +0200, Paolo Bonzini wrote:
>> On 04/05/2018 12:45, Peter Zijlstra wrote:
>>> On Thu, May 03, 2018 at 04:38:23PM +0300, Alexander Shishkin wrote:
>>>> On Thu, May 03, 2018 at 02:50:12PM +0200, Paolo Bonzini wrote:
>>>
>>>>> And you still need the module parameter to decide
>>>>> whether the host is _allowed_ to cause incomplete traces in the guest.
>>>>
>>>> Or rather a parameter to decide who wins in case both host and guest want
>>>> to trace the guest. That's arguably better than having different versions of
>>>> PT in the guest depending on a module parameter setting.
>>>
>>> Yes, that sounds like a much better approach.
>>
>> I don't think so.  The possibility that the host would lose tracing data
>> just because the guest starts using PT seems hideous to me...
> 
> Well, either way around is a fairly crap situation, the modparam at
> least lets the admin pick which it goes. But if you want to always let
> the host win, that's fine with me too, less knobs is better.

I expect that the default "system-wide" mode (host wins) will be used
almost always, with "host-guest" being used in case someone actually
wants to use PT in guests.

I agree that "Host-only, drop guest events" should be removed, since it
can be emulated by perf code.  Luwei, can you change that?

Paolo
diff mbox

Patch

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 3a4f524..43ad260 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -5,6 +5,12 @@ 
 #define PT_CPUID_LEAVES		2
 #define PT_CPUID_REGS_NUM	4 /* number of regsters (eax, ebx, ecx, edx) */
 
+enum pt_mode {
+	PT_MODE_SYSTEM = 0,
+	PT_MODE_HOST,
+	PT_MODE_HOST_GUEST,
+};
+
 enum pt_capabilities {
 	PT_CAP_max_subleaf = 0,
 	PT_CAP_cr3_filtering,
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index cc9e681..c813507 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -790,6 +790,7 @@ 
 #define VMX_BASIC_INOUT		0x0040000000000000LLU
 
 /* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_INTEL_PT                 (1ULL << 14)
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
 #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 8b67807..9e828d4 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -76,7 +76,9 @@ 
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_RDSEED_EXITING		0x00010000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
+#define SECONDARY_EXEC_PT_CONCEAL_VMX		0x00080000
 #define SECONDARY_EXEC_XSAVES			0x00100000
+#define SECONDARY_EXEC_PT_USE_GPA		0x01000000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
@@ -97,6 +99,8 @@ 
 #define VM_EXIT_LOAD_IA32_EFER                  0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
 #define VM_EXIT_CLEAR_BNDCFGS                   0x00800000
+#define VM_EXIT_PT_CONCEAL_PIP			0x01000000
+#define VM_EXIT_CLEAR_IA32_RTIT_CTL		0x02000000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR	0x00036dff
 
@@ -108,6 +112,8 @@ 
 #define VM_ENTRY_LOAD_IA32_PAT			0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
 #define VM_ENTRY_LOAD_BNDCFGS                   0x00010000
+#define VM_ENTRY_PT_CONCEAL_PIP			0x00020000
+#define VM_ENTRY_LOAD_IA32_RTIT_CTL		0x00040000
 
 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR	0x000011ff
 
@@ -234,6 +240,8 @@  enum vmcs_field {
 	GUEST_PDPTR3_HIGH               = 0x00002811,
 	GUEST_BNDCFGS                   = 0x00002812,
 	GUEST_BNDCFGS_HIGH              = 0x00002813,
+	GUEST_IA32_RTIT_CTL		= 0x00002814,
+	GUEST_IA32_RTIT_CTL_HIGH	= 0x00002815,
 	HOST_IA32_PAT			= 0x00002c00,
 	HOST_IA32_PAT_HIGH		= 0x00002c01,
 	HOST_IA32_EFER			= 0x00002c02,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b49ad4..8680cd5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -54,6 +54,7 @@ 
 #include <asm/microcode.h>
 #include <asm/nospec-branch.h>
 #include <asm/mshyperv.h>
+#include <asm/intel_pt.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -187,6 +188,10 @@ 
 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
+/* Default is SYSTEM mode. */
+static int __read_mostly pt_mode = PT_MODE_SYSTEM;
+module_param(pt_mode, int, S_IRUGO);
+
 extern const ulong vmx_return;
 
 struct kvm_vmx {
@@ -1488,6 +1493,19 @@  static inline bool cpu_has_vmx_vmfunc(void)
 		SECONDARY_EXEC_ENABLE_VMFUNC;
 }
 
+static inline bool cpu_has_vmx_intel_pt(void)
+{
+	u64 vmx_msr;
+
+	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+	return vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT;
+}
+
+static inline bool cpu_has_vmx_pt_use_gpa(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -4002,6 +4020,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_RDRAND_EXITING |
 			SECONDARY_EXEC_ENABLE_PML |
 			SECONDARY_EXEC_TSC_SCALING |
+			SECONDARY_EXEC_PT_USE_GPA |
+			SECONDARY_EXEC_PT_CONCEAL_VMX |
 			SECONDARY_EXEC_ENABLE_VMFUNC;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -4046,7 +4066,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
 	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-		VM_EXIT_CLEAR_BNDCFGS;
+		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_PT_CONCEAL_PIP |
+		VM_EXIT_CLEAR_IA32_RTIT_CTL;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
@@ -4065,11 +4086,20 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
 	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
-	opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
+	opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
+		VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 				&_vmentry_control) < 0)
 		return -EIO;
 
+	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_PT_USE_GPA) ||
+		!(_vmexit_control & VM_EXIT_CLEAR_IA32_RTIT_CTL) ||
+		!(_vmentry_control & VM_ENTRY_LOAD_IA32_RTIT_CTL)) {
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_PT_USE_GPA;
+		_vmexit_control &= ~VM_EXIT_CLEAR_IA32_RTIT_CTL;
+		_vmentry_control &= ~VM_ENTRY_LOAD_IA32_RTIT_CTL;
+	}
+
 	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
 
 	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
@@ -5780,6 +5810,28 @@  static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
+static u32 vmx_vmexit_control(struct vcpu_vmx *vmx)
+{
+	u32 vmexit_control = vmcs_config.vmexit_ctrl;
+
+	if (pt_mode == PT_MODE_SYSTEM)
+		vmexit_control &= ~(VM_EXIT_CLEAR_IA32_RTIT_CTL |
+				    VM_EXIT_PT_CONCEAL_PIP);
+
+	return vmexit_control;
+}
+
+static u32 vmx_vmentry_control(struct vcpu_vmx *vmx)
+{
+	u32 vmentry_control = vmcs_config.vmentry_ctrl;
+
+	if (pt_mode == PT_MODE_SYSTEM)
+		vmentry_control &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+				     VM_ENTRY_LOAD_IA32_RTIT_CTL);
+
+	return vmentry_control;
+}
+
 static bool vmx_rdrand_supported(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -5916,6 +5968,10 @@  static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 		}
 	}
 
+	if (pt_mode == PT_MODE_SYSTEM)
+		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA |
+				  SECONDARY_EXEC_PT_CONCEAL_VMX);
+
 	vmx->secondary_exec_control = exec_control;
 }
 
@@ -6026,10 +6082,10 @@  static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
 
-	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
+	vm_exit_controls_init(vmx, vmx_vmexit_control(vmx));
 
 	/* 22.2.1, 20.8.1 */
-	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
+	vm_entry_controls_init(vmx, vmx_vmentry_control(vmx));
 
 	vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
@@ -7350,6 +7406,10 @@  static __init int hardware_setup(void)
 
 	kvm_mce_cap_supported |= MCG_LMCE_P;
 
+	if (!enable_ept || !pt_cap_get(PT_CAP_topa_output) ||
+		!cpu_has_vmx_intel_pt() || !cpu_has_vmx_pt_use_gpa())
+		pt_mode = PT_MODE_SYSTEM;
+
 	return alloc_kvm_area();
 
 out: