diff mbox series

[v19,088/130] KVM: x86: Add a switch_db_regs flag to handle TDX's auto-switched behavior

Message ID ca5d0399cdbbaa6c7c6528ad85b3560cec0f0752.1708933498.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Headers show
Series [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro | expand

Commit Message

Isaku Yamahata Feb. 26, 2024, 8:26 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

Add a flag, KVM_DEBUGREG_AUTO_SWITCHED_GUEST, to skip saving/restoring DRs
irrespective of any other flags.  TDX-SEAM unconditionally saves and
restores guest DRs and reset to architectural INIT state on TD exit.
So, KVM needs to save host DRs before TD enter without restoring guest DRs
and restore host DRs after TD exit.

Opportunistically convert the KVM_DEBUGREG_* definitions to use BIT().

Reported-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/kvm_host.h | 10 ++++++++--
 arch/x86/kvm/vmx/tdx.c          |  1 +
 arch/x86/kvm/x86.c              | 11 ++++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

Comments

Binbin Wu April 7, 2024, 10:52 a.m. UTC | #1
On 2/26/2024 4:26 PM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> Add a flag, KVM_DEBUGREG_AUTO_SWITCHED_GUEST, to skip saving/restoring DRs
> irrespective of any other flags.

Here "irrespective of any other flags" sounds like other flags will be 
ignored if KVM_DEBUGREG_AUTO_SWITCHED_GUEST is set.
But the code below doesn't align with it.

>    TDX-SEAM unconditionally saves and
> restores guest DRs and reset to architectural INIT state on TD exit.
> So, KVM needs to save host DRs before TD enter without restoring guest DRs
> and restore host DRs after TD exit.
>
> Opportunistically convert the KVM_DEBUGREG_* definitions to use BIT().
>
> Reported-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Chao Gao <chao.gao@intel.com>
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/include/asm/kvm_host.h | 10 ++++++++--
>   arch/x86/kvm/vmx/tdx.c          |  1 +
>   arch/x86/kvm/x86.c              | 11 ++++++++---
>   3 files changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3ab85c3d86ee..a9df898c6fbd 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -610,8 +610,14 @@ struct kvm_pmu {
>   struct kvm_pmu_ops;
>   
>   enum {
> -	KVM_DEBUGREG_BP_ENABLED = 1,
> -	KVM_DEBUGREG_WONT_EXIT = 2,
> +	KVM_DEBUGREG_BP_ENABLED		= BIT(0),
> +	KVM_DEBUGREG_WONT_EXIT		= BIT(1),
> +	/*
> +	 * Guest debug registers (DR0-3 and DR6) are saved/restored by hardware
> +	 * on exit from or enter to guest. KVM needn't switch them. Because DR7
> +	 * is cleared on exit from guest, DR7 need to be saved/restored.
> +	 */
> +	KVM_DEBUGREG_AUTO_SWITCH	= BIT(2),
>   };
>   
>   struct kvm_mtrr_range {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 7aa9188f384d..ab7403a19c5d 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -586,6 +586,7 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>   
>   	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
>   
> +	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
>   	vcpu->arch.cr0_guest_owned_bits = -1ul;
>   	vcpu->arch.cr4_guest_owned_bits = -1ul;
>   
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1b189e86a1f1..fb7597c22f31 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -11013,7 +11013,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   	if (vcpu->arch.guest_fpu.xfd_err)
>   		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
>   
> -	if (unlikely(vcpu->arch.switch_db_regs)) {
> +	if (unlikely(vcpu->arch.switch_db_regs & ~KVM_DEBUGREG_AUTO_SWITCH)) {
>   		set_debugreg(0, 7);
>   		set_debugreg(vcpu->arch.eff_db[0], 0);
>   		set_debugreg(vcpu->arch.eff_db[1], 1);
> @@ -11059,6 +11059,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   	 */
>   	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
>   		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
> +		WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
>   		static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
>   		kvm_update_dr0123(vcpu);
>   		kvm_update_dr7(vcpu);
> @@ -11071,8 +11072,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   	 * care about the messed up debug address registers. But if
>   	 * we have some of them active, restore the old state.
>   	 */
> -	if (hw_breakpoint_active())
> -		hw_breakpoint_restore();
> +	if (hw_breakpoint_active()) {
> +		if (!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))
> +			hw_breakpoint_restore();
> +		else
> +			set_debugreg(__this_cpu_read(cpu_dr7), 7);

According to TDX module 1.5 ABI spec:
DR0-3, DR6 and DR7 are set to their architectural INIT value, why is 
only DR7 restored?


I found a discussion about it in
https://lore.kernel.org/kvm/Yk4iChT2sbtzSb8h@google.com/

Sean mentioned:
"
The TDX module context switches the guest _and_ host debug registers.  
It restores
the host DRs because it needs to write _something_ to hide guest state, 
so it might
as well restore the host values.  The above was an optmization to avoid 
rewriting
all debug registers.
"

I have question on "so it might as well restore the host values".
Was it a guess/assumption or it was the fact?



> +	}
>   
>   	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
>   	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
Isaku Yamahata April 13, 2024, 12:20 a.m. UTC | #2
On Sun, Apr 07, 2024 at 06:52:44PM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> 
> 
> On 2/26/2024 4:26 PM, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > Add a flag, KVM_DEBUGREG_AUTO_SWITCHED_GUEST, to skip saving/restoring DRs
> > irrespective of any other flags.
> 
> Here "irrespective of any other flags" sounds like other flags will be
> ignored if KVM_DEBUGREG_AUTO_SWITCHED_GUEST is set.
> But the code below doesn't align with it.

Sure, let's update the commit message.


> >    TDX-SEAM unconditionally saves and
> > restores guest DRs and reset to architectural INIT state on TD exit.
> > So, KVM needs to save host DRs before TD enter without restoring guest DRs
> > and restore host DRs after TD exit.
> > 
> > Opportunistically convert the KVM_DEBUGREG_* definitions to use BIT().
> > 
> > Reported-by: Xiaoyao Li <xiaoyao.li@intel.com>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Co-developed-by: Chao Gao <chao.gao@intel.com>
> > Signed-off-by: Chao Gao <chao.gao@intel.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >   arch/x86/include/asm/kvm_host.h | 10 ++++++++--
> >   arch/x86/kvm/vmx/tdx.c          |  1 +
> >   arch/x86/kvm/x86.c              | 11 ++++++++---
> >   3 files changed, 17 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 3ab85c3d86ee..a9df898c6fbd 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -610,8 +610,14 @@ struct kvm_pmu {
> >   struct kvm_pmu_ops;
> >   enum {
> > -	KVM_DEBUGREG_BP_ENABLED = 1,
> > -	KVM_DEBUGREG_WONT_EXIT = 2,
> > +	KVM_DEBUGREG_BP_ENABLED		= BIT(0),
> > +	KVM_DEBUGREG_WONT_EXIT		= BIT(1),
> > +	/*
> > +	 * Guest debug registers (DR0-3 and DR6) are saved/restored by hardware
> > +	 * on exit from or enter to guest. KVM needn't switch them. Because DR7
> > +	 * is cleared on exit from guest, DR7 need to be saved/restored.
> > +	 */
> > +	KVM_DEBUGREG_AUTO_SWITCH	= BIT(2),
> >   };
> >   struct kvm_mtrr_range {
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 7aa9188f384d..ab7403a19c5d 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -586,6 +586,7 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> >   	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
> > +	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
> >   	vcpu->arch.cr0_guest_owned_bits = -1ul;
> >   	vcpu->arch.cr4_guest_owned_bits = -1ul;
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 1b189e86a1f1..fb7597c22f31 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -11013,7 +11013,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >   	if (vcpu->arch.guest_fpu.xfd_err)
> >   		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
> > -	if (unlikely(vcpu->arch.switch_db_regs)) {
> > +	if (unlikely(vcpu->arch.switch_db_regs & ~KVM_DEBUGREG_AUTO_SWITCH)) {
> >   		set_debugreg(0, 7);
> >   		set_debugreg(vcpu->arch.eff_db[0], 0);
> >   		set_debugreg(vcpu->arch.eff_db[1], 1);
> > @@ -11059,6 +11059,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >   	 */
> >   	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
> >   		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
> > +		WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
> >   		static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
> >   		kvm_update_dr0123(vcpu);
> >   		kvm_update_dr7(vcpu);
> > @@ -11071,8 +11072,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >   	 * care about the messed up debug address registers. But if
> >   	 * we have some of them active, restore the old state.
> >   	 */
> > -	if (hw_breakpoint_active())
> > -		hw_breakpoint_restore();
> > +	if (hw_breakpoint_active()) {
> > +		if (!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))
> > +			hw_breakpoint_restore();
> > +		else
> > +			set_debugreg(__this_cpu_read(cpu_dr7), 7);
> 
> According to TDX module 1.5 ABI spec:
> DR0-3, DR6 and DR7 are set to their architectural INIT value, why is only
> DR7 restored?

This hunk should be dropped. Thank you for finding this.

I checked the base SPEC, the ABI spec, and the TDX module code.  It seems the
documentation bug of the TDX module 1.5 base architecture specification.


The TDX module code:
- restores guest DR<N> on TD Entry to guest.
- saves guest DR<N> on TD Exit from guest TD
- initializes DR<N> on TD Exit to host VMM

TDX module 1.5 base architecture specification:
15.1.2.1 Context Switch
By design, the Intel TDX module context-switches all debug/tracing state that
the guest TD is allowed to use.
        DR0-3, DR6 and IA32_DS_AREA MSR are context-switched in TDH.VP.ENTER and
        TD exit flows
        RFLAGS, IA32_DEBUGCTL MSR and DR7 are saved and cleared on VM exits from
        the guest TD and restored on VM entry to the guest TD.

TDX module 1.5 ABI specification:
5.3.65. TDH.VP.ENTER Leaf
CPU State Preservation Following a Successful TD Entry and a TD Exit
Following a successful TD entry and a TD exit, some CPU state is modified:
        Registers DR0, DR1, DR2, DR3, DR6 and DR7 are set to their architectural
        INIT value.
Binbin Wu May 6, 2024, 3:30 a.m. UTC | #3
On 2/26/2024 4:26 PM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> Add a flag, KVM_DEBUGREG_AUTO_SWITCHED_GUEST, to skip saving/restoring DRs
> irrespective of any other flags.  TDX-SEAM unconditionally saves and
> restores guest DRs and reset to architectural INIT state on TD exit.
> So, KVM needs to save host DRs before TD enter without restoring guest DRs
> and restore host DRs after TD exit.
>
> Opportunistically convert the KVM_DEBUGREG_* definitions to use BIT().
>
> Reported-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Chao Gao <chao.gao@intel.com>
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/include/asm/kvm_host.h | 10 ++++++++--
>   arch/x86/kvm/vmx/tdx.c          |  1 +
>   arch/x86/kvm/x86.c              | 11 ++++++++---
>   3 files changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3ab85c3d86ee..a9df898c6fbd 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -610,8 +610,14 @@ struct kvm_pmu {
>   struct kvm_pmu_ops;
>   
>   enum {
> -	KVM_DEBUGREG_BP_ENABLED = 1,
> -	KVM_DEBUGREG_WONT_EXIT = 2,
> +	KVM_DEBUGREG_BP_ENABLED		= BIT(0),
> +	KVM_DEBUGREG_WONT_EXIT		= BIT(1),
> +	/*
> +	 * Guest debug registers (DR0-3 and DR6) are saved/restored by hardware
> +	 * on exit from or enter to guest. KVM needn't switch them. Because DR7
> +	 * is cleared on exit from guest, DR7 need to be saved/restored.
> +	 */
> +	KVM_DEBUGREG_AUTO_SWITCH	= BIT(2),
>   };
>   
>   struct kvm_mtrr_range {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 7aa9188f384d..ab7403a19c5d 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -586,6 +586,7 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>   
>   	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
>   
> +	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
>   	vcpu->arch.cr0_guest_owned_bits = -1ul;
>   	vcpu->arch.cr4_guest_owned_bits = -1ul;
>   
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1b189e86a1f1..fb7597c22f31 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -11013,7 +11013,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   	if (vcpu->arch.guest_fpu.xfd_err)
>   		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
>   
> -	if (unlikely(vcpu->arch.switch_db_regs)) {
> +	if (unlikely(vcpu->arch.switch_db_regs & ~KVM_DEBUGREG_AUTO_SWITCH)) {

As pointed by Paolo in 
https://lore.kernel.org/lkml/ea136ac6-53cf-cdc5-a741-acfb437819b1@redhat.com/
KVM_DEBUGREG_BP_ENABLED could be set in vcpu->arch.switch_db_regs,  by 
userspace
kvm_vcpu_ioctl_x86_set_debugregs()  --> kvm_update_dr7()

So it should be fixed as:

-       if (unlikely(vcpu->arch.switch_db_regs)) {
+       if (unlikely(vcpu->arch.switch_db_regs &&
+                    !(vcpu->arch.switch_db_regs & 
KVM_DEBUGREG_AUTO_SWITCH))) {


>   		set_debugreg(0, 7);
>   		set_debugreg(vcpu->arch.eff_db[0], 0);
>   		set_debugreg(vcpu->arch.eff_db[1], 1);
> @@ -11059,6 +11059,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   	 */
>   	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
>   		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
> +		WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
>   		static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
>   		kvm_update_dr0123(vcpu);
>   		kvm_update_dr7(vcpu);
> @@ -11071,8 +11072,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   	 * care about the messed up debug address registers. But if
>   	 * we have some of them active, restore the old state.
>   	 */
> -	if (hw_breakpoint_active())
> -		hw_breakpoint_restore();
> +	if (hw_breakpoint_active()) {
> +		if (!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))
> +			hw_breakpoint_restore();
> +		else
> +			set_debugreg(__this_cpu_read(cpu_dr7), 7);
> +	}
>   
>   	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
>   	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
Isaku Yamahata May 6, 2024, 7:10 p.m. UTC | #4
On Mon, May 06, 2024 at 11:30:52AM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 1b189e86a1f1..fb7597c22f31 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -11013,7 +11013,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >   	if (vcpu->arch.guest_fpu.xfd_err)
> >   		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
> > -	if (unlikely(vcpu->arch.switch_db_regs)) {
> > +	if (unlikely(vcpu->arch.switch_db_regs & ~KVM_DEBUGREG_AUTO_SWITCH)) {
> 
> As pointed by Paolo in
> https://lore.kernel.org/lkml/ea136ac6-53cf-cdc5-a741-acfb437819b1@redhat.com/
> KVM_DEBUGREG_BP_ENABLED could be set in vcpu->arch.switch_db_regs,  by
> userspace
> kvm_vcpu_ioctl_x86_set_debugregs()  --> kvm_update_dr7()
> 
> So it should be fixed as:
> 
> -       if (unlikely(vcpu->arch.switch_db_regs)) {
> +       if (unlikely(vcpu->arch.switch_db_regs &&
> +                    !(vcpu->arch.switch_db_regs &
> KVM_DEBUGREG_AUTO_SWITCH))) {

Yes, that's true. Thanks for catching this.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3ab85c3d86ee..a9df898c6fbd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -610,8 +610,14 @@  struct kvm_pmu {
 struct kvm_pmu_ops;
 
 enum {
-	KVM_DEBUGREG_BP_ENABLED = 1,
-	KVM_DEBUGREG_WONT_EXIT = 2,
+	KVM_DEBUGREG_BP_ENABLED		= BIT(0),
+	KVM_DEBUGREG_WONT_EXIT		= BIT(1),
+	/*
+	 * Guest debug registers (DR0-3 and DR6) are saved/restored by hardware
+	 * on exit from or enter to guest. KVM needn't switch them. Because DR7
+	 * is cleared on exit from guest, DR7 need to be saved/restored.
+	 */
+	KVM_DEBUGREG_AUTO_SWITCH	= BIT(2),
 };
 
 struct kvm_mtrr_range {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 7aa9188f384d..ab7403a19c5d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -586,6 +586,7 @@  int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
 
+	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
 	vcpu->arch.cr0_guest_owned_bits = -1ul;
 	vcpu->arch.cr4_guest_owned_bits = -1ul;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b189e86a1f1..fb7597c22f31 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11013,7 +11013,7 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.guest_fpu.xfd_err)
 		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
 
-	if (unlikely(vcpu->arch.switch_db_regs)) {
+	if (unlikely(vcpu->arch.switch_db_regs & ~KVM_DEBUGREG_AUTO_SWITCH)) {
 		set_debugreg(0, 7);
 		set_debugreg(vcpu->arch.eff_db[0], 0);
 		set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -11059,6 +11059,7 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 */
 	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+		WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
 		static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
 		kvm_update_dr0123(vcpu);
 		kvm_update_dr7(vcpu);
@@ -11071,8 +11072,12 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 * care about the messed up debug address registers. But if
 	 * we have some of them active, restore the old state.
 	 */
-	if (hw_breakpoint_active())
-		hw_breakpoint_restore();
+	if (hw_breakpoint_active()) {
+		if (!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))
+			hw_breakpoint_restore();
+		else
+			set_debugreg(__this_cpu_read(cpu_dr7), 7);
+	}
 
 	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
 	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());