[v4,10/15] KVM: arm: implement world switch for debug registers
diff mbox

Message ID 1439213167-8988-11-git-send-email-zhichao.huang@linaro.org
State New
Headers show

Commit Message

Zhichao Huang Aug. 10, 2015, 1:26 p.m. UTC
Implement switching of the debug registers. While the number
of registers is massive, CPUs usually don't implement them all
(A15 has 6 breakpoints and 4 watchpoints, which gives us a total
of 22 registers "only").

Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
---
 arch/arm/kvm/interrupts_head.S | 170 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 159 insertions(+), 11 deletions(-)

Comments

Christoffer Dall Sept. 2, 2015, 2:53 p.m. UTC | #1
On Mon, Aug 10, 2015 at 09:26:02PM +0800, Zhichao Huang wrote:
> Implement switching of the debug registers. While the number
> of registers is massive, CPUs usually don't implement them all
> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
> of 22 registers "only").
> 
> Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
> ---
>  arch/arm/kvm/interrupts_head.S | 170 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 159 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
> index 7ac5e51..b9e7410 100644
> --- a/arch/arm/kvm/interrupts_head.S
> +++ b/arch/arm/kvm/interrupts_head.S
> @@ -5,6 +5,7 @@
>  #define VCPU_USR_SP		(VCPU_USR_REG(13))
>  #define VCPU_USR_LR		(VCPU_USR_REG(14))
>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
> +#define CP14_OFFSET(_cp14_reg_idx) ((_cp14_reg_idx) * 4)
>  
>  /*
>   * Many of these macros need to access the VCPU structure, which is always
> @@ -239,6 +240,136 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	save_guest_regs_mode irq, #VCPU_IRQ_REGS
>  .endm
>  
> +/* Assume r10/r11/r12 are in use, clobbers r2-r3 */
> +.macro cp14_read_and_str base Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3

can this code be compiled in Thumb-2 ?  If so, are all the instructions
below 32-bit wide?

> +	bx	r3
> +1:
> +	mrc	p14, 0, r2, c0, c15, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+15)]
> +	mrc	p14, 0, r2, c0, c14, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+14)]
> +	mrc	p14, 0, r2, c0, c13, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+13)]
> +	mrc	p14, 0, r2, c0, c12, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+12)]
> +	mrc	p14, 0, r2, c0, c11, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+11)]
> +	mrc	p14, 0, r2, c0, c10, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+10)]
> +	mrc	p14, 0, r2, c0, c9, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+9)]
> +	mrc	p14, 0, r2, c0, c8, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+8)]
> +	mrc	p14, 0, r2, c0, c7, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+7)]
> +	mrc	p14, 0, r2, c0, c6, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+6)]
> +	mrc	p14, 0, r2, c0, c5, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+5)]
> +	mrc	p14, 0, r2, c0, c4, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+4)]
> +	mrc	p14, 0, r2, c0, c3, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+3)]
> +	mrc	p14, 0, r2, c0, c2, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+2)]
> +	mrc	p14, 0, r2, c0, c1, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+1)]
> +	mrc	p14, 0, r2, c0, c0, \Op2
> +	str     r2, [\base, #CP14_OFFSET(\cp14_reg0)]
> +.endm
> +
> +/* Assume r11/r12 are in use, clobbers r2-r3 */
> +.macro cp14_ldr_and_write base Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3

see above

> +	bx	r3
> +1:
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+15)]
> +	mcr	p14, 0, r2, c0, c15, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+14)]
> +	mcr	p14, 0, r2, c0, c14, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+13)]
> +	mcr	p14, 0, r2, c0, c13, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+12)]
> +	mcr	p14, 0, r2, c0, c12, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+11)]
> +	mcr	p14, 0, r2, c0, c11, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+10)]
> +	mcr	p14, 0, r2, c0, c10, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+9)]
> +	mcr	p14, 0, r2, c0, c9, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+8)]
> +	mcr	p14, 0, r2, c0, c8, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+7)]
> +	mcr	p14, 0, r2, c0, c7, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+6)]
> +	mcr	p14, 0, r2, c0, c6, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+5)]
> +	mcr	p14, 0, r2, c0, c5, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+4)]
> +	mcr	p14, 0, r2, c0, c4, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+3)]
> +	mcr	p14, 0, r2, c0, c3, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+2)]
> +	mcr	p14, 0, r2, c0, c2, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+1)]
> +	mcr	p14, 0, r2, c0, c1, \Op2
> +	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0)]
> +	mcr	p14, 0, r2, c0, c0, \Op2
> +.endm
> +
> +/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
> +.macro read_hw_dbg_num
> +	mrc	p14, 0, r2, c0, c0, 0

can you add @ DBGDIDR here, so we know which register we are looking at?

> +	ubfx	r11, r2, #24, #4
> +	add	r11, r11, #1		@ Extract BRPs
> +	ubfx	r12, r2, #28, #4
> +	add	r12, r12, #1		@ Extract WRPs
> +	mov	r2, #16
> +	sub	r11, r2, r11		@ How many BPs to skip
> +	sub	r12, r2, r12		@ How many WPs to skip
> +.endm
> +
> +/* Reads cp14 registers from hardware.
> + * Writes cp14 registers in-order to the CP14 struct pointed to by r10
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_debug_state
> +	read_hw_dbg_num
> +	cp14_read_and_str r10, 4, cp14_DBGBVR0, r11
> +	cp14_read_and_str r10, 5, cp14_DBGBCR0, r11
> +	cp14_read_and_str r10, 6, cp14_DBGWVR0, r12
> +	cp14_read_and_str r10, 7, cp14_DBGWCR0, r12
> +
> +	/* DBGDSCR reg */
> +	mrc	p14, 0, r2, c0, c1, 0
> +	str	r2, [r10, #CP14_OFFSET(cp14_DBGDSCRext)]

so again we're touching the scary register on every world-switch.  Since
it sounds like we have experience telling us that this can cause
troubles, I'm wondering if we can get around it by:

Only ever allow the guest to use debugging registers if we managed to
enter_monitor_mode on the host, and in that case only allow guest
debugging with the configuration of DBGDSCR that the host has.

If the host never managed to enable debugging, the guest probably won't
succeed either, and we should just trap all guest accesses to the debug
registers.

Does this work?

> +.endm
> +
> +/* Reads cp14 registers in-order from the CP14 struct pointed to by r10
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_debug_state
> +	read_hw_dbg_num
> +	cp14_ldr_and_write r10, 4, cp14_DBGBVR0, r11
> +	cp14_ldr_and_write r10, 5, cp14_DBGBCR0, r11
> +	cp14_ldr_and_write r10, 6, cp14_DBGWVR0, r12
> +	cp14_ldr_and_write r10, 7, cp14_DBGWCR0, r12
> +
> +	/* DBGDSCR reg */
> +	ldr	r2, [r10, #CP14_OFFSET(cp14_DBGDSCRext)]
> +	mcr	p14, 0, r2, c0, c2, 2

same as above

> +.endm
> +
>  /* Reads cp14/cp15 registers from hardware and stores them in memory
>   * @store_to_vcpu: If 0, registers are written in-order to the stack,
>   * 		   otherwise to the VCPU struct pointed to by vcpup
> @@ -248,11 +379,17 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>   * Clobbers r2 - r12
>   */
>  .macro read_coproc_state store_to_vcpu
> -	.if \store_to_vcpu == 0
> -	mrc	p14, 0, r2, c0, c1, 0	@ DBGDSCR
> -	push	{r2}
> +	.if \store_to_vcpu == 1
> +	add	r10, vcpu, #VCPU_CP14
> +	.else
> +	add	r10, vcpu, #VCPU_HOST_CONTEXT
> +	ldr	r10, [r10]
> +	add	r10, r10, #VCPU_CP14_HOST
>  	.endif
>  
> +	/* Assumes r10 pointer in cp14 regs  */
> +	bl __save_debug_state
> +
>  	mrc	p15, 0, r2, c1, c0, 0	@ SCTLR
>  	mrc	p15, 0, r3, c1, c0, 2	@ CPACR
>  	mrc	p15, 0, r4, c2, c0, 2	@ TTBCR
> @@ -331,6 +468,17 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>   * Assumes vcpu pointer in vcpu reg
>   */
>  .macro write_coproc_state read_from_vcpu
> +	.if \read_from_vcpu == 1
> +	add	r10, vcpu, #VCPU_CP14
> +	.else
> +	add	r10, vcpu, #VCPU_HOST_CONTEXT
> +	ldr	r10, [r10]
> +	add	r10, r10, #VCPU_CP14_HOST
> +	.endif
> +
> +	/* Assumes r10 pointer in cp14 regs  */
> +	bl __restore_debug_state
> +
>  	.if \read_from_vcpu == 0
>  	pop	{r2,r4-r7}
>  	.else
> @@ -399,14 +547,6 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	mcr	p15, 0, r10, c10, c2, 0	@ PRRR
>  	mcr	p15, 0, r11, c10, c2, 1	@ NMRR
>  	mcr	p15, 2, r12, c0, c0, 0	@ CSSELR
> -
> -	.if \read_from_vcpu == 0
> -	pop	{r2}
> -	.else
> -	mov	r2, #0
> -	.endif
> -
> -	mcr	p14, 0, r2, c0, c2, 2	@ DBGDSCR
>  .endm
>  
>  /*
> @@ -657,3 +797,11 @@ ARM_BE8(rev	r6, r6  )
>  .macro load_vcpu
>  	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
>  .endm
> +
> +__save_debug_state:
> +	save_debug_state
> +	bx	lr
> +
> +__restore_debug_state:
> +	restore_debug_state
> +	bx	lr
> -- 
> 1.7.12.4
> 

The rest of the register mangling looks ok this time though.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhichao Huang Sept. 29, 2015, 5:32 a.m. UTC | #2
On 2015/9/2 22:53, Christoffer Dall wrote:
>> +/* Reads cp14 registers from hardware.
>> + * Writes cp14 registers in-order to the CP14 struct pointed to by r10
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro save_debug_state
>> +	read_hw_dbg_num
>> +	cp14_read_and_str r10, 4, cp14_DBGBVR0, r11
>> +	cp14_read_and_str r10, 5, cp14_DBGBCR0, r11
>> +	cp14_read_and_str r10, 6, cp14_DBGWVR0, r12
>> +	cp14_read_and_str r10, 7, cp14_DBGWCR0, r12
>> +
>> +	/* DBGDSCR reg */
>> +	mrc	p14, 0, r2, c0, c1, 0
>> +	str	r2, [r10, #CP14_OFFSET(cp14_DBGDSCRext)]
> 
> so again we're touching the scary register on every world-switch.  Since
> it sounds like we have experience telling us that this can cause
> troubles, I'm wondering if we can get around it by:
> 
> Only ever allow the guest to use debugging registers if we managed to
> enter_monitor_mode on the host, and in that case only allow guest
> debugging with the configuration of DBGDSCR that the host has.
> 
> If the host never managed to enable debugging, the guest probably won't
> succeed either, and we should just trap all guest accesses to the debug
> registers.
> 
> Does this work?
> 

I think it works. Since the register is dangerous, we will try not to
world switch it. It means that the guest will not be able to write the register,
and will always see what the host set. So the guest will not be able to use
the hardware debug feature if the host disables it.

> 
> -Christoffer
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoffer Dall Sept. 29, 2015, 7:34 a.m. UTC | #3
On Tue, Sep 29, 2015 at 01:32:35PM +0800, Zhichao Huang wrote:
> 
> 
> On 2015/9/2 22:53, Christoffer Dall wrote:
> >> +/* Reads cp14 registers from hardware.
> >> + * Writes cp14 registers in-order to the CP14 struct pointed to by r10
> >> + *
> >> + * Assumes vcpu pointer in vcpu reg
> >> + *
> >> + * Clobbers r2-r12
> >> + */
> >> +.macro save_debug_state
> >> +	read_hw_dbg_num
> >> +	cp14_read_and_str r10, 4, cp14_DBGBVR0, r11
> >> +	cp14_read_and_str r10, 5, cp14_DBGBCR0, r11
> >> +	cp14_read_and_str r10, 6, cp14_DBGWVR0, r12
> >> +	cp14_read_and_str r10, 7, cp14_DBGWCR0, r12
> >> +
> >> +	/* DBGDSCR reg */
> >> +	mrc	p14, 0, r2, c0, c1, 0
> >> +	str	r2, [r10, #CP14_OFFSET(cp14_DBGDSCRext)]
> > 
> > so again we're touching the scary register on every world-switch.  Since
> > it sounds like we have experience telling us that this can cause
> > troubles, I'm wondering if we can get around it by:
> > 
> > Only ever allow the guest to use debugging registers if we managed to
> > enter_monitor_mode on the host, and in that case only allow guest
> > debugging with the configuration of DBGDSCR that the host has.
> > 
> > If the host never managed to enable debugging, the guest probably won't
> > succeed either, and we should just trap all guest accesses to the debug
> > registers.
> > 
> > Does this work?
> > 
> 
> I think it works. Since the register is dangerous, we will try not to
> world switch it. It means that the guest will not be able to write the register,
> and will always see what the host set. So the guest will not be able to use
> hardware debug feature if the host disable it.
> 
I talked to Will about this the last day of Linaro Connect and we
arrived at the conclusion that since you cannot trap on only a subset of
the debug registers (namely the DBGDSCR and the registers related to the
OS lock etc.), there is simply no safe and robust way to do this on
ARMv7.

Therefore, this patch series probably needs to be reworked so that we
*always* set TDA for the guest, and when the guest programs a break- or
watchpoint, then we program the hardware registers directly in KVM
without telling perf or hw_breakpoints about this, and we only do any of
this if monitor_mode_enabled() is true.  If not, then debugging inside
the guest simply won't work.

I also don't think that checking the host's breakpoint and watchpoint
registers is necessary after all, since they are context switched per
process on the host, so this will only be a problem for guests when the
user uses GDB on the VCPU thread (in which case he's asking for trouble
anyway) or with using the weird perf CPU-wide breakpoints, which is very
far from the common case, so I wouldn't worry about this for now.

Hopefully this makes sense.

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index 7ac5e51..b9e7410 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -5,6 +5,7 @@ 
 #define VCPU_USR_SP		(VCPU_USR_REG(13))
 #define VCPU_USR_LR		(VCPU_USR_REG(14))
 #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
+#define CP14_OFFSET(_cp14_reg_idx) ((_cp14_reg_idx) * 4)
 
 /*
  * Many of these macros need to access the VCPU structure, which is always
@@ -239,6 +240,136 @@  vcpu	.req	r0		@ vcpu pointer always in r0
 	save_guest_regs_mode irq, #VCPU_IRQ_REGS
 .endm
 
+/* Assume r10/r11/r12 are in use, clobbers r2-r3 */
+.macro cp14_read_and_str base Op2 cp14_reg0 skip_num
+	adr	r3, 1f
+	add	r3, r3, \skip_num, lsl #3
+	bx	r3
+1:
+	mrc	p14, 0, r2, c0, c15, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+15)]
+	mrc	p14, 0, r2, c0, c14, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+14)]
+	mrc	p14, 0, r2, c0, c13, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+13)]
+	mrc	p14, 0, r2, c0, c12, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+12)]
+	mrc	p14, 0, r2, c0, c11, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+11)]
+	mrc	p14, 0, r2, c0, c10, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+10)]
+	mrc	p14, 0, r2, c0, c9, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+9)]
+	mrc	p14, 0, r2, c0, c8, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+8)]
+	mrc	p14, 0, r2, c0, c7, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+7)]
+	mrc	p14, 0, r2, c0, c6, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+6)]
+	mrc	p14, 0, r2, c0, c5, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+5)]
+	mrc	p14, 0, r2, c0, c4, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+4)]
+	mrc	p14, 0, r2, c0, c3, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+3)]
+	mrc	p14, 0, r2, c0, c2, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+2)]
+	mrc	p14, 0, r2, c0, c1, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0+1)]
+	mrc	p14, 0, r2, c0, c0, \Op2
+	str     r2, [\base, #CP14_OFFSET(\cp14_reg0)]
+.endm
+
+/* Assume r11/r12 are in use, clobbers r2-r3 */
+.macro cp14_ldr_and_write base Op2 cp14_reg0 skip_num
+	adr	r3, 1f
+	add	r3, r3, \skip_num, lsl #3
+	bx	r3
+1:
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+15)]
+	mcr	p14, 0, r2, c0, c15, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+14)]
+	mcr	p14, 0, r2, c0, c14, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+13)]
+	mcr	p14, 0, r2, c0, c13, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+12)]
+	mcr	p14, 0, r2, c0, c12, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+11)]
+	mcr	p14, 0, r2, c0, c11, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+10)]
+	mcr	p14, 0, r2, c0, c10, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+9)]
+	mcr	p14, 0, r2, c0, c9, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+8)]
+	mcr	p14, 0, r2, c0, c8, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+7)]
+	mcr	p14, 0, r2, c0, c7, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+6)]
+	mcr	p14, 0, r2, c0, c6, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+5)]
+	mcr	p14, 0, r2, c0, c5, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+4)]
+	mcr	p14, 0, r2, c0, c4, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+3)]
+	mcr	p14, 0, r2, c0, c3, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+2)]
+	mcr	p14, 0, r2, c0, c2, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0+1)]
+	mcr	p14, 0, r2, c0, c1, \Op2
+	ldr     r2, [\base, #CP14_OFFSET(\cp14_reg0)]
+	mcr	p14, 0, r2, c0, c0, \Op2
+.endm
+
+/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
+.macro read_hw_dbg_num
+	mrc	p14, 0, r2, c0, c0, 0
+	ubfx	r11, r2, #24, #4
+	add	r11, r11, #1		@ Extract BRPs
+	ubfx	r12, r2, #28, #4
+	add	r12, r12, #1		@ Extract WRPs
+	mov	r2, #16
+	sub	r11, r2, r11		@ How many BPs to skip
+	sub	r12, r2, r12		@ How many WPs to skip
+.endm
+
+/* Reads cp14 registers from hardware.
+ * Writes cp14 registers in-order to the CP14 struct pointed to by r10
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro save_debug_state
+	read_hw_dbg_num
+	cp14_read_and_str r10, 4, cp14_DBGBVR0, r11
+	cp14_read_and_str r10, 5, cp14_DBGBCR0, r11
+	cp14_read_and_str r10, 6, cp14_DBGWVR0, r12
+	cp14_read_and_str r10, 7, cp14_DBGWCR0, r12
+
+	/* DBGDSCR reg */
+	mrc	p14, 0, r2, c0, c1, 0
+	str	r2, [r10, #CP14_OFFSET(cp14_DBGDSCRext)]
+.endm
+
+/* Reads cp14 registers in-order from the CP14 struct pointed to by r10
+ * Writes cp14 registers to hardware.
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro restore_debug_state
+	read_hw_dbg_num
+	cp14_ldr_and_write r10, 4, cp14_DBGBVR0, r11
+	cp14_ldr_and_write r10, 5, cp14_DBGBCR0, r11
+	cp14_ldr_and_write r10, 6, cp14_DBGWVR0, r12
+	cp14_ldr_and_write r10, 7, cp14_DBGWCR0, r12
+
+	/* DBGDSCR reg */
+	ldr	r2, [r10, #CP14_OFFSET(cp14_DBGDSCRext)]
+	mcr	p14, 0, r2, c0, c2, 2
+.endm
+
 /* Reads cp14/cp15 registers from hardware and stores them in memory
  * @store_to_vcpu: If 0, registers are written in-order to the stack,
  * 		   otherwise to the VCPU struct pointed to by vcpup
@@ -248,11 +379,17 @@  vcpu	.req	r0		@ vcpu pointer always in r0
  * Clobbers r2 - r12
  */
 .macro read_coproc_state store_to_vcpu
-	.if \store_to_vcpu == 0
-	mrc	p14, 0, r2, c0, c1, 0	@ DBGDSCR
-	push	{r2}
+	.if \store_to_vcpu == 1
+	add	r10, vcpu, #VCPU_CP14
+	.else
+	add	r10, vcpu, #VCPU_HOST_CONTEXT
+	ldr	r10, [r10]
+	add	r10, r10, #VCPU_CP14_HOST
 	.endif
 
+	/* Assumes r10 pointer in cp14 regs  */
+	bl __save_debug_state
+
 	mrc	p15, 0, r2, c1, c0, 0	@ SCTLR
 	mrc	p15, 0, r3, c1, c0, 2	@ CPACR
 	mrc	p15, 0, r4, c2, c0, 2	@ TTBCR
@@ -331,6 +468,17 @@  vcpu	.req	r0		@ vcpu pointer always in r0
  * Assumes vcpu pointer in vcpu reg
  */
 .macro write_coproc_state read_from_vcpu
+	.if \read_from_vcpu == 1
+	add	r10, vcpu, #VCPU_CP14
+	.else
+	add	r10, vcpu, #VCPU_HOST_CONTEXT
+	ldr	r10, [r10]
+	add	r10, r10, #VCPU_CP14_HOST
+	.endif
+
+	/* Assumes r10 pointer in cp14 regs  */
+	bl __restore_debug_state
+
 	.if \read_from_vcpu == 0
 	pop	{r2,r4-r7}
 	.else
@@ -399,14 +547,6 @@  vcpu	.req	r0		@ vcpu pointer always in r0
 	mcr	p15, 0, r10, c10, c2, 0	@ PRRR
 	mcr	p15, 0, r11, c10, c2, 1	@ NMRR
 	mcr	p15, 2, r12, c0, c0, 0	@ CSSELR
-
-	.if \read_from_vcpu == 0
-	pop	{r2}
-	.else
-	mov	r2, #0
-	.endif
-
-	mcr	p14, 0, r2, c0, c2, 2	@ DBGDSCR
 .endm
 
 /*
@@ -657,3 +797,11 @@  ARM_BE8(rev	r6, r6  )
 .macro load_vcpu
 	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
 .endm
+
+__save_debug_state:
+	save_debug_state
+	bx	lr
+
+__restore_debug_state:
+	restore_debug_state
+	bx	lr