
[v2,4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()

Message ID 8e2aae32-917c-8035-1aef-8b47c321e42b@suse.com (mailing list archive)
State New, archived
Series XSA-292 follow-up

Commit Message

Jan Beulich Sept. 17, 2019, 6:15 a.m. UTC
The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
particular not when loading nested guest state.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
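
For context, the interface change at the core of this patch is the new "noflush" parameter of hvm_set_cr3() (the prototype below matches the support.h hunk in the patch; the comment block is an editorial summary, not an additional hunk):

/*
 * Summary of the reworked interface:
 *  noflush   - a MOV to CR3 had bit 63 (X86_CR3_NOFLUSH) set, i.e. TLB
 *              entries for the current PCID are to be preserved rather
 *              than flushed.  The MOV-to-CR3 emulation paths compute this;
 *              the task-switch and nested-virt callers pass false; the
 *              vm_event resume path replays the flag saved when the write
 *              was deferred.
 *  may_defer - the write may be deferred to hvm_do_resume() when a monitor
 *              subscriber has registered for CR3 write events.
 */
int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer);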

Comments

Paul Durrant Sept. 17, 2019, 12:45 p.m. UTC | #1
> -----Original Message-----
> From: Xen-devel <xen-devel-bounces@lists.xenproject.org> On Behalf Of Jan Beulich
> Sent: 17 September 2019 07:15
> To: xen-devel@lists.xenproject.org
> Cc: Kevin Tian <kevin.tian@intel.com>; Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>; Wei Liu
> <wl@xen.org>; Paul Durrant <paul@xen.org>; George Dunlap <George.Dunlap@citrix.com>; Andrew Cooper
> <Andrew.Cooper3@citrix.com>; Jun Nakajima <jun.nakajima@intel.com>; Boris Ostrovsky
> <boris.ostrovsky@oracle.com>; Roger Pau Monne <roger.pau@citrix.com>
> Subject: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
> 
> The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
> particular not when loading nested guest state.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -2080,6 +2080,8 @@ static int hvmemul_write_cr(
>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
>      switch ( reg )
>      {
> +        bool noflush;
> +

I said this before... I think the tighter case-scope is better. Cosmetically it may look a little odd, but surely it gives the compiler a better chance to optimize?

  Paul

>      case 0:
>          rc = hvm_set_cr0(val, true);
>          break;
> @@ -2090,7 +2092,10 @@ static int hvmemul_write_cr(
>          break;
> 
>      case 3:
> -        rc = hvm_set_cr3(val, true);
> +        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
> +        if ( noflush )
> +            val &= ~X86_CR3_NOFLUSH;
> +        rc = hvm_set_cr3(val, noflush, true);
>          break;
> 
>      case 4:
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -2059,12 +2059,17 @@ int hvm_mov_to_cr(unsigned int cr, unsig
> 
>      switch ( cr )
>      {
> +        bool noflush;
> +
>      case 0:
>          rc = hvm_set_cr0(val, true);
>          break;
> 
>      case 3:
> -        rc = hvm_set_cr3(val, true);
> +        noflush = hvm_pcid_enabled(curr) && (val & X86_CR3_NOFLUSH);
> +        if ( noflush )
> +            val &= ~X86_CR3_NOFLUSH;
> +        rc = hvm_set_cr3(val, noflush, true);
>          break;
> 
>      case 4:
> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
>      return X86EMUL_OKAY;
>  }
> 
> -int hvm_set_cr3(unsigned long value, bool may_defer)
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
>  {
>      struct vcpu *v = current;
>      struct page_info *page;
>      unsigned long old = v->arch.hvm.guest_cr[3];
> -    bool noflush = false;
> 
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
>              /* The actual write will occur in hvm_do_resume(), if permitted. */
>              v->arch.vm_event->write_data.do_write.cr3 = 1;
>              v->arch.vm_event->write_data.cr3 = value;
> +            v->arch.vm_event->write_data.cr3_noflush = noflush;
> 
>              return X86EMUL_OKAY;
>          }
>      }
> 
> -    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
> -    {
> -        noflush = value & X86_CR3_NOFLUSH;
> -        value &= ~X86_CR3_NOFLUSH;
> -    }
> -
>      if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
>           (value != v->arch.hvm.guest_cr[3]) )
>      {
> @@ -3004,7 +3003,7 @@ void hvm_task_switch(
>      if ( task_switch_load_seg(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
>          goto out;
> 
> -    rc = hvm_set_cr3(tss.cr3, true);
> +    rc = hvm_set_cr3(tss.cr3, false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
>      if ( rc != X86EMUL_OKAY )
> --- a/xen/arch/x86/hvm/svm/nestedsvm.c
> +++ b/xen/arch/x86/hvm/svm/nestedsvm.c
> @@ -324,7 +324,7 @@ static int nsvm_vcpu_hostrestore(struct
>          v->arch.guest_table = pagetable_null();
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
>      }
> -    rc = hvm_set_cr3(n1vmcb->_cr3, true);
> +    rc = hvm_set_cr3(n1vmcb->_cr3, false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
>      if (rc != X86EMUL_OKAY)
> @@ -584,7 +584,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
>          nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
> 
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
> -        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
> +        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
>          if ( rc == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
>          if (rc != X86EMUL_OKAY)
> @@ -598,7 +598,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
>           * we assume it intercepts page faults.
>           */
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
> -        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
> +        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
>          if ( rc == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
>          if (rc != X86EMUL_OKAY)
> --- a/xen/arch/x86/hvm/vm_event.c
> +++ b/xen/arch/x86/hvm/vm_event.c
> @@ -110,7 +110,7 @@ void hvm_vm_event_do_resume(struct vcpu
> 
>      if ( unlikely(w->do_write.cr3) )
>      {
> -        if ( hvm_set_cr3(w->cr3, false) == X86EMUL_EXCEPTION )
> +        if ( hvm_set_cr3(w->cr3, w->cr3_noflush, false) == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
>          w->do_write.cr3 = 0;
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -1032,7 +1032,7 @@ static void load_shadow_guest_state(stru
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> -    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), true);
> +    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> @@ -1246,7 +1246,7 @@ static void load_vvmcs_host_state(struct
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> -    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), true);
> +    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -274,6 +274,8 @@ struct monitor_write_data {
>          unsigned int cr4 : 1;
>      } do_write;
> 
> +    bool cr3_noflush;
> +
>      uint32_t msr;
>      uint64_t value;
>      uint64_t cr0;
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -135,7 +135,7 @@ void hvm_shadow_handle_cd(struct vcpu *v
>   */
>  int hvm_set_efer(uint64_t value);
>  int hvm_set_cr0(unsigned long value, bool may_defer);
> -int hvm_set_cr3(unsigned long value, bool may_defer);
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer);
>  int hvm_set_cr4(unsigned long value, bool may_defer);
>  int hvm_descriptor_access_intercept(uint64_t exit_info,
>                                      uint64_t vmx_exit_qualification,
> 
> 
Jan Beulich Sept. 17, 2019, 1:03 p.m. UTC | #2
On 17.09.2019 14:45, Paul Durrant wrote:
>> From: Xen-devel <xen-devel-bounces@lists.xenproject.org> On Behalf Of Jan Beulich
>> Sent: 17 September 2019 07:15
>>
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -2080,6 +2080,8 @@ static int hvmemul_write_cr(
>>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
>>      switch ( reg )
>>      {
>> +        bool noflush;
>> +
> 
> I said this before... I think the tighter case-scope is better.

I recall you saying so, but I don't recall you actually making this a
requirement to get your ack.

> Cosmetically it may look a little odd, but surely it gives the
> compiler a better chance to optimize?

I don't think so - compilers are pretty good at limiting the live range
of variables (at least in not overly hairy functions) these days. The
narrower scopes we often ask for are more for the reader, to make clear
what the intended usage range of a variable is.

Jan
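
For readers following the style discussion, a minimal sketch of the tighter case-scope Paul is asking for, as opposed to the declaration at the top of the switch body used by the patch (illustration only, using the names from the patch):

    switch ( reg )
    {
    case 0:
        rc = hvm_set_cr0(val, true);
        break;

    case 3:
    {
        /* Declaration confined to this case only. */
        bool noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);

        if ( noflush )
            val &= ~X86_CR3_NOFLUSH;
        rc = hvm_set_cr3(val, noflush, true);
        break;
    }

    case 4:
        rc = hvm_set_cr4(val, true);
        break;
    }

Either form is valid C; as Jan notes, the difference is about readability rather than generated code.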
Paul Durrant Sept. 17, 2019, 1:38 p.m. UTC | #3
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 17 September 2019 14:03
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org; Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>; Andrew
> Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; Jun Nakajima <jun.nakajima@intel.com>; Kevin Tian <kevin.tian@intel.com>;
> Boris Ostrovsky <boris.ostrovsky@oracle.com>; Paul Durrant <paul@xen.org>; Wei Liu <wl@xen.org>
> Subject: Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
> 
> On 17.09.2019 14:45, Paul Durrant wrote:
> >> From: Xen-devel <xen-devel-bounces@lists.xenproject.org> On Behalf Of Jan Beulich
> >> Sent: 17 September 2019 07:15
> >>
> >> --- a/xen/arch/x86/hvm/emulate.c
> >> +++ b/xen/arch/x86/hvm/emulate.c
> >> @@ -2080,6 +2080,8 @@ static int hvmemul_write_cr(
> >>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
> >>      switch ( reg )
> >>      {
> >> +        bool noflush;
> >> +
> >
> > I said this before... I think the tighter case-scope is better.
> 
> I recall you saying so, but I don't recall you actually making this a
> requirement to get your ack.
> 
> > Cosmetically it may look a little odd, but surely it gives the
> > compiler a better chance to optimize?
> 
> I don't think so - compilers are pretty good at limiting the live range
> of variables (at least in not overly hairy functions) these days. The
> narrower scopes we often ask for are more for the reader, to make clear
> what the intended usage range of a variable is.

Ok. I'm not going to insist, but I would still prefer case-scope here.

Reviewed-by: Paul Durrant <paul@xen.org>

> 
> Jan
Andrew Cooper Sept. 17, 2019, 7:31 p.m. UTC | #4
On 17/09/2019 07:15, Jan Beulich wrote:
> The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
> particular not when loading nested guest state.

I've found a footnote for "26.3.1.1 Checks on Guest Control Registers,
Debug Registers, and MSRs" stating:

"Bit 63 of the CR3 field in the guest-state area must be 0. This is true
even though, If CR4.PCIDE = 1, bit 63 of the source operand to MOV to
CR3 is used to determine whether cached translation information is
invalidated."

This is fairly clear that attempting to set the NOFLUSH bit in the VMCS
is going to cause problems, and I seem to recall some vmentry failures
reported in the past on the subject.

What it isn't so clear on is whether we will ever find a VMCS with the
NOFLUSH bit set in guest state.  Sadly, the vmexit side of the above
comment (which was for vmentry) simply says "The contents of ... CR3,
... are saved into the corresponding fields."
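
Expressed as code, the constraint quoted above amounts to the following invariant on whatever value is propagated into the VMCS CR3 field (a hypothetical sketch using the VMX helper names for illustration; not part of this patch):

    /* Bit 63 must be clear in the VMCS CR3 field, or vmentry fails the
     * guest-state checks; once the callers strip the bit, guest_cr[3]
     * can never carry it into the VMCS. */
    unsigned long cr3 = v->arch.hvm.guest_cr[3];

    ASSERT(!(cr3 & X86_CR3_NOFLUSH));
    __vmwrite(GUEST_CR3, cr3);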

Anyway, I'm in principle ok with the change, except for...

> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
>      return X86EMUL_OKAY;
>  }
>  
> -int hvm_set_cr3(unsigned long value, bool may_defer)
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
>  {
>      struct vcpu *v = current;
>      struct page_info *page;
>      unsigned long old = v->arch.hvm.guest_cr[3];
> -    bool noflush = false;
>  
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
>              /* The actual write will occur in hvm_do_resume(), if permitted. */
>              v->arch.vm_event->write_data.do_write.cr3 = 1;
>              v->arch.vm_event->write_data.cr3 = value;
> +            v->arch.vm_event->write_data.cr3_noflush = noflush;

... this, which I'm pretty sure breaks the reporting of noflush mov to
cr3's.  The semantics of the VMI hook is "the guest tried to write this
value to cr3", which should include the noflush bit in its architectural
position.  I suspect it also breaks a reply with the noflush bit set,
because I don't see any way that is fed back from the introspecting agent.

CC'ing the Introspection maintainers for their view.

~Andrew
Jan Beulich Sept. 18, 2019, 9:29 a.m. UTC | #5
On 17.09.2019 21:31, Andrew Cooper wrote:
> On 17/09/2019 07:15, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
>>      return X86EMUL_OKAY;
>>  }
>>  
>> -int hvm_set_cr3(unsigned long value, bool may_defer)
>> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
>>  {
>>      struct vcpu *v = current;
>>      struct page_info *page;
>>      unsigned long old = v->arch.hvm.guest_cr[3];
>> -    bool noflush = false;
>>  
>>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
>> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
>>              /* The actual write will occur in hvm_do_resume(), if permitted. */
>>              v->arch.vm_event->write_data.do_write.cr3 = 1;
>>              v->arch.vm_event->write_data.cr3 = value;
>> +            v->arch.vm_event->write_data.cr3_noflush = noflush;
> 
> ... this, which I'm pretty sure breaks the reporting of noflush mov to
> cr3's.  The semantics of the VMI hook is "the guest tried to write this
> value to cr3", which should include the noflush bit in its architectural
> position.  I suspect it also breaks a reply with the noflush bit set,
> because I don't see any way that is fed back from the introspecting agent.

hvm_monitor_cr() zaps the bit off the reported value. I wonder
whether the patch here should include removing this zapping, as
being redundant now. I don't think though that the patch alters
behavior in this regard.
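
The zapping being referred to is of roughly this shape (a sketch under the assumption that hvm_monitor_cr() masks the bit before forwarding the value to the agent; not the verbatim Xen code):

    /* Sketch: hide the noflush bit from the value reported in the CR3 event. */
    if ( hvm_pcid_enabled(current) )
        value &= ~X86_CR3_NOFLUSH;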

> CC'ing the Introspection maintainers for their view.

I'll wait for their feedback anyway, before making any possible
change to the patch.

Jan
Tamas K Lengyel Sept. 18, 2019, 3:43 p.m. UTC | #6
On Wed, Sep 18, 2019 at 3:29 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 17.09.2019 21:31, Andrew Cooper wrote:
> > On 17/09/2019 07:15, Jan Beulich wrote:
> >> --- a/xen/arch/x86/hvm/hvm.c
> >> +++ b/xen/arch/x86/hvm/hvm.c
> >> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
> >>      return X86EMUL_OKAY;
> >>  }
> >>
> >> -int hvm_set_cr3(unsigned long value, bool may_defer)
> >> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
> >>  {
> >>      struct vcpu *v = current;
> >>      struct page_info *page;
> >>      unsigned long old = v->arch.hvm.guest_cr[3];
> >> -    bool noflush = false;
> >>
> >>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
> >>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> >> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
> >>              /* The actual write will occur in hvm_do_resume(), if permitted. */
> >>              v->arch.vm_event->write_data.do_write.cr3 = 1;
> >>              v->arch.vm_event->write_data.cr3 = value;
> >> +            v->arch.vm_event->write_data.cr3_noflush = noflush;
> >
> > ... this, which I'm pretty sure breaks the reporting of noflush mov to
> > cr3's.  The semantics of the VMI hook is "the guest tried to write this
> > value to cr3", which should include the noflush bit in its architectural
> > position.  I suspect it also breaks a reply with the noflush bit set,
> > because I don't see any way that is fed back from the introspecting agent.
>
> hvm_monitor_cr() zaps the bit off the reported value. I wonder
> whether the patch here should include removing this zapping, as
> being redundant now. I don't think though that the patch alters
> behavior in this regard.
>
> > CC'ing the Introspection maintainers for their view.
>
> I'll wait for their feedback anyway, before making any possible
> change to the patch.
>

I'm not aware of any use-case of the CR3 introspection that needs to
know whether the noflush bit was set or not. Having it zapped before
the event is sent out simplifies handling of CR3 events because we
don't have to constantly reply with a different cr3 value from the
monitor agent with that bit masked. So unless that changes and we come
up with a use case that needs it, I wouldn't change how things are right
now.

Tamas

Patch

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2080,6 +2080,8 @@  static int hvmemul_write_cr(
     HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
     switch ( reg )
     {
+        bool noflush;
+
     case 0:
         rc = hvm_set_cr0(val, true);
         break;
@@ -2090,7 +2092,10 @@  static int hvmemul_write_cr(
         break;
 
     case 3:
-        rc = hvm_set_cr3(val, true);
+        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
+        if ( noflush )
+            val &= ~X86_CR3_NOFLUSH;
+        rc = hvm_set_cr3(val, noflush, true);
         break;
 
     case 4:
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2059,12 +2059,17 @@  int hvm_mov_to_cr(unsigned int cr, unsig
 
     switch ( cr )
     {
+        bool noflush;
+
     case 0:
         rc = hvm_set_cr0(val, true);
         break;
 
     case 3:
-        rc = hvm_set_cr3(val, true);
+        noflush = hvm_pcid_enabled(curr) && (val & X86_CR3_NOFLUSH);
+        if ( noflush )
+            val &= ~X86_CR3_NOFLUSH;
+        rc = hvm_set_cr3(val, noflush, true);
         break;
 
     case 4:
@@ -2282,12 +2287,11 @@  int hvm_set_cr0(unsigned long value, boo
     return X86EMUL_OKAY;
 }
 
-int hvm_set_cr3(unsigned long value, bool may_defer)
+int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
 {
     struct vcpu *v = current;
     struct page_info *page;
     unsigned long old = v->arch.hvm.guest_cr[3];
-    bool noflush = false;
 
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
@@ -2299,17 +2303,12 @@  int hvm_set_cr3(unsigned long value, boo
             /* The actual write will occur in hvm_do_resume(), if permitted. */
             v->arch.vm_event->write_data.do_write.cr3 = 1;
             v->arch.vm_event->write_data.cr3 = value;
+            v->arch.vm_event->write_data.cr3_noflush = noflush;
 
             return X86EMUL_OKAY;
         }
     }
 
-    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
-    {
-        noflush = value & X86_CR3_NOFLUSH;
-        value &= ~X86_CR3_NOFLUSH;
-    }
-
     if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
          (value != v->arch.hvm.guest_cr[3]) )
     {
@@ -3004,7 +3003,7 @@  void hvm_task_switch(
     if ( task_switch_load_seg(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
         goto out;
 
-    rc = hvm_set_cr3(tss.cr3, true);
+    rc = hvm_set_cr3(tss.cr3, false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
     if ( rc != X86EMUL_OKAY )
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -324,7 +324,7 @@  static int nsvm_vcpu_hostrestore(struct
         v->arch.guest_table = pagetable_null();
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
     }
-    rc = hvm_set_cr3(n1vmcb->_cr3, true);
+    rc = hvm_set_cr3(n1vmcb->_cr3, false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
     if (rc != X86EMUL_OKAY)
@@ -584,7 +584,7 @@  static int nsvm_vmcb_prepare4vmrun(struc
         nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
 
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
-        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
+        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
         if ( rc == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
         if (rc != X86EMUL_OKAY)
@@ -598,7 +598,7 @@  static int nsvm_vmcb_prepare4vmrun(struc
          * we assume it intercepts page faults.
          */
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
-        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
+        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
         if ( rc == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
         if (rc != X86EMUL_OKAY)
--- a/xen/arch/x86/hvm/vm_event.c
+++ b/xen/arch/x86/hvm/vm_event.c
@@ -110,7 +110,7 @@  void hvm_vm_event_do_resume(struct vcpu
 
     if ( unlikely(w->do_write.cr3) )
     {
-        if ( hvm_set_cr3(w->cr3, false) == X86EMUL_EXCEPTION )
+        if ( hvm_set_cr3(w->cr3, w->cr3_noflush, false) == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
         w->do_write.cr3 = 0;
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1032,7 +1032,7 @@  static void load_shadow_guest_state(stru
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
-    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), true);
+    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
@@ -1246,7 +1246,7 @@  static void load_vvmcs_host_state(struct
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
-    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), true);
+    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -274,6 +274,8 @@  struct monitor_write_data {
         unsigned int cr4 : 1;
     } do_write;
 
+    bool cr3_noflush;
+
     uint32_t msr;
     uint64_t value;
     uint64_t cr0;
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -135,7 +135,7 @@  void hvm_shadow_handle_cd(struct vcpu *v
  */
 int hvm_set_efer(uint64_t value);
 int hvm_set_cr0(unsigned long value, bool may_defer);
-int hvm_set_cr3(unsigned long value, bool may_defer);
+int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer);
 int hvm_set_cr4(unsigned long value, bool may_defer);
 int hvm_descriptor_access_intercept(uint64_t exit_info,
                                     uint64_t vmx_exit_qualification,