
[8/9] KVM: x86: When guest set CR3, handle LAM bits semantics

Message ID 20221017070450.23031-9-robert.hu@linux.intel.com (mailing list archive)
State New, archived
Series Linear Address Masking (LAM) KVM Enabling

Commit Message

Robert Hoo Oct. 17, 2022, 7:04 a.m. UTC
When only the LAM bits of CR3 change, ask the next vcpu run to load the
mmu pgd, so that it builds the new CR3 with the updated LAM bits. No TLB
flush is needed in this case.
When the effective address changes, whether or not the LAM bits change,
go through the normal pgd update process.

Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
---
 arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)
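
For reference, the two CR3 control bits this patch handles live at the
top of CR3: LAM_U57 is bit 61 and LAM_U48 is bit 62, with LAM_U57 taking
precedence (see the discussion below). A sketch of the definitions,
assuming an earlier patch in this series introduces the X86_CR3_LAM_*
constants along these lines:

	#define X86_CR3_LAM_U57	BIT_ULL(61)	/* LAM for user pointers, bits 62:57 masked */
	#define X86_CR3_LAM_U48	BIT_ULL(62)	/* LAM for user pointers, bits 62:48 masked */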

Comments

Kirill A. Shutemov Oct. 31, 2022, 2:59 a.m. UTC | #1
On Mon, Oct 17, 2022 at 03:04:49PM +0800, Robert Hoo wrote:
> When only the LAM bits of CR3 change, ask the next vcpu run to load the
> mmu pgd, so that it builds the new CR3 with the updated LAM bits. No TLB
> flush is needed in this case.
> When the effective address changes, whether or not the LAM bits change,
> go through the normal pgd update process.
> 
> Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
> ---
>  arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++----
>  1 file changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e9b465bff8d3..fb779f88ae88 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1228,9 +1228,9 @@ static bool kvm_is_valid_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  {
>  	bool skip_tlb_flush = false;
> -	unsigned long pcid = 0;
> +	unsigned long pcid = 0, old_cr3;
>  #ifdef CONFIG_X86_64
> -	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
> +	bool pcid_enabled = !!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
>  
>  	if (pcid_enabled) {
>  		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
> @@ -1243,6 +1243,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
>  		goto handle_tlb_flush;
>  
> +	if (!guest_cpuid_has(vcpu, X86_FEATURE_LAM) &&
> +	    (cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)))
> +		return	1;
> +
>  	/*
>  	 * Do not condition the GPA check on long mode, this helper is used to
>  	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
> @@ -1254,8 +1258,22 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
>  		return 1;
>  
> -	if (cr3 != kvm_read_cr3(vcpu))
> -		kvm_mmu_new_pgd(vcpu, cr3);
> +	old_cr3 = kvm_read_cr3(vcpu);
> +	if (cr3 != old_cr3) {
> +		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
> +			kvm_mmu_new_pgd(vcpu, cr3 & ~(X86_CR3_LAM_U48 |
> +					X86_CR3_LAM_U57));
> +		} else {
> +			/* Only LAM conf changes, no tlb flush needed */
> +			skip_tlb_flush = true;

I'm not sure about this.

Consider the case when LAM_U48 gets enabled on a 5-level paging machine.
We may have valid TLB entries for addresses above bit 47. It's a kinda
broken case, but it seems valid from an architectural PoV, no?

I guess after enabling LAM, these entries will never match. But if LAM
gets disabled again, they will become active. Hm?

Maybe just flush?

> +			/*
> +			 * Though the effective address is unchanged, make
> +			 * the request so that the LAM bits take effect
> +			 * when entering the guest.
> +			 */
> +			kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
> +		}
> +	}
>  
>  	vcpu->arch.cr3 = cr3;
>  	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
> -- 
> 2.31.1
>
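
A minimal sketch of the conservative alternative suggested above (flush
whenever only the LAM bits change), reusing the identifiers from the
hunk under review; an illustration, not the fix as committed:

	old_cr3 = kvm_read_cr3(vcpu);
	if (cr3 != old_cr3) {
		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
			kvm_mmu_new_pgd(vcpu, cr3 & ~(X86_CR3_LAM_U48 |
					X86_CR3_LAM_U57));
		} else {
			/*
			 * Only the LAM configuration changed. Do NOT skip
			 * the flush: TLB entries for addresses above bit 47,
			 * created before LAM_U48 was toggled under 5-level
			 * paging, must not survive and resurface once LAM
			 * is disabled again.
			 */
			skip_tlb_flush = false;
			/* Reload CR3 so the new LAM bits take effect. */
			kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
		}
	}

Clearing skip_tlb_flush here would also cover the CR3.PCID_NOFLUSH
interaction raised later in the thread.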
Robert Hoo Nov. 1, 2022, 1:46 a.m. UTC | #2
On Mon, 2022-10-31 at 05:59 +0300, Kirill A. Shutemov wrote:
> On Mon, Oct 17, 2022 at 03:04:49PM +0800, Robert Hoo wrote:
> > When only the LAM bits of CR3 change, ask the next vcpu run to load
> > the mmu pgd, so that it builds the new CR3 with the updated LAM bits.
> > No TLB flush is needed in this case.
> > When the effective address changes, whether or not the LAM bits
> > change, go through the normal pgd update process.
> > 
> > Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
> > ---
> >  arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++----
> >  1 file changed, 22 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index e9b465bff8d3..fb779f88ae88 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1228,9 +1228,9 @@ static bool kvm_is_valid_cr3(struct kvm_vcpu
> > *vcpu, unsigned long cr3)
> >  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> >  {
> >  	bool skip_tlb_flush = false;
> > -	unsigned long pcid = 0;
> > +	unsigned long pcid = 0, old_cr3;
> >  #ifdef CONFIG_X86_64
> > -	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
> > +	bool pcid_enabled = !!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
> >  
> >  	if (pcid_enabled) {
> >  		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
> > @@ -1243,6 +1243,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu,
> > unsigned long cr3)
> >  	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
> >  		goto handle_tlb_flush;
> >  
> > +	if (!guest_cpuid_has(vcpu, X86_FEATURE_LAM) &&
> > +	    (cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)))
> > +		return	1;
> > +
> >  	/*
> >  	 * Do not condition the GPA check on long mode, this helper is
> > used to
> >  	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee
> > that
> > @@ -1254,8 +1258,22 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu,
> > unsigned long cr3)
> >  	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
> >  		return 1;
> >  
> > -	if (cr3 != kvm_read_cr3(vcpu))
> > -		kvm_mmu_new_pgd(vcpu, cr3);
> > +	old_cr3 = kvm_read_cr3(vcpu);
> > +	if (cr3 != old_cr3) {
> > +		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
> > +			kvm_mmu_new_pgd(vcpu, cr3 & ~(X86_CR3_LAM_U48 |
> > +					X86_CR3_LAM_U57));
> > +		} else {
> > +			/* Only LAM conf changes, no tlb flush needed
> > */
> > +			skip_tlb_flush = true;
> 
> I'm not sure about this.
> 
> Consider the case when LAM_U48 gets enabled on a 5-level paging
> machine. We may have valid TLB entries for addresses above bit 47.
> It's a kinda broken case, but it seems valid from an architectural
> PoV, no?

You're right, thanks Kirill.

I noticed that in your kernel enabling, because of this overlap between
LAM_U48 and LA57, you enabled only LAM_U57 for simplicity at this point.
I thought at the time that this trickiness would be contained in the
kernel layer, but it now turns out that at least the non-EPT KVM MMU is
not spared.
> 
> I guess after enabling LAM, these entries will never match. But if
> LAM gets disabled again, they will become active. Hm?
> 
> Maybe just flush?

Now we have 2 options:
1. As you suggested, just flush.
2. More precisely identify the case Guest.LA57 && (CR3 bits [62:61]
switching 00 --> 10), and flush only then. (The LAM_U57 bit takes
precedence over LAM_U48, per the spec.)

Considering that a CR3 change is a relatively hot path, and a TLB flush
is heavy, I lean towards option 2. Your opinion?
 
> 
> > +			/*
> > +			 * Though the effective address is unchanged, make
> > +			 * the request so that the LAM bits take effect
> > +			 * when entering the guest.
> > +			 */
> > +			kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
> > +		}
> > +	}
> >  
> >  	vcpu->arch.cr3 = cr3;
> >  	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
> > -- 
> > 2.31.1
> > 
> 
>
Kirill A. Shutemov Nov. 1, 2022, 2:04 a.m. UTC | #3
On Tue, Nov 01, 2022 at 09:46:39AM +0800, Robert Hoo wrote:
> On Mon, 2022-10-31 at 05:59 +0300, Kirill A. Shutemov wrote:
> > On Mon, Oct 17, 2022 at 03:04:49PM +0800, Robert Hoo wrote:
> > > When only the LAM bits of CR3 change, ask the next vcpu run to
> > > load the mmu pgd, so that it builds the new CR3 with the updated
> > > LAM bits. No TLB flush is needed in this case.
> > > When the effective address changes, whether or not the LAM bits
> > > change, go through the normal pgd update process.
> > > 
> > > Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
> > > ---
> > >  arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++----
> > >  1 file changed, 22 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index e9b465bff8d3..fb779f88ae88 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -1228,9 +1228,9 @@ static bool kvm_is_valid_cr3(struct kvm_vcpu
> > > *vcpu, unsigned long cr3)
> > >  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> > >  {
> > >  	bool skip_tlb_flush = false;
> > > -	unsigned long pcid = 0;
> > > +	unsigned long pcid = 0, old_cr3;
> > >  #ifdef CONFIG_X86_64
> > > -	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
> > > +	bool pcid_enabled = !!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
> > >  
> > >  	if (pcid_enabled) {
> > >  		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
> > > @@ -1243,6 +1243,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu,
> > > unsigned long cr3)
> > >  	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
> > >  		goto handle_tlb_flush;
> > >  
> > > +	if (!guest_cpuid_has(vcpu, X86_FEATURE_LAM) &&
> > > +	    (cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)))
> > > +		return	1;
> > > +
> > >  	/*
> > >  	 * Do not condition the GPA check on long mode, this helper is
> > > used to
> > >  	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee
> > > that
> > > @@ -1254,8 +1258,22 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu,
> > > unsigned long cr3)
> > >  	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
> > >  		return 1;
> > >  
> > > -	if (cr3 != kvm_read_cr3(vcpu))
> > > -		kvm_mmu_new_pgd(vcpu, cr3);
> > > +	old_cr3 = kvm_read_cr3(vcpu);
> > > +	if (cr3 != old_cr3) {
> > > +		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
> > > +			kvm_mmu_new_pgd(vcpu, cr3 & ~(X86_CR3_LAM_U48 |
> > > +					X86_CR3_LAM_U57));
> > > +		} else {
> > > +			/* Only LAM conf changes, no tlb flush needed
> > > */
> > > +			skip_tlb_flush = true;
> > 
> > I'm not sure about this.
> > 
> > Consider the case when LAM_U48 gets enabled on a 5-level paging
> > machine. We may have valid TLB entries for addresses above bit 47.
> > It's a kinda broken case, but it seems valid from an architectural
> > PoV, no?
> 
> You're right, thanks Kirill.
> 
> I noticed that in your kernel enabling, because of this overlap between
> LAM_U48 and LA57, you enabled only LAM_U57 for simplicity at this point.
> I thought at the time that this trickiness would be contained in the
> kernel layer, but it now turns out that at least the non-EPT KVM MMU is
> not spared.
> > 
> > I guess after enabling LAM, these entries will never match. But if
> > LAM gets disabled again, they will become active. Hm?
> > 
> > Maybe just flush?
> 
> Now we have 2 options:
> 1. As you suggested, just flush.
> 2. More precisely identify the case Guest.LA57 && (CR3 bits [62:61]
> switching 00 --> 10), and flush only then. (The LAM_U57 bit takes
> precedence over LAM_U48, per the spec.)
> 
> Considering that a CR3 change is a relatively hot path, and a TLB
> flush is heavy, I lean towards option 2. Your opinion?

11 in bits [62:61] is also considered LAM_U57, so your option 2 is
broken.

And I don't buy the argument about the hot path: the case we're talking
about is enabling/disabling LAM with a constant PGD. It's not a hot path
by any means.

Let's not be fancy. Just flush the TLB.
Robert Hoo Nov. 1, 2022, 2:26 a.m. UTC | #4
On Tue, 2022-11-01 at 05:04 +0300, Kirill A. Shutemov wrote:
> > > > @@ -1254,8 +1258,22 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu,
> > > > unsigned long cr3)
> > > >  	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
> > > >  		return 1;
> > > >  
> > > > -	if (cr3 != kvm_read_cr3(vcpu))
> > > > -		kvm_mmu_new_pgd(vcpu, cr3);
> > > > +	old_cr3 = kvm_read_cr3(vcpu);
> > > > +	if (cr3 != old_cr3) {
> > > > +		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
> > > > +			kvm_mmu_new_pgd(vcpu, cr3 &
> > > > ~(X86_CR3_LAM_U48 |
> > > > +					X86_CR3_LAM_U57));
> > > > +		} else {
> > > > +			/* Only LAM conf changes, no tlb flush
> > > > needed
> > > > */
> > > > +			skip_tlb_flush = true;
> > > 
> > > I'm not sure about this.
> > > 
> > > Consider the case when LAM_U48 gets enabled on a 5-level paging
> > > machine. We may have valid TLB entries for addresses above bit 47.
> > > It's a kinda broken case, but it seems valid from an architectural
> > > PoV, no?
> > 
> > You're right, thanks Kirill.
> > 
> > I noticed that in your kernel enabling, because of this overlap
> > between LAM_U48 and LA57, you enabled only LAM_U57 for simplicity at
> > this point. I thought at the time that this trickiness would be
> > contained in the kernel layer, but it now turns out that at least
> > the non-EPT KVM MMU is not spared.
> > > 
> > > I guess after enabling LAM, these entries will never match. But
> > > if LAM gets disabled again, they will become active. Hm?
> > > 
> > > Maybe just flush?
> > 
> > Now we have 2 options:
> > 1. As you suggested, just flush.
> > 2. More precisely identify the case Guest.LA57 && (CR3 bits [62:61]
> > switching 00 --> 10), and flush only then. (The LAM_U57 bit takes
> > precedence over LAM_U48, per the spec.)
> > 
> > Considering that a CR3 change is a relatively hot path, and a TLB
> > flush is heavy, I lean towards option 2. Your opinion?
> 
> 11 in bits [62:61] is also considered LAM_U57, so your option 2 is
> broken.

I don't get you here. Yes, 11 is also LAM_U57, but LAM_U57 is the safe
case (it masks only 6 bits).
Anyway, let's flush, since the hot-path argument below doesn't hold.
> 
> And I don't buy the argument about the hot path: the case we're
> talking about is enabling/disabling LAM with a constant PGD.

Right.

> It's not a hot path by any means.
> 
> Let's not be fancy. Just flush the TLB.
>
Robert Hoo Nov. 2, 2022, 7:29 a.m. UTC | #5
On Tue, 2022-11-01 at 05:04 +0300, Kirill A. Shutemov wrote:
...
> > > > -	if (cr3 != kvm_read_cr3(vcpu))
> > > > -		kvm_mmu_new_pgd(vcpu, cr3);
> > > > +	old_cr3 = kvm_read_cr3(vcpu);
> > > > +	if (cr3 != old_cr3) {
> > > > +		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
> > > > +			kvm_mmu_new_pgd(vcpu, cr3 &
> > > > ~(X86_CR3_LAM_U48 |
> > > > +					X86_CR3_LAM_U57));
> > > > +		} else {
> > > > +			/* Only LAM conf changes, no tlb flush
> > > > needed
> > > > */
> > > > +			skip_tlb_flush = true;
> > > 
> > > I'm not sure about this.
> > > 
> > > Consider the case when LAM_U48 gets enabled on a 5-level paging
> > > machine. We may have valid TLB entries for addresses above bit 47.
> > > It's a kinda broken case, but it seems valid from an architectural
> > > PoV, no?
> > 
> > You're right, thanks Kirill.
> > 
> > I noticed that in your kernel enabling, because of this overlap
> > between LAM_U48 and LA57, you enabled only LAM_U57 for simplicity at
> > this point. I thought at the time that this trickiness would be
> > contained in the kernel layer, but it now turns out that at least
> > the non-EPT KVM MMU is not spared.
> > > 
> > > I guess after enabling LAM, these entries will never match. But
> > > if LAM gets disabled again, they will become active. Hm?
> > > 
> > > Maybe just flush?
> > 
> > Now we have 2 options:
> > 1. As you suggested, just flush.
> > 2. More precisely identify the case Guest.LA57 && (CR3 bits [62:61]
> > switching 00 --> 10), and flush only then. (The LAM_U57 bit takes
> > precedence over LAM_U48, per the spec.)
> > 
> > Considering that a CR3 change is a relatively hot path, and a TLB
> > flush is heavy, I lean towards option 2. Your opinion?
> 
> 11 in bits [62:61] is also considered LAM_U57, so your option 2 is
> broken.

Hi Kirill,

When I came to cook v2 per your suggestion, i.e. to just flush, I
pondered whether the flush is necessary for every possible flip of the
2 bits (LAM_U48, LAM_U57).
Hold this: LAM_U57 (bit 61) takes precedence over LAM_U48 (bit 62).

(0,0) --> {(0,1), (1,0), (1,1)}
(0,1) --> {(0,0), (1,0), (1,1)}
(1,0) --> {(0,0), (0,1), (1,1)}
(1,1) --> {(0,0), (0,1), (1,0)}

Among all 12 cases, only (0,0) --> (1,0) with 5-level paging on has to
flush the TLB. Am I right? If so, would you still prefer to flush
unconditionally when only 1 of the 12 cases needs it? (Counting the
5-level/4-level variations, 1 of 24.)
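
A sketch of what that single-case predicate could look like, reusing
old_cr3/cr3 from the patch (a hypothetical helper; note the thread
settles on the simpler unconditional flush instead):

	/*
	 * Hypothetical: flush only on the (LAM_U48, LAM_U57)
	 * (0,0) --> (1,0) transition under 5-level paging, i.e. when
	 * LAM_U48 starts governing translations above bit 47.
	 */
	bool lam_needs_flush = kvm_read_cr4_bits(vcpu, X86_CR4_LA57) &&
			       !(old_cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)) &&
			       (cr3 & X86_CR3_LAM_U48) &&
			       !(cr3 & X86_CR3_LAM_U57);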

> 
> And I don't buy the argument about the hot path: the case we're
> talking about is enabling/disabling LAM with a constant PGD. It's not
> a hot path by any means.
> 
> Let's not be fancy. Just flush the TLB.
>
Kirill A. Shutemov Nov. 2, 2022, 9:05 p.m. UTC | #6
On Wed, Nov 02, 2022 at 03:29:10PM +0800, Robert Hoo wrote:
> On Tue, 2022-11-01 at 05:04 +0300, Kirill A. Shutemov wrote:
> ...
> > > > > -	if (cr3 != kvm_read_cr3(vcpu))
> > > > > -		kvm_mmu_new_pgd(vcpu, cr3);
> > > > > +	old_cr3 = kvm_read_cr3(vcpu);
> > > > > +	if (cr3 != old_cr3) {
> > > > > +		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
> > > > > +			kvm_mmu_new_pgd(vcpu, cr3 &
> > > > > ~(X86_CR3_LAM_U48 |
> > > > > +					X86_CR3_LAM_U57));
> > > > > +		} else {
> > > > > +			/* Only LAM conf changes, no tlb flush
> > > > > needed
> > > > > */
> > > > > +			skip_tlb_flush = true;
> > > > 
> > > > I'm not sure about this.
> > > > 
> > > > Consider the case when LAM_U48 gets enabled on a 5-level paging
> > > > machine. We may have valid TLB entries for addresses above bit
> > > > 47. It's a kinda broken case, but it seems valid from an
> > > > architectural PoV, no?
> > > 
> > > You're right, thanks Kirill.
> > > 
> > > I noticed that in your kernel enabling, because of this overlap
> > > between LAM_U48 and LA57, you enabled only LAM_U57 for simplicity
> > > at this point. I thought at the time that this trickiness would be
> > > contained in the kernel layer, but it now turns out that at least
> > > the non-EPT KVM MMU is not spared.
> > > > 
> > > > I guess after enabling LAM, these entries will never match. But
> > > > if LAM gets disabled again, they will become active. Hm?
> > > > 
> > > > Maybe just flush?
> > > 
> > > Now we have 2 options:
> > > 1. As you suggested, just flush.
> > > 2. More precisely identify the case Guest.LA57 && (CR3 bits
> > > [62:61] switching 00 --> 10), and flush only then. (The LAM_U57
> > > bit takes precedence over LAM_U48, per the spec.)
> > > 
> > > Considering that a CR3 change is a relatively hot path, and a TLB
> > > flush is heavy, I lean towards option 2. Your opinion?
> > 
> > 11 in bits [62:61] is also considered LAM_U57, so your option 2 is
> > broken.
> 
> Hi Kirill,
> 
> When I came to cook v2 per your suggestion, i.e. to just flush, I
> pondered whether the flush is necessary for every possible flip of the
> 2 bits (LAM_U48, LAM_U57).
> Hold this: LAM_U57 (bit 61) takes precedence over LAM_U48 (bit 62).
> 
> (0,0) --> {(0,1), (1,0), (1,1)}
> (0,1) --> {(0,0), (1,0), (1,1)}
> (1,0) --> {(0,0), (0,1), (1,1)}
> (1,1) --> {(0,0), (0,1), (1,0)}
> 
> Among all 12 cases, only (0,0) --> (1,0) with 5-level paging on has to
> flush the TLB. Am I right? If so, would you still prefer to flush
> unconditionally when only 1 of the 12 cases needs it? (Counting the
> 5-level/4-level variations, 1 of 24.)

I would keep it simple. We can always add the optimization later if
there's a workload that actually benefits from it. But I cannot imagine
a situation where enabling LAM is a hot path.
Robert Hoo Nov. 3, 2022, 1:04 a.m. UTC | #7
On Thu, 2022-11-03 at 00:05 +0300, Kirill A. Shutemov wrote:
> On Wed, Nov 02, 2022 at 03:29:10PM +0800, Robert Hoo wrote:
> > On Tue, 2022-11-01 at 05:04 +0300, Kirill A. Shutemov wrote:
> > ...
> > > > > > -	if (cr3 != kvm_read_cr3(vcpu))
> > > > > > -		kvm_mmu_new_pgd(vcpu, cr3);
> > > > > > +	old_cr3 = kvm_read_cr3(vcpu);
> > > > > > +	if (cr3 != old_cr3) {
> > > > > > +		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
> > > > > > +			kvm_mmu_new_pgd(vcpu, cr3 &
> > > > > > ~(X86_CR3_LAM_U48 |
> > > > > > +					X86_CR3_LAM_U57));
> > > > > > +		} else {
> > > > > > +			/* Only LAM conf changes, no tlb flush
> > > > > > needed
> > > > > > */
> > > > > > +			skip_tlb_flush = true;
> > > > > 
> > > > > I'm not sure about this.
> > > > > 
> > > > > Consider the case when LAM_U48 gets enabled on a 5-level
> > > > > paging machine. We may have valid TLB entries for addresses
> > > > > above bit 47. It's a kinda broken case, but it seems valid
> > > > > from an architectural PoV, no?
> > > > 
> > > > You're right, thanks Kirill.
> > > > 
> > > > I noticed that in your kernel enabling, because of this overlap
> > > > between LAM_U48 and LA57, you enabled only LAM_U57 for
> > > > simplicity at this point. I thought at the time that this
> > > > trickiness would be contained in the kernel layer, but it now
> > > > turns out that at least the non-EPT KVM MMU is not spared.
> > > > > 
> > > > > I guess after enabling LAM, these entries will never match.
> > > > > But if LAM gets disabled again, they will become active. Hm?
> > > > > 
> > > > > Maybe just flush?
> > > > 
> > > > Now we have 2 options:
> > > > 1. As you suggested, just flush.
> > > > 2. More precisely identify the case Guest.LA57 && (CR3 bits
> > > > [62:61] switching 00 --> 10), and flush only then. (The LAM_U57
> > > > bit takes precedence over LAM_U48, per the spec.)
> > > > 
> > > > Considering that a CR3 change is a relatively hot path, and a
> > > > TLB flush is heavy, I lean towards option 2. Your opinion?
> > > 
> > > 11 in bits [62:61] is also considered LAM_U57, so your option 2
> > > is broken.
> > 
> > Hi Kirill,
> > 
> > When I came to cook v2 per your suggestion, i.e. to just flush, I
> > pondered whether the flush is necessary for every possible flip of
> > the 2 bits (LAM_U48, LAM_U57).
> > Hold this: LAM_U57 (bit 61) takes precedence over LAM_U48 (bit 62).
> > 
> > (0,0) --> {(0,1), (1,0), (1,1)}
> > (0,1) --> {(0,0), (1,0), (1,1)}
> > (1,0) --> {(0,0), (0,1), (1,1)}
> > (1,1) --> {(0,0), (0,1), (1,0)}
> > 
> > Among all 12 cases, only (0,0) --> (1,0) with 5-level paging on has
> > to flush the TLB. Am I right? If so, would you still prefer to flush
> > unconditionally when only 1 of the 12 cases needs it? (Counting the
> > 5-level/4-level variations, 1 of 24.)
> 
> I would keep it simple. We can always add the optimization later if
> there's a workload that actually benefits from it. But I cannot
> imagine a situation where enabling LAM is a hot path.
> 
OK, I'm open to this.

I also notice that skip_tlb_flush is set when pcid_enabled && (CR3
& X86_CR3_PCID_NOFLUSH). Under this condition, do you think the (0,0)
--> (1,0) case needs to flip it back to false?

int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	bool skip_tlb_flush = false;
	unsigned long pcid = 0, old_cr3;
#ifdef CONFIG_X86_64
	bool pcid_enabled = !!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);

	if (pcid_enabled) {
		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
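
A condensed sketch of the interaction in question: the LAM-only branch
would have to override the skip_tlb_flush value derived from
CR3.PCID_NOFLUSH at the top of the function (illustrative only, based
on the patch's else branch):

	if (pcid_enabled)
		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;	/* may be true */
	...
	} else {
		/* LAM-only change: flush even if PCID_NOFLUSH was set. */
		skip_tlb_flush = false;
		kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
	}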
Kirill A. Shutemov Nov. 3, 2022, 2:40 a.m. UTC | #8
On Thu, Nov 03, 2022 at 09:04:23AM +0800, Robert Hoo wrote:
> I also notice that skip_tlb_flush is set when pcid_enabled && (CR3
> & X86_CR3_PCID_NOFLUSH). Under this condition, do you think the (0,0)
> --> (1,0) case needs to flip it back to false?

Yes, I think we should. We know it is a safe choice.

It would also be nice to get the LAM documentation updated on the
expected behaviour. It is not clear from the current documentation
whether enabling LAM causes a flush. We can only guess that it should,
at least for some scenarios.

Phantom TLB entries that resurface after LAM gets disabled would be fun
to debug.
Robert Hoo Nov. 3, 2022, 8:07 a.m. UTC | #9
On Thu, 2022-11-03 at 05:40 +0300, Kirill A. Shutemov wrote:
> On Thu, Nov 03, 2022 at 09:04:23AM +0800, Robert Hoo wrote:
> > I also notice that skip_tlb_flush is set when pcid_enabled && (CR3
> > & X86_CR3_PCID_NOFLUSH). Under this condition, do you think the
> > (0,0) --> (1,0) case needs to flip it back to false?
> 
> Yes, I think we should. We know it is a safe choice.

If so, then checking for the (0,0) --> (1,0) case in the else{} branch
is inevitable, isn't it?

Or we could remove the skip_tlb_flush logic from this function entirely,
but that would break existing logic. You won't like it.
> 
> It would also be nice to get the LAM documentation updated on the
> expected behaviour. It is not clear from the current documentation
> whether enabling LAM causes a flush. We can only guess that it should,
> at least for some scenarios.
> 
> Phantom TLB entries that resurface after LAM gets disabled would be
> fun to debug.
> 
Agreed, and I echo your conservatism.

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e9b465bff8d3..fb779f88ae88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1228,9 +1228,9 @@  static bool kvm_is_valid_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	bool skip_tlb_flush = false;
-	unsigned long pcid = 0;
+	unsigned long pcid = 0, old_cr3;
 #ifdef CONFIG_X86_64
-	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+	bool pcid_enabled = !!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
 
 	if (pcid_enabled) {
 		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
@@ -1243,6 +1243,10 @@  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
 		goto handle_tlb_flush;
 
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_LAM) &&
+	    (cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)))
+		return	1;
+
 	/*
 	 * Do not condition the GPA check on long mode, this helper is used to
 	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
@@ -1254,8 +1258,22 @@  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
 		return 1;
 
-	if (cr3 != kvm_read_cr3(vcpu))
-		kvm_mmu_new_pgd(vcpu, cr3);
+	old_cr3 = kvm_read_cr3(vcpu);
+	if (cr3 != old_cr3) {
+		if ((cr3 ^ old_cr3) & CR3_ADDR_MASK) {
+			kvm_mmu_new_pgd(vcpu, cr3 & ~(X86_CR3_LAM_U48 |
+					X86_CR3_LAM_U57));
+		} else {
+			/* Only LAM conf changes, no tlb flush needed */
+			skip_tlb_flush = true;
+			/*
+			 * Though the effective address is unchanged, make
+			 * the request so that the LAM bits take effect
+			 * when entering the guest.
+			 */
+			kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+		}
+	}
 
 	vcpu->arch.cr3 = cr3;
 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);