Message ID | 20240228024147.41573-3-seanjc@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: x86/mmu: Page fault and MMIO cleanups | expand |
On Wed, Feb 28, 2024 at 3:46 AM Sean Christopherson <seanjc@google.com> wrote: > > Open code the bit number directly in the PFERR_* masks and drop the > intermediate PFERR_*_BIT defines, as having to bounce through two macros > just to see which flag corresponds to which bit is quite annoying, as is > having to define two macros just to add recognition of a new flag. > > Use ilog2() to derive the bit in permission_fault(), the one function that > actually needs the bit number (it does clever shifting to manipulate flags > in order to avoid conditional branches). > > No functional change intended. > > Signed-off-by: Sean Christopherson <seanjc@google.com> > --- > arch/x86/include/asm/kvm_host.h | 32 ++++++++++---------------------- > arch/x86/kvm/mmu.h | 4 ++-- > 2 files changed, 12 insertions(+), 24 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index aaf5a25ea7ed..88cc523bafa8 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -254,28 +254,16 @@ enum x86_intercept_stage; > KVM_GUESTDBG_INJECT_DB | \ > KVM_GUESTDBG_BLOCKIRQ) > > - > -#define PFERR_PRESENT_BIT 0 > -#define PFERR_WRITE_BIT 1 > -#define PFERR_USER_BIT 2 > -#define PFERR_RSVD_BIT 3 > -#define PFERR_FETCH_BIT 4 > -#define PFERR_PK_BIT 5 > -#define PFERR_SGX_BIT 15 > -#define PFERR_GUEST_FINAL_BIT 32 > -#define PFERR_GUEST_PAGE_BIT 33 > -#define PFERR_IMPLICIT_ACCESS_BIT 48 > - > -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) > -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) > -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) > -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) > -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) > -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) > -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) > -#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) > -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) > -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) > +#define 
PFERR_PRESENT_MASK BIT(0) > +#define PFERR_WRITE_MASK BIT(1) > +#define PFERR_USER_MASK BIT(2) > +#define PFERR_RSVD_MASK BIT(3) > +#define PFERR_FETCH_MASK BIT(4) > +#define PFERR_PK_MASK BIT(5) > +#define PFERR_SGX_MASK BIT(15) > +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) > +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) > +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) > > #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ > PFERR_WRITE_MASK | \ > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > index 60f21bb4c27b..e8b620a85627 100644 > --- a/arch/x86/kvm/mmu.h > +++ b/arch/x86/kvm/mmu.h > @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > */ > u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; > bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; > - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; > + int index = (pfec + (not_smap << ilog2(PFERR_RSVD_MASK))) >> 1; Just use "(pfec + (not_smap ? PFERR_RSVD_MASK : 0)) >> 1". Likewise below, "pte_access & PT_USER_MASK ? PFERR_RSVD_MASK : 0"/ No need to even check what the compiler produces, it will be either exactly the same code or a bunch of cmov instructions. Paolo > u32 errcode = PFERR_PRESENT_MASK; > bool fault; > > @@ -235,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > > /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ > offset = (pfec & ~1) + > - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); > + ((pte_access & PT_USER_MASK) << (ilog2(PFERR_RSVD_MASK) - PT_USER_SHIFT)); > > pkru_bits &= mmu->pkru_mask >> offset; > errcode |= -pkru_bits & PFERR_PK_MASK; > -- > 2.44.0.278.ge034bb2e1d-goog >
I remember I read somewhere suggesting not to change the headers in selftest. Just double-check if there is requirement to edit tools/testing/selftests/kvm/include/x86_64/processor.h. Dongli Zhang On 2/27/24 18:41, Sean Christopherson wrote: > Open code the bit number directly in the PFERR_* masks and drop the > intermediate PFERR_*_BIT defines, as having to bounce through two macros > just to see which flag corresponds to which bit is quite annoying, as is > having to define two macros just to add recognition of a new flag. > > Use ilog2() to derive the bit in permission_fault(), the one function that > actually needs the bit number (it does clever shifting to manipulate flags > in order to avoid conditional branches). > > No functional change intended. > > Signed-off-by: Sean Christopherson <seanjc@google.com> > --- > arch/x86/include/asm/kvm_host.h | 32 ++++++++++---------------------- > arch/x86/kvm/mmu.h | 4 ++-- > 2 files changed, 12 insertions(+), 24 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index aaf5a25ea7ed..88cc523bafa8 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -254,28 +254,16 @@ enum x86_intercept_stage; > KVM_GUESTDBG_INJECT_DB | \ > KVM_GUESTDBG_BLOCKIRQ) > > - > -#define PFERR_PRESENT_BIT 0 > -#define PFERR_WRITE_BIT 1 > -#define PFERR_USER_BIT 2 > -#define PFERR_RSVD_BIT 3 > -#define PFERR_FETCH_BIT 4 > -#define PFERR_PK_BIT 5 > -#define PFERR_SGX_BIT 15 > -#define PFERR_GUEST_FINAL_BIT 32 > -#define PFERR_GUEST_PAGE_BIT 33 > -#define PFERR_IMPLICIT_ACCESS_BIT 48 > - > -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) > -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) > -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) > -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) > -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) > -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) > -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) > -#define PFERR_GUEST_FINAL_MASK 
BIT_ULL(PFERR_GUEST_FINAL_BIT) > -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) > -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) > +#define PFERR_PRESENT_MASK BIT(0) > +#define PFERR_WRITE_MASK BIT(1) > +#define PFERR_USER_MASK BIT(2) > +#define PFERR_RSVD_MASK BIT(3) > +#define PFERR_FETCH_MASK BIT(4) > +#define PFERR_PK_MASK BIT(5) > +#define PFERR_SGX_MASK BIT(15) > +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) > +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) > +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) > > #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ > PFERR_WRITE_MASK | \ > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > index 60f21bb4c27b..e8b620a85627 100644 > --- a/arch/x86/kvm/mmu.h > +++ b/arch/x86/kvm/mmu.h > @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > */ > u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; > bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; > - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; > + int index = (pfec + (not_smap << ilog2(PFERR_RSVD_MASK))) >> 1; > u32 errcode = PFERR_PRESENT_MASK; > bool fault; > > @@ -235,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > > /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ > offset = (pfec & ~1) + > - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); > + ((pte_access & PT_USER_MASK) << (ilog2(PFERR_RSVD_MASK) - PT_USER_SHIFT)); > > pkru_bits &= mmu->pkru_mask >> offset; > errcode |= -pkru_bits & PFERR_PK_MASK;
On Thu, Feb 29, 2024, Dongli Zhang wrote: > I remember I read somewhere suggesting not to change the headers in selftest. The "suggestion" is to not update the headers that perf tooling copies verbatim from the kernel, e.g. tools/include/uapi/linux/kvm.h. The duplicates in tools/ aren't used by KVM selftests, it's purely perf that needs identical copies from the kernel tree, so I strongly prefer to leave it to the perf folks to deal with synchronizing the headers as needed. > Just double-check if there is requirement to edit > tools/testing/selftests/kvm/include/x86_64/processor.h. This header is a KVM selftests specific header that is independent from the kernel headers. It does have _some_ copy+paste, mostly for architecturally defined bits and bobs, but it's not a straight copy of any kernel header. That said, yes, I think we should also clean up x86_64/processor.h. That can be done in a one-off standalone patch though.
On Thu, Feb 29, 2024, Paolo Bonzini wrote: > On Wed, Feb 28, 2024 at 3:46 AM Sean Christopherson <seanjc@google.com> wrote: > > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > > index 60f21bb4c27b..e8b620a85627 100644 > > --- a/arch/x86/kvm/mmu.h > > +++ b/arch/x86/kvm/mmu.h > > @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > > */ > > u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; > > bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; > > - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; > > + int index = (pfec + (not_smap << ilog2(PFERR_RSVD_MASK))) >> 1; > > Just use "(pfec + (not_smap ? PFERR_RSVD_MASK : 0)) >> 1". > > Likewise below, "pte_access & PT_USER_MASK ? PFERR_RSVD_MASK : 0". > > No need to even check what the compiler produces, it will be either > exactly the same code or a bunch of cmov instructions. I couldn't resist :-) The second one generates identical code, but for this one: int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; gcc generates almost bizarrely different code in the call from vcpu_mmio_gva_to_gpa(). clang is clever enough to realize "pfec" can only contain USER_MASK and/or WRITE_MASK, and so does a ton of dead code elimination and other optimizations. But for some reason, gcc doesn't appear to realize that, and generates a MOVSX when computing "index", i.e. sign-extends the result of the ADD (at least, I think that's what it's doing). There's no actual bug today, and the vcpu_mmio_gva_to_gpa() path is super safe since KVM fully controls the error code. But the call from FNAME(walk_addr_generic) uses a _much_ more dynamic error code. If an error code with unexpected bits set managed to get into permission_fault(), I'm pretty sure we'd end up with out-of-bounds accesses.
KVM sanity checks that PK and RSVD aren't set, WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); but KVM unnecessarily uses an ADD instead of OR, here int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; and here /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ offset = (pfec & ~1) + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); i.e. if the WARN fired, KVM would generate completely unexpected values due to adding two RSVD bit flags. And if _really_ unexpected flags make their way into permission_fault(), e.g. the upcoming RMP flag (bit 31) or Intel's SGX flag (bit 15), then the use of index fault = (mmu->permissions[index] >> pte_access) & 1; could generate a read waaay outside of the array. It can't/shouldn't happen in practice since KVM shouldn't be trying to emulate RMP violations or faults in SGX enclaves, but it's unnecessarily dangerous. Long story short, I think we should get to the below (I'll post a separate series, assuming I'm not missing something). unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); unsigned int pfec = access & (PFERR_PRESENT_MASK | PFERR_WRITE_MASK | PFERR_USER_MASK | PFERR_FETCH_MASK); /* * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1. * For implicit supervisor accesses, SMAP cannot be overridden. * * SMAP works on supervisor accesses only, and not_smap can * be set or not set when user access with neither has any bearing * on the result. * * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit; * this bit will always be zero in pfec, but it will be one in index * if SMAP checks are being disabled. */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; int index = (pfec | (not_smap ? 
PFERR_RSVD_MASK : 0)) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; kvm_mmu_refresh_passthrough_bits(vcpu, mmu); fault = (mmu->permissions[index] >> pte_access) & 1; /* * Sanity check that no bits are set in the legacy #PF error code * (bits 31:0) other than the supported permission bits (see above). */ WARN_ON_ONCE(pfec != (unsigned int)access); if (unlikely(mmu->pkru_mask)) { u32 pkru_bits, offset; /* * PKRU defines 32 bits, there are 16 domains and 2 * attribute bits per domain in pkru. pte_pkey is the * index of the protection domain, so pte_pkey * 2 is * the index of the first bit for the domain. */ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ offset = (pfec & ~1) | (pte_access & PT_USER_MASK ? PFERR_RSVD_MASK : 0); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; fault |= (pkru_bits != 0); } return -(u32)fault & errcode;
On Thu, Feb 29, 2024 at 7:40 PM Sean Christopherson <seanjc@google.com> wrote: > Long story short, I think we should get to the below (I'll post a separate series, > assuming I'm not missing something). > > unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); > unsigned int pfec = access & (PFERR_PRESENT_MASK | > PFERR_WRITE_MASK | > PFERR_USER_MASK | > PFERR_FETCH_MASK); > > /* > * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1. > * For implicit supervisor accesses, SMAP cannot be overridden. > * > * SMAP works on supervisor accesses only, and not_smap can > * be set or not set when user access with neither has any bearing > * on the result. > * > * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit; > * this bit will always be zero in pfec, but it will be one in index > * if SMAP checks are being disabled. > */ > u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; > bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; > int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; > u32 errcode = PFERR_PRESENT_MASK; > bool fault; Sounds good. The whole series is Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> apart from the small nits that were pointed out here and there. Paolo
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aaf5a25ea7ed..88cc523bafa8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -254,28 +254,16 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) - -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 -#define PFERR_PK_BIT 5 -#define PFERR_SGX_BIT 15 -#define PFERR_GUEST_FINAL_BIT 32 -#define PFERR_GUEST_PAGE_BIT 33 -#define PFERR_IMPLICIT_ACCESS_BIT 48 - -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) +#define PFERR_PRESENT_MASK BIT(0) +#define PFERR_WRITE_MASK BIT(1) +#define PFERR_USER_MASK BIT(2) +#define PFERR_RSVD_MASK BIT(3) +#define PFERR_FETCH_MASK BIT(4) +#define PFERR_PK_MASK BIT(5) +#define PFERR_SGX_MASK BIT(15) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 60f21bb4c27b..e8b620a85627 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; + int index = (pfec + (not_smap 
<< ilog2(PFERR_RSVD_MASK))) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; @@ -235,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ offset = (pfec & ~1) + - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + ((pte_access & PT_USER_MASK) << (ilog2(PFERR_RSVD_MASK) - PT_USER_SHIFT)); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK;
Open code the bit number directly in the PFERR_* masks and drop the intermediate PFERR_*_BIT defines, as having to bounce through two macros just to see which flag corresponds to which bit is quite annoying, as is having to define two macros just to add recognition of a new flag. Use ilog2() to derive the bit in permission_fault(), the one function that actually needs the bit number (it does clever shifting to manipulate flags in order to avoid conditional branches). No functional change intended. Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/include/asm/kvm_host.h | 32 ++++++++++---------------------- arch/x86/kvm/mmu.h | 4 ++-- 2 files changed, 12 insertions(+), 24 deletions(-)