[v2] kvm: x86: fix stale mmio cache bug

Message ID 1407186620-1999-1-git-send-email-dmatlack@google.com (mailing list archive)
State New, archived

Commit Message

David Matlack Aug. 4, 2014, 9:10 p.m. UTC
The following events can lead to an incorrect KVM_EXIT_MMIO bubbling
up to userspace:

(1) Guest accesses gpa X without a memory slot. The gfn is cached in
struct kvm_vcpu_arch (mmio_gfn). On Intel EPT-enabled hosts, KVM sets
the SPTE write-execute-noread so that future accesses cause
EPT_MISCONFIGs.

(2) Host userspace creates a memory slot via KVM_SET_USER_MEMORY_REGION
covering the page just accessed.

(3) Guest attempts to read or write to gpa X again. On Intel, this
generates an EPT_MISCONFIG. The memory slot generation number that
was incremented in (2) would normally take care of this but we fast
path mmio faults through quickly_check_mmio_pf(), which only checks
the per-vcpu mmio cache. Since we hit the cache, KVM passes a
KVM_EXIT_MMIO up to userspace.
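
For reference, the fast path in question looks roughly like this (a
simplified sketch of the era's mmu.c code, not a verbatim copy); a cache
hit short-circuits the memslot and generation lookup entirely:

static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
	/*
	 * Consult only the per-vcpu mmio cache. If the gva/gpa cached in
	 * step (1) still matches, the fault is treated as mmio without
	 * ever consulting the (since-updated) memslots.
	 */
	if (direct)
		return vcpu_match_mmio_gpa(vcpu, addr);

	return vcpu_match_mmio_gva(vcpu, addr);
}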

This patch fixes the issue by doing the following:
  - Tag the mmio cache with the memslot generation and use it to
    validate mmio cache lookups.
  - Extend vcpu_clear_mmio_info to clear mmio_gfn in addition to
    mmio_gva, since both can be used to fast path mmio faults.
  - In mmu_sync_roots, unconditionally clear the mmio cache since
    even direct_map (e.g. tdp) hosts use it.

Signed-off-by: David Matlack <dmatlack@google.com>
---
Changes in v2:
  - Use memslot generation to invalidate the mmio cache rather than
    actively invalidating the cache.
  - Update patch description with new cache invalidation technique.
  - Pull mmio cache/clear code up out of x86.h and mmu.c and into
    mmu.h.

 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 16 ++--------
 arch/x86/kvm/mmu.h              | 70 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.h              | 36 ---------------------
 4 files changed, 73 insertions(+), 50 deletions(-)

Comments

Wanpeng Li Aug. 5, 2014, 12:31 a.m. UTC | #1
Hi David,
On Mon, Aug 04, 2014 at 02:10:20PM -0700, David Matlack wrote:
>The following events can lead to an incorrect KVM_EXIT_MMIO bubbling
>up to userspace:
>
>(1) Guest accesses gpa X without a memory slot. The gfn is cached in
>struct kvm_vcpu_arch (mmio_gfn). On Intel EPT-enabled hosts, KVM sets
>the SPTE write-execute-noread so that future accesses cause
>EPT_MISCONFIGs.
>
>(2) Host userspace creates a memory slot via KVM_SET_USER_MEMORY_REGION
>covering the page just accessed.
>

One question:

What triggers host userspace to create an mmio memslot? Will it be created
just after the first mmio #PF?

Regards,
Wanpeng Li 

Xiao Guangrong Aug. 5, 2014, 3:36 a.m. UTC | #2
On 08/05/2014 05:10 AM, David Matlack wrote:
> The following events can lead to an incorrect KVM_EXIT_MMIO bubbling
> up to userspace:
> 
> (1) Guest accesses gpa X without a memory slot. The gfn is cached in
> struct kvm_vcpu_arch (mmio_gfn). On Intel EPT-enabled hosts, KVM sets
> the SPTE write-execute-noread so that future accesses cause
> EPT_MISCONFIGs.
> 
> (2) Host userspace creates a memory slot via KVM_SET_USER_MEMORY_REGION
> covering the page just accessed.
> 
> (3) Guest attempts to read or write to gpa X again. On Intel, this
> generates an EPT_MISCONFIG. The memory slot generation number that
> was incremented in (2) would normally take care of this but we fast
> path mmio faults through quickly_check_mmio_pf(), which only checks
> the per-vcpu mmio cache. Since we hit the cache, KVM passes a
> KVM_EXIT_MMIO up to userspace.
> 
> This patch fixes the issue by doing the following:
>   - Tag the mmio cache with the memslot generation and use it to
>     validate mmio cache lookups.
>   - Extend vcpu_clear_mmio_info to clear mmio_gfn in addition to
>     mmio_gva, since both can be used to fast path mmio faults.
>   - In mmu_sync_roots, unconditionally clear the mmio cache since
>     even direct_map (e.g. tdp) hosts use it.

It's not needed.

direct map only uses gpa (and never caches gva) and
vcpu_clear_mmio_info only clears gva.

> +static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
> +					gva_t gva, gfn_t gfn, unsigned access)
> +{
> +	vcpu->arch.mmio_gen = kvm_current_mmio_generation(vcpu->kvm);
> +
> +	/*
> +	 * Ensure that the mmio_gen is set before the rest of the cache entry.
> +	 * Otherwise we might see a new generation number attached to an old
> +	 * cache entry if creating/deleting a memslot races with mmio caching.
> +	 * The inverse case is possible (old generation number with new cache
> +	 * info), but that is safe. The next access will just miss the cache
> +	 * when it should have hit.
> +	 */
> +	smp_wmb();

The memory barrier can't help us, consider this scenario:

CPU 0                                      CPU 1
page-fault
see gpa is not mapped in memslot

                              create new memslot containing gpa from Qemu
                                  update the slots's generation number
cache mmio info

!!! later when vcpu accesses gpa again
it will cause mmio-exit.

The easy way to fix this is to update the slots' generation number
after synchronize_srcu_expedited() when a memslot is being updated; that
ensures other sides can see the new generation number only after the
update has finished.
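
A minimal sketch of what I mean, against a simplified
install_new_memslots() (hypothetical; the real kvm_main.c differs in
detail):

static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
						 struct kvm_memslots *slots)
{
	struct kvm_memslots *old = kvm->memslots;

	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Bump the generation only after every reader is guaranteed to
	 * see the new slots, so the new generation number can never be
	 * paired with the old memslot contents.
	 */
	slots->generation = old->generation + 1;

	return old;
}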


David Matlack Aug. 5, 2014, 6:56 p.m. UTC | #3
On Mon, Aug 4, 2014 at 5:31 PM, Wanpeng Li <wanpeng.li@linux.intel.com> wrote:
> Hi David,
> On Mon, Aug 04, 2014 at 02:10:20PM -0700, David Matlack wrote:
>>The following events can lead to an incorrect KVM_EXIT_MMIO bubbling
>>up to userspace:
>>
>>(1) Guest accesses gpa X without a memory slot. The gfn is cached in
>>struct kvm_vcpu_arch (mmio_gfn). On Intel EPT-enabled hosts, KVM sets
>>the SPTE write-execute-noread so that future accesses cause
>>EPT_MISCONFIGs.
>>
>>(2) Host userspace creates a memory slot via KVM_SET_USER_MEMORY_REGION
>>covering the page just accessed.
>>
>
> One question:
>
> What triggers host userspace to create an mmio memslot? Will it be created
> just after the first mmio #PF?

Devices such as vga can be in modes where their memory behaves
like ram, and backing the memory with a memslot makes sense. In
other modes, reading and writing vga memory has side-effects, so
mmio makes sense (and the memslot is deleted). Switching between
these modes is a guest-initiated event.
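
For concreteness, userspace flips a region between the two modes with
KVM_SET_USER_MEMORY_REGION, roughly as in this sketch (the slot number
and vga window values are illustrative, and error handling is omitted):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Toggle a guest-physical range between ram-backed and mmio. */
static void set_vga_ram_mode(int vm_fd, void *host_mem, int ram_backed)
{
	struct kvm_userspace_memory_region region = {
		.slot = 1,                               /* illustrative */
		.guest_phys_addr = 0xa0000,              /* legacy vga window */
		.memory_size = ram_backed ? 0x20000 : 0, /* size 0 deletes the slot */
		.userspace_addr = (unsigned long)host_mem,
	};

	/*
	 * With the slot present, guest accesses hit ram directly; with it
	 * deleted, they fault and are reported to userspace as KVM_EXIT_MMIO.
	 */
	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}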
David Matlack Aug. 5, 2014, 10:39 p.m. UTC | #4
On Mon, Aug 4, 2014 at 8:36 PM, Xiao Guangrong
<xiaoguangrong@linux.vnet.ibm.com> wrote:
> On 08/05/2014 05:10 AM, David Matlack wrote:
>>
>> This patch fixes the issue by doing the following:
>>   - Tag the mmio cache with the memslot generation and use it to
>>     validate mmio cache lookups.
>>   - Extend vcpu_clear_mmio_info to clear mmio_gfn in addition to
>>     mmio_gva, since both can be used to fast path mmio faults.
>>   - In mmu_sync_roots, unconditionally clear the mmio cache since
>>     even direct_map (e.g. tdp) hosts use it.
>
> It's not needed.
>
> direct map only uses gpa (and never caches gva) and
> vcpu_clear_mmio_info only clears gva.

Ok thanks for the clarification.

>> +static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
>> +                                     gva_t gva, gfn_t gfn, unsigned access)
>> +{
>> +     vcpu->arch.mmio_gen = kvm_current_mmio_generation(vcpu->kvm);
>> +
>> +     /*
>> +      * Ensure that the mmio_gen is set before the rest of the cache entry.
>> +      * Otherwise we might see a new generation number attached to an old
>> +      * cache entry if creating/deleting a memslot races with mmio caching.
>> +      * The inverse case is possible (old generation number with new cache
>> +      * info), but that is safe. The next access will just miss the cache
>> +      * when it should have hit.
>> +      */
>> +     smp_wmb();
>
> The memory barrier can't help us, consider this scenario:
>
> CPU 0                                      CPU 1
> page-fault
> see gpa is not mapped in memslot
>
>                               create new memslot containing gpa from Qemu
>                                   update the slots's generation number
> cache mmio info
>
> !!! later when vcpu accesses gpa again
> it will cause mmio-exit.

Ah! Thanks for catching my mistake.

> The easy way to fix this is to update the slots' generation number
> after synchronize_srcu_expedited() when a memslot is being updated; that
> ensures other sides can see the new generation number only after the
> update has finished.

It would be possible for a vcpu to see an inconsistent kvm_memslots struct
(new set of slots with an old generation number). Is that not an issue?

We could just use the generation number that is stored in the
spte. The only downside (that I can see) is that handle_abnormal_pfn()
calls vcpu_cache_mmio_info() but does not have access to the spte.
So presumably we'd have to do a page table walk.
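
For reference, the generation is packed into spare spte bits via the
MMIO_SPTE_GEN_* constants, roughly like this (simplified from the mmu.c
helpers of this era):

static u64 generation_mmio_spte_mask(unsigned int gen)
{
	u64 mask;

	/* The low MMIO_GEN_LOW_SHIFT bits of gen sit at spte bit 3, the
	 * rest at bit 52, straddling the physical-address bits between. */
	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
	return mask;
}

static unsigned int get_mmio_spte_generation(u64 spte)
{
	unsigned int gen;

	/* Strip the mmio marker bits, then reassemble the two halves. */
	spte &= ~shadow_mmio_mask;
	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
	gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
	return gen;
}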
Xiao Guangrong Aug. 6, 2014, 3:26 a.m. UTC | #5
On 08/06/2014 06:39 AM, David Matlack wrote:
> On Mon, Aug 4, 2014 at 8:36 PM, Xiao Guangrong
> <xiaoguangrong@linux.vnet.ibm.com> wrote:
>> On 08/05/2014 05:10 AM, David Matlack wrote:
>>>
>>> This patch fixes the issue by doing the following:
>>>   - Tag the mmio cache with the memslot generation and use it to
>>>     validate mmio cache lookups.
>>>   - Extend vcpu_clear_mmio_info to clear mmio_gfn in addition to
>>>     mmio_gva, since both can be used to fast path mmio faults.
>>>   - In mmu_sync_roots, unconditionally clear the mmio cache since
>>>     even direct_map (e.g. tdp) hosts use it.
>>
>> It's not needed.
>>
>> direct map only uses gpa (and never caches gva) and
>> vcpu_clear_mmio_info only clears gva.
> 
> Ok thanks for the clarification.
> 
>>> +static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
>>> +                                     gva_t gva, gfn_t gfn, unsigned access)
>>> +{
>>> +     vcpu->arch.mmio_gen = kvm_current_mmio_generation(vcpu->kvm);
>>> +
>>> +     /*
>>> +      * Ensure that the mmio_gen is set before the rest of the cache entry.
>>> +      * Otherwise we might see a new generation number attached to an old
>>> +      * cache entry if creating/deleting a memslot races with mmio caching.
>>> +      * The inverse case is possible (old generation number with new cache
>>> +      * info), but that is safe. The next access will just miss the cache
>>> +      * when it should have hit.
>>> +      */
>>> +     smp_wmb();
>>
>> The memory barrier can't help us, consider this scenario:
>>
>> CPU 0                                      CPU 1
>> page-fault
>> see gpa is not mapped in memslot
>>
>>                               create new memslot containing gpa from Qemu
>>                                   update the slots's generation number
>> cache mmio info
>>
>> !!! later when vcpu accesses gpa again
>> it will cause mmio-exit.
> 
> Ah! Thanks for catching my mistake.
> 
>> The easy way to fix this is to update the slots' generation number
>> after synchronize_srcu_expedited() when a memslot is being updated; that
>> ensures other sides can see the new generation number only after the
>> update has finished.
> 
> It would be possible for a vcpu to see an inconsistent kvm_memslots struct
> (new set of slots with an old generation number). Is that not an issue?

In this case, the generation-number check will fail and we will go down the
slow path to handle the mmio access - that's very rare, so I think it's ok.

> 
> We could just use the generation number that is stored in the
> spte. The only downside (that I can see) is that handle_abnormal_pfn()
> calls vcpu_cache_mmio_info() but does not have access to the spte.
> So presumably we'd have to do a page table walk.

The issue is not only in vcpu_cache_mmio_info() but also in
mark_mmio_spte(), where we may cache a new generation number together with
old memslot info.
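
For example, mark_mmio_spte() of this era looks roughly like this
(simplified sketch); the generation it fetches can already belong to
newer memslots than the ones used to decide that the gfn is mmio:

static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
			   unsigned access)
{
	unsigned int gen = kvm_current_mmio_generation(kvm);
	u64 mask = generation_mmio_spte_mask(gen);

	access &= ACC_WRITE_MASK | ACC_USER_MASK;
	mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;

	/* Racy in the same way: gen vs. the slots used by the caller. */
	trace_mark_mmio_spte(sptep, gfn, access, gen);
	mmu_spte_set(sptep, mask);
}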

David Matlack Aug. 7, 2014, 4:20 a.m. UTC | #6
On Tue, Aug 5, 2014 at 8:26 PM, Xiao Guangrong
<xiaoguangrong@linux.vnet.ibm.com> wrote:
> On 08/06/2014 06:39 AM, David Matlack wrote:
>> On Mon, Aug 4, 2014 at 8:36 PM, Xiao Guangrong
>> <xiaoguangrong@linux.vnet.ibm.com> wrote:
>>> The memory barrier can't help us, consider this scenario:
>>>
>>> CPU 0                                      CPU 1
>>> page-fault
>>> see gpa is not mapped in memslot
>>>
>>>                               create new memslot containing gpa from Qemu
>>>                                   update the slots's generation number
>>> cache mmio info
>>>
>>> !!! later when vcpu accesses gpa again
>>> it will cause mmio-exit.
>>
>> Ah! Thanks for catching my mistake.
>>
>>> The easy way to fix this is to update the slots' generation number
>>> after synchronize_srcu_expedited() when a memslot is being updated; that
>>> ensures other sides can see the new generation number only after the
>>> update has finished.
>>
>> It would be possible for a vcpu to see an inconsistent kvm_memslots struct
>> (new set of slots with an old generation number). Is that not an issue?
>
> In this case, the generation-number check will fail and we will go down the
> slow path to handle the mmio access - that's very rare, so I think it's ok.
>
>>
>> We could just use the generation number that is stored in the
>> spte. The only downside (that I can see) is that handle_abnormal_pfn()
>> calls vcpu_cache_mmio_info() but does not have access to the spte.
>> So presumably we'd have to do a page table walk.
>
> The issue is not only in vcpu_cache_mmio_info() but also in
> mark_mmio_spte(), where we may cache a new generation number together with
> old memslot info.

This is now a separate bug from the one I originally reported :). For now, I
will resend a 3rd version of my patch with the fixes you suggested (the memory
barrier is useless, and there is no need to change mmu_sync_roots).

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 49205d0..f518d14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -479,6 +479,7 @@  struct kvm_vcpu_arch {
 	u64 mmio_gva;
 	unsigned access;
 	gfn_t mmio_gfn;
+	unsigned int mmio_gen;
 
 	struct kvm_pmu pmu;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9314678..43f1c18 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -206,11 +206,8 @@  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 #define MMIO_SPTE_GEN_LOW_SHIFT		3
 #define MMIO_SPTE_GEN_HIGH_SHIFT	52
 
-#define MMIO_GEN_SHIFT			19
 #define MMIO_GEN_LOW_SHIFT		9
 #define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 1)
-#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
-#define MMIO_MAX_GEN			((1 << MMIO_GEN_SHIFT) - 1)
 
 static u64 generation_mmio_spte_mask(unsigned int gen)
 {
@@ -234,16 +231,6 @@  static unsigned int get_mmio_spte_generation(u64 spte)
 	return gen;
 }
 
-static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
-{
-	/*
-	 * Init kvm generation close to MMIO_MAX_GEN to easily test the
-	 * code of handling generation number wrap-around.
-	 */
-	return (kvm_memslots(kvm)->generation +
-		      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
-}
-
 static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
 			   unsigned access)
 {
@@ -3157,13 +3144,14 @@  static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	int i;
 	struct kvm_mmu_page *sp;
 
+	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
 	if (vcpu->arch.mmu.direct_map)
 		return;
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 
-	vcpu_clear_mmio_info(vcpu, ~0ul);
 	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b982112..058651a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -82,6 +82,76 @@  void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
 void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 		bool ept);
 
+#define MMIO_GEN_SHIFT                 19
+#define MMIO_GEN_MASK                  ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN                   ((1 << MMIO_GEN_SHIFT) - 1)
+static inline unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+	/*
+	 * Init kvm generation close to MMIO_MAX_GEN to easily test the
+	 * code of handling generation number wrap-around.
+	 */
+	return (kvm_memslots(kvm)->generation + MMIO_MAX_GEN - 150) &
+	       MMIO_GEN_MASK;
+}
+
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+					gva_t gva, gfn_t gfn, unsigned access)
+{
+	vcpu->arch.mmio_gen = kvm_current_mmio_generation(vcpu->kvm);
+
+	/*
+	 * Ensure that the mmio_gen is set before the rest of the cache entry.
+	 * Otherwise we might see a new generation number attached to an old
+	 * cache entry if creating/deleting a memslot races with mmio caching.
+	 * The inverse case is possible (old generation number with new cache
+	 * info), but that is safe. The next access will just miss the cache
+	 * when it should have hit.
+	 */
+	smp_wmb();
+
+	vcpu->arch.mmio_gva = gva & PAGE_MASK;
+	vcpu->arch.access = access;
+	vcpu->arch.mmio_gfn = gfn;
+}
+
+/*
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY,
+ * unconditionally clear the mmio cache.
+ */
+#define MMIO_GVA_ANY ~((gva_t) 0)
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+		return;
+
+	vcpu->arch.mmio_gva = 0;
+	vcpu->arch.mmio_gfn = 0;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mmio_gen == kvm_current_mmio_generation(vcpu->kvm);
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+	u64 mmio_gva = vcpu->arch.mmio_gva;
+
+	return vcpu_match_mmio_gen(vcpu) &&
+	       mmio_gva &&
+	       mmio_gva == (gva & PAGE_MASK);
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	gfn_t mmio_gfn = vcpu->arch.mmio_gfn;
+
+	return vcpu_match_mmio_gen(vcpu) &&
+	       mmio_gfn &&
+	       mmio_gfn == (gpa >> PAGE_SHIFT);
+}
+
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
 	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8c97bac..c686c91 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -72,42 +72,6 @@  static inline u32 bit(int bitno)
 	return 1 << (bitno & 31);
 }
 
-static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
-					gva_t gva, gfn_t gfn, unsigned access)
-{
-	vcpu->arch.mmio_gva = gva & PAGE_MASK;
-	vcpu->arch.access = access;
-	vcpu->arch.mmio_gfn = gfn;
-}
-
-/*
- * Clear the mmio cache info for the given gva,
- * specially, if gva is ~0ul, we clear all mmio cache info.
- */
-static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
-		return;
-
-	vcpu->arch.mmio_gva = 0;
-}
-
-static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
-{
-	if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
-		return true;
-
-	return false;
-}
-
-static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-	if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
-		return true;
-
-	return false;
-}
-
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);