Message ID: 20221007232818.459650-8-oliver.upton@linux.dev
State:      New, archived
Series:     KVM: arm64: Parallel stage-2 fault handling
On Fri, Oct 07, 2022, Oliver Upton wrote:
> Use an opaque type for pteps and require visitors explicitly dereference
> the pointer before using. Protecting page table memory with RCU requires
> that KVM dereferences RCU-annotated pointers before using. However, RCU
> is not available for use in the nVHE hypervisor and the opaque type can
> be conditionally annotated with RCU for the stage-2 MMU.
> 
> Call the type a 'pteref' to avoid a naming collision with raw pteps. No
> functional change intended.
> 
> Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h | 9 ++++++++-
>  arch/arm64/kvm/hyp/pgtable.c | 23 ++++++++++++-----------
>  2 files changed, 20 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index c33edcf36b5b..beb89eac155c 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -25,6 +25,13 @@ static inline u64 kvm_get_parange(u64 mmfr0)
>  
>  typedef u64 kvm_pte_t;
>  
> +typedef kvm_pte_t *kvm_pteref_t;
> +
> +static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared)
> +{
> +	return pteref;

Returning the pointer is unsafe (when it becomes RCU-protected). The full
dereference of the data needs to occur under RCU protection, not just the
retrieval of the pointer. E.g. this (straw man) would be broken

	bool table = kvm_pte_table(ctx.old, level);

	rcu_read_lock();
	ptep = kvm_dereference_pteref(pteref, flags & KVM_PGTABLE_WALK_SHARED);
	rcu_read_unlock();

	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);

	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		ctx.old = READ_ONCE(*ptep);
		table = kvm_pte_table(ctx.old, level);
	}

as the read of the entry pointed at by ptep could be to a page table that is
freed in an RCU callback.

The naming collision you are trying to avoid is a symptom of this bad pattern,
as there should never be "raw" pteps floating around, at least not in non-pKVM
contexts that utilize RCU.

> +}
> +
>  #define KVM_PTE_VALID BIT(0)
>  
>  #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
> @@ -170,7 +177,7 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
>  struct kvm_pgtable {
>  	u32 ia_bits;
>  	u32 start_level;
> -	kvm_pte_t *pgd;
> +	kvm_pteref_t pgd;
>  	struct kvm_pgtable_mm_ops *mm_ops;
>  
>  	/* Stage-2 only */
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 02c33fccb178..6b6e1ed7ee2f 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -175,13 +175,14 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
>  }
>  
>  static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
> -			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level);
> +			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
>  
>  static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
>  				      struct kvm_pgtable_mm_ops *mm_ops,
> -				      kvm_pte_t *ptep, u32 level)
> +				      kvm_pteref_t pteref, u32 level)
>  {
>  	enum kvm_pgtable_walk_flags flags = data->walker->flags;
> +	kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false);
>  	struct kvm_pgtable_visit_ctx ctx = {
>  		.ptep	= ptep,
>  		.old	= READ_ONCE(*ptep),

This is where you want the protection to kick in, e.g.
	typedef kvm_pte_t __rcu *kvm_ptep_t;

	static inline kvm_pte_t kvm_read_pte(kvm_ptep_t ptep)
	{
		return READ_ONCE(*rcu_dereference(ptep));
	}

	.old = kvm_read_pte(ptep),

In other words, it isn't the pointer itself that's protected, it's the PTE
that the pointer points at that's protected.

rcu_dereference() has no overhead when CONFIG_PROVE_RCU=n, i.e. there's no
reason to "optimize" dereferences.
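To make the suggested pattern concrete, here is a minimal sketch of the visit
step from the straw man above, rewritten so the PTE itself is read through the
accessor. This is illustrative only: kvm_read_pte()/kvm_ptep_t are the helpers
sketched in the reply, the remaining identifiers come from the quoted
__kvm_pgtable_visit(), and it assumes the caller entered the RCU read-side
critical section (or holds the MMU write lock) for the whole walk.

	/* ptep is now the RCU-annotated kvm_ptep_t, never a raw pointer. */
	bool table = kvm_pte_table(ctx.old, level);

	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);

	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		/* Re-read the PTE; the dereference itself happens under RCU. */
		ctx.old = kvm_read_pte(ptep);
		table = kvm_pte_table(ctx.old, level);
	}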
On Wed, Oct 19, 2022 at 11:17:43PM +0000, Sean Christopherson wrote:
> On Fri, Oct 07, 2022, Oliver Upton wrote:

[...]

> > +typedef kvm_pte_t *kvm_pteref_t;
> > +
> > +static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared)
> > +{
> > +	return pteref;
> 
> Returning the pointer is unsafe (when it becomes RCU-protected). The full
> dereference of the data needs to occur under RCU protection, not just the
> retrieval of the pointer. E.g. this (straw man) would be broken
> 
> 	bool table = kvm_pte_table(ctx.old, level);
> 
> 	rcu_read_lock();
> 	ptep = kvm_dereference_pteref(pteref, flags & KVM_PGTABLE_WALK_SHARED);
> 	rcu_read_unlock();
> 
> 	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
> 		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
> 
> 	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
> 		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
> 		ctx.old = READ_ONCE(*ptep);
> 		table = kvm_pte_table(ctx.old, level);
> 	}
> 
> as the read of the entry pointed at by ptep could be to a page table that is
> freed in an RCU callback.
> 
> The naming collision you are trying to avoid is a symptom of this bad pattern,
> as there should never be "raw" pteps floating around, at least not in non-pKVM
> contexts that utilize RCU.

Fair enough, this was mostly from an attempt to avoid churn in the visitor
callbacks by adding more layering for reads/writes through RCU pointers. The
lifetime of the raw pointer is 'safe' as it sits on the stack and we go behind
the rcu lock much further up.

[...]

> > +	kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false);
> >  	struct kvm_pgtable_visit_ctx ctx = {
> >  		.ptep	= ptep,
> >  		.old	= READ_ONCE(*ptep),
> 
> This is where you want the protection to kick in, e.g.
> 
> 	typedef kvm_pte_t __rcu *kvm_ptep_t;
> 
> 	static inline kvm_pte_t kvm_read_pte(kvm_ptep_t ptep)
> 	{
> 		return READ_ONCE(*rcu_dereference(ptep));
> 	}
> 
> 	.old = kvm_read_pte(ptep),
> 
> In other words, it isn't the pointer itself that's protected, it's the PTE
> that the pointer points at that's protected.

Right, but practically speaking it is the boundary at which we assert
that protection.

Anyhow, I'll look at abstracting the actual memory accesses in the
visitors without too much mess.

--
Thanks,
Oliver
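For readers following the "we go behind the rcu lock much further up" remark,
a rough illustration of taking the RCU read lock at the top-level walker entry
point. This is a sketch only: the helper names and the exact field
initialization are assumptions, not code from the posted series, and the nVHE
hypervisor would need no-op definitions since RCU is unavailable there.

	static void kvm_pgtable_walk_begin(void)
	{
		rcu_read_lock();
	}

	static void kvm_pgtable_walk_end(void)
	{
		rcu_read_unlock();
	}

	int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_pgtable_walker *walker)
	{
		struct kvm_pgtable_walk_data walk_data = {
			.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
			.end	= PAGE_ALIGN(addr + size),
			.walker	= walker,
		};
		int r;

		/* Every raw ptep below this point lives inside the RCU read side. */
		kvm_pgtable_walk_begin();
		r = _kvm_pgtable_walk(pgt, &walk_data);
		kvm_pgtable_walk_end();

		return r;
	}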
On Thu, Oct 20, 2022 at 11:32:28AM +0300, Oliver Upton wrote:
> On Wed, Oct 19, 2022 at 11:17:43PM +0000, Sean Christopherson wrote:
> > On Fri, Oct 07, 2022, Oliver Upton wrote:

[...]

> > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > index 02c33fccb178..6b6e1ed7ee2f 100644
> > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > @@ -175,13 +175,14 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
> > >  }
> > >  
> > >  static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
> > > -			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level);
> > > +			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
> > >  
> > >  static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
> > >  				      struct kvm_pgtable_mm_ops *mm_ops,
> > > -				      kvm_pte_t *ptep, u32 level)
> > > +				      kvm_pteref_t pteref, u32 level)
> > >  {
> > >  	enum kvm_pgtable_walk_flags flags = data->walker->flags;
> > > +	kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false);
> > >  	struct kvm_pgtable_visit_ctx ctx = {
> > >  		.ptep	= ptep,
> > >  		.old	= READ_ONCE(*ptep),
> > 
> > This is where you want the protection to kick in, e.g.
> > 
> > 	typedef kvm_pte_t __rcu *kvm_ptep_t;
> > 
> > 	static inline kvm_pte_t kvm_read_pte(kvm_ptep_t ptep)
> > 	{
> > 		return READ_ONCE(*rcu_dereference(ptep));
> > 	}
> > 
> > 	.old = kvm_read_pte(ptep),
> > 
> > In other words, it isn't the pointer itself that's protected, it's the PTE
> > that the pointer points at that's protected.
> 
> Right, but practically speaking it is the boundary at which we assert
> that protection.
> 
> Anyhow, I'll look at abstracting the actual memory accesses in the
> visitors without too much mess.

Took this in a slightly different direction after playing with it for a while.

Abstracting all PTE accesses adds a lot of churn to the series. Adding in an
assertion before invoking a visitor callback (i.e. when the raw pointer is
about to be used) provides a similar degree of assurance that we are indeed
RCU-safe.

--
Thanks,
Oliver
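One way the assertion Oliver describes could look, sketched here for
illustration only (kvm_pgtable_walk_lock_held() is a hypothetical helper name,
not taken from the posted series; the dispatch function mirrors the quoted
kvm_pgtable_visitor_cb()):

	static bool kvm_pgtable_walk_lock_held(const struct kvm_pgtable_walker *walker)
	{
		/*
		 * Shared (parallel) walkers must be inside an RCU read-side
		 * critical section; exclusive walkers are serialized by the
		 * MMU write lock instead.
		 */
		if (walker->flags & KVM_PGTABLE_WALK_SHARED)
			return rcu_read_lock_held();

		return true;
	}

	static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
					  const struct kvm_pgtable_visit_ctx *ctx,
					  enum kvm_pgtable_walk_flags visit)
	{
		struct kvm_pgtable_walker *walker = data->walker;

		/* The raw ptep in @ctx is about to be handed to the callback. */
		WARN_ON_ONCE(!kvm_pgtable_walk_lock_held(walker));

		return walker->cb(ctx, visit);
	}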
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index c33edcf36b5b..beb89eac155c 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -25,6 +25,13 @@ static inline u64 kvm_get_parange(u64 mmfr0)
 
 typedef u64 kvm_pte_t;
 
+typedef kvm_pte_t *kvm_pteref_t;
+
+static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared)
+{
+	return pteref;
+}
+
 #define KVM_PTE_VALID BIT(0)
 
 #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
@@ -170,7 +177,7 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
 struct kvm_pgtable {
 	u32 ia_bits;
 	u32 start_level;
-	kvm_pte_t *pgd;
+	kvm_pteref_t pgd;
 	struct kvm_pgtable_mm_ops *mm_ops;
 
 	/* Stage-2 only */
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 02c33fccb178..6b6e1ed7ee2f 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -175,13 +175,14 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
 }
 
 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
-			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level);
+			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
 
 static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
 				      struct kvm_pgtable_mm_ops *mm_ops,
-				      kvm_pte_t *ptep, u32 level)
+				      kvm_pteref_t pteref, u32 level)
 {
 	enum kvm_pgtable_walk_flags flags = data->walker->flags;
+	kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false);
 	struct kvm_pgtable_visit_ctx ctx = {
 		.ptep	= ptep,
 		.old	= READ_ONCE(*ptep),
@@ -193,7 +194,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
 		.flags	= flags,
 	};
 	int ret = 0;
-	kvm_pte_t *childp;
+	kvm_pteref_t childp;
 	bool table = kvm_pte_table(ctx.old, level);
 
 	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
@@ -214,7 +215,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
 		goto out;
 	}
 
-	childp = kvm_pte_follow(ctx.old, mm_ops);
+	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
 	ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
 	if (ret)
 		goto out;
@@ -227,7 +228,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
 }
 
 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
-			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level)
+			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
 {
 	u32 idx;
 	int ret = 0;
@@ -236,12 +237,12 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
 		return -EINVAL;
 
 	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
-		kvm_pte_t *ptep = &pgtable[idx];
+		kvm_pteref_t pteref = &pgtable[idx];
 
 		if (data->addr >= data->end)
 			break;
 
-		ret = __kvm_pgtable_visit(data, mm_ops, ptep, level);
+		ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
 		if (ret)
 			break;
 	}
@@ -262,9 +263,9 @@ static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_da
 		return -EINVAL;
 
 	for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
-		kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+		kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];
 
-		ret = __kvm_pgtable_walk(data, pgt->mm_ops, ptep, pgt->start_level);
+		ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
 		if (ret)
 			break;
 	}
@@ -507,7 +508,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
 {
 	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
 
-	pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
+	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
 	if (!pgt->pgd)
 		return -ENOMEM;
 
@@ -1119,7 +1120,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
 	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
 
 	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
-	pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
+	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
 	if (!pgt->pgd)
 		return -ENOMEM;
Use an opaque type for pteps and require visitors explicitly dereference
the pointer before using. Protecting page table memory with RCU requires
that KVM dereferences RCU-annotated pointers before using. However, RCU
is not available for use in the nVHE hypervisor and the opaque type can
be conditionally annotated with RCU for the stage-2 MMU.

Call the type a 'pteref' to avoid a naming collision with raw pteps. No
functional change intended.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 arch/arm64/include/asm/kvm_pgtable.h | 9 ++++++++-
 arch/arm64/kvm/hyp/pgtable.c | 23 ++++++++++++-----------
 2 files changed, 20 insertions(+), 12 deletions(-)
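As a sketch of the conditional annotation the changelog alludes to (hedged:
the exact compile-time condition and the lockdep expression below are guesses
for illustration, not part of this patch), the pteref type could stay a plain
pointer at EL2, where RCU is unavailable, and carry the __rcu annotation in
the kernel proper:

	#if defined(__KVM_NVHE_HYPERVISOR__)

	typedef kvm_pte_t *kvm_pteref_t;

	static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared)
	{
		return pteref;	/* RCU is not available at EL2. */
	}

	#else

	typedef kvm_pte_t __rcu *kvm_pteref_t;

	static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared)
	{
		/*
		 * Shared (parallel) walkers must hold the RCU read lock;
		 * exclusive walkers are covered by the MMU write lock.
		 */
		return rcu_dereference_check(pteref, !shared);
	}

	#endif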