Message ID | 20220516232138.1783324-22-dmatlack@google.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Series | KVM: Extend Eager Page Splitting to the shadow MMU |
On Tue, May 17, 2022 at 4:52 AM David Matlack <dmatlack@google.com> wrote:
>
> Allow the capacity of the kvm_mmu_memory_cache struct to be chosen at
> declaration time rather than being fixed for all declarations. This will
> be used in a follow-up commit to declare a cache in x86 with a capacity
> of 512+ objects without having to increase the capacity of all caches in
> KVM.
>
> This change requires that each cache now specify its capacity at runtime,
> since the cache struct itself no longer has a fixed capacity known at
> compile time. To protect against someone accidentally defining a
> kvm_mmu_memory_cache struct directly (without the extra storage), this
> commit includes a WARN_ON() in kvm_mmu_topup_memory_cache().
>
> In order to support different capacities, this commit changes the
> objects pointer array to be dynamically allocated the first time the
> cache is topped up.
>
> While here, opportunistically clean up the stack-allocated
> kvm_mmu_memory_cache structs in riscv and arm64 to use designated
> initializers.
>
> No functional change intended.
>
> Reviewed-by: Marc Zyngier <maz@kernel.org>
> Signed-off-by: David Matlack <dmatlack@google.com>

Looks good to me for KVM RISC-V.

Reviewed-by: Anup Patel <anup@brainfault.org>

A small heads-up that the function stage2_ioremap() is going to be renamed
for Linux 5.19, so you might have to rebase one more time.

Thanks,
Anup

> ---
>  arch/arm64/kvm/mmu.c      |  2 +-
>  arch/riscv/kvm/mmu.c      |  5 +----
>  include/linux/kvm_types.h |  6 +++++-
>  virt/kvm/kvm_main.c       | 33 ++++++++++++++++++++++++++++++---
>  4 files changed, 37 insertions(+), 9 deletions(-)
>
> [...]
>
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index f80a34fbf102..4d95ebe4114f 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -347,10 +347,7 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>  	int ret = 0;
>  	unsigned long pfn;
>  	phys_addr_t addr, end;
> -	struct kvm_mmu_memory_cache pcache;
> -
> -	memset(&pcache, 0, sizeof(pcache));
> -	pcache.gfp_zero = __GFP_ZERO;
> +	struct kvm_mmu_memory_cache pcache = { .gfp_zero = __GFP_ZERO };
>
>  	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
>  	pfn = __phys_to_pfn(hpa);
>
> [...]
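For readers skimming the archive, here is a minimal sketch of the usage pattern the patch is building toward: the struct is declared with no inline storage, and the objects array is sized lazily at the first topup. The 512-entry capacity and the split_cache/topup_split_cache names are illustrative assumptions drawn from the commit message, not code from this series, and the sketch assumes __kvm_mmu_topup_memory_cache() is made visible to callers in a follow-up.

/* Illustrative sketch only; names and values are hypothetical. */
struct kvm_mmu_memory_cache split_cache = {
	.gfp_zero = __GFP_ZERO,
};

static int topup_split_cache(void)
{
	/*
	 * The first topup lazily allocates a 512-entry objects array and
	 * records the capacity; later topups must pass the same capacity.
	 */
	return __kvm_mmu_topup_memory_cache(&split_cache, 512, 1);
}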
On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
>
> Allow the capacity of the kvm_mmu_memory_cache struct to be chosen at
> declaration time rather than being fixed for all declarations. This will
> be used in a follow-up commit to declare a cache in x86 with a capacity
> of 512+ objects without having to increase the capacity of all caches in
> KVM.
>
> [...]
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e089db822c12..5e2e75014256 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
>  	return (void *)__get_free_page(gfp_flags);
>  }
>
> -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
>  {
> +	gfp_t gfp = GFP_KERNEL_ACCOUNT;
>  	void *obj;
>
>  	if (mc->nobjs >= min)
>  		return 0;
> -	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> -		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> +
> +	if (unlikely(!mc->objects)) {
> +		if (WARN_ON_ONCE(!capacity))
> +			return -EIO;
> +
> +		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> +		if (!mc->objects)
> +			return -ENOMEM;
> +
> +		mc->capacity = capacity;

Do we want to ensure the minimum value of the capacity? I think otherwise, we
may be more likely to start using memory from GFP_ATOMIC if the capacity is
less than, say, 5? But the minimum value seems related to each cache type.

> [...]
On Fri, May 20, 2022, Mingwei Zhang wrote:
> On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
> > [...]
> > +		mc->capacity = capacity;
>
> Do we want to ensure the minimum value of the capacity? I think otherwise,
> we may be more likely to start using memory from GFP_ATOMIC if the capacity
> is less than, say, 5? But the minimum value seems related to each cache type.

Eh, if we specify a minimum, just make the arch default the minimum. That way
we avoid adding even more magic/arbitrary numbers. E.g. for whatever reason,
MIPS's default is '4'.
On Mon, May 23, 2022 at 10:37 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, May 20, 2022, Mingwei Zhang wrote:
> > Do we want to ensure the minimum value of the capacity? I think otherwise,
> > we may be more likely to start using memory from GFP_ATOMIC if the capacity
> > is less than, say, 5? But the minimum value seems related to each cache type.
>
> Eh, if we specify a minimum, just make the arch default the minimum. That way
> we avoid adding even more magic/arbitrary numbers. E.g. for whatever reason,
> MIPS's default is '4'.

I'm not exactly sure what you had in mind, Mingwei, but there is a bug in this
code if min > capacity: the function will happily return 0 after filling up
the cache, even though it did not allocate min objects. The same bug existed
before this patch if min > ARRAY_SIZE(mc->objects). I can include a separate
patch to fix this bug (e.g. WARN and return -ENOMEM if min > capacity).
On Mon, May 23, 2022 at 10:44 AM David Matlack <dmatlack@google.com> wrote:
>
> [...]
> I'm not exactly sure what you had in mind, Mingwei, but there is a bug in this
> code if min > capacity: the function will happily return 0 after filling up
> the cache, even though it did not allocate min objects. The same bug existed
> before this patch if min > ARRAY_SIZE(mc->objects). I can include a separate
> patch to fix this bug (e.g. WARN and return -ENOMEM if min > capacity).

Oh, what I am saying is this one:
https://elixir.bootlin.com/linux/latest/source/virt/kvm/kvm_main.c#L417

If we run out of cached objects, then we start to use __GFP_ATOMIC, which
should be avoided as much as we can. Since this patch parameterizes the
'capacity', then to avoid a future usage where a caller provides a too-small
value, maybe we could add a warning if the 'capacity' is too small, say,
smaller than 40 (the default value)?

The case of 'capacity' < min would be a more serious issue; that situation
probably should never be allowed.
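For reference, the fallback Mingwei links to lives in kvm_mmu_memory_cache_alloc(); at the time it read roughly as follows (reproduced here from the upstream source as an approximation, not part of this patch): an empty cache WARNs and falls back to an atomic allocation.

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		/* Cache is empty: fall back to an atomic, accounted allocation. */
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}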
On Mon, May 23, 2022 at 11:13 AM Mingwei Zhang <mizhang@google.com> wrote:
>
> [...]
> Oh, what I am saying is this one:
> https://elixir.bootlin.com/linux/latest/source/virt/kvm/kvm_main.c#L417
>
> If we run out of cached objects, then we start to use __GFP_ATOMIC, which
> should be avoided as much as we can. Since this patch parameterizes the
> 'capacity', then to avoid a future usage where a caller provides a too-small
> value, maybe we could add a warning if the 'capacity' is too small, say,
> smaller than 40 (the default value)?

I'm not too worried about that. Callers of kvm_mmu_topup_memory_cache() are
responsible for passing in a min value. It doesn't matter if capacity is a
number lower than 40; as long as kvm_mmu_topup_memory_cache() is able to
allocate min objects, the call is a success (and the GFP_ATOMIC fallback
should never trigger, and if it does, we'll get a WARN splat).

The only actual loophole I can spot is if capacity is less than min. In that
case topup will return 0 despite allocating fewer than min objects. Again,
we'll still hit the GFP_ATOMIC path and get a WARN splat, but we can detect
the problem in kvm_mmu_topup_memory_cache(), which will include the buggy
callsite in the backtrace.

> The case of 'capacity' < min would be a more serious issue; that situation
> probably should never be allowed.
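To make the loophole concrete, here is a small self-contained userspace model of the topup loop (plain C, not kernel code; names are illustrative). With capacity = 4 and min = 8, it fills only four slots yet still reports success:

#include <stdio.h>
#include <stdlib.h>

struct cache {
	int nobjs;
	int capacity;
	void **objects;
};

static int topup(struct cache *mc, int capacity, int min)
{
	if (mc->nobjs >= min)
		return 0;

	if (!mc->objects) {
		mc->objects = calloc(capacity, sizeof(void *));
		if (!mc->objects)
			return -1;
		mc->capacity = capacity;
	}

	/* The loop stops at capacity... */
	while (mc->nobjs < mc->capacity)
		mc->objects[mc->nobjs++] = malloc(1);

	/* ...and success is reported even though nobjs < min. */
	return 0;
}

int main(void)
{
	struct cache mc = { 0 };

	topup(&mc, 4, 8);	/* capacity < min */
	printf("nobjs = %d (caller asked for min = 8)\n", mc.nobjs);
	return 0;
}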
On Mon, May 23, 2022 at 11:22 AM David Matlack <dmatlack@google.com> wrote:
>
> [...]
> I'm not too worried about that. Callers of kvm_mmu_topup_memory_cache() are
> responsible for passing in a min value. It doesn't matter if capacity is a
> number lower than 40; as long as kvm_mmu_topup_memory_cache() is able to
> allocate min objects, the call is a success (and the GFP_ATOMIC fallback
> should never trigger, and if it does, we'll get a WARN splat).

Ah, and I forgot to add: in this situation, the bug is that *min* is too
small, not capacity. So adding a restriction on capacity would not help.

> The only actual loophole I can spot is if capacity is less than min. In that
> case topup will return 0 despite allocating fewer than min objects. Again,
> we'll still hit the GFP_ATOMIC path and get a WARN splat, but we can detect
> the problem in kvm_mmu_topup_memory_cache(), which will include the buggy
> callsite in the backtrace.
>
> > The case of 'capacity' < min would be a more serious issue; that situation
> > probably should never be allowed.
On Mon, May 16, 2022, David Matlack wrote:
> -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)

I still find it somewhat kludgy to have callers provide a capacity. It's not
terrible while there's only a single call site, but if a use case comes along
for using an oversized cache with multiple call sites, it'll be gross.

Tweaking my idea of a "custom" wrapper, what about adding an "oversized"
wrapper? That yields clear, firm rules on when to use each helper, guards
against calling the "standard" flavor with an impossible @min, and addresses
Mingwei's concern that a misguided user could specify a nonsensically small
capacity. The only quirk is that kvm_mmu_topup_memory_cache_oversized() has a
fixed min, but IMO that's an acceptable tradeoff, and it's a non-issue until
another user pops up.

static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		capacity = max(min, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE);

		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	const int capacity = KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;

	if (WARN_ON_ONCE(min > capacity))
		min = capacity;

	return __kvm_mmu_topup_memory_cache(mc, capacity, min);
}

/* Oversized caches have a fixed size, i.e. min == capacity == size. */
int kvm_mmu_topup_memory_cache_oversized(struct kvm_mmu_memory_cache *mc, int size)
{
	if (WARN_ON_ONCE(size < KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE))
		size = KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;

	return __kvm_mmu_topup_memory_cache(mc, size, size);
}
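For illustration, call sites under this proposal might look like the lines below; the field names and the value 512 are hypothetical, not taken from the series:

	/* Standard cache: arch-default capacity, caller-chosen min (hypothetical). */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 1);

	/* Oversized cache: sized once, with min == capacity == size (hypothetical). */
	r = kvm_mmu_topup_memory_cache_oversized(&kvm->arch.split_cache, 512);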
On Fri, Jun 17, 2022, Sean Christopherson wrote:
> On Mon, May 16, 2022, David Matlack wrote:
> > -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> > +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
>
> I still find it somewhat kludgy to have callers provide a capacity. It's not
> terrible while there's only a single call site, but if a use case comes along
> for using an oversized cache with multiple call sites, it'll be gross.
>
> Tweaking my idea of a "custom" wrapper, what about adding an "oversized"
> wrapper? That yields clear, firm rules on when to use each helper, guards
> against calling the "standard" flavor with an impossible @min, and addresses
> Mingwei's concern that a misguided user could specify a nonsensically small
> capacity.

Drat, arguing against my own idea. The future usage in
nested_mmu_try_split_huge_page() is going to be inefficient. With
capacity==min, consuming just one entry (which is guaranteed when a huge page
split is successful) will mean the cache needs to be topped up. In other
words, I'm pretty sure need_topup_split_caches_or_resched() will always return
true between different huge pages, and so KVM will drop mmu_lock and
reschedule after every huge page.

Maybe that's not a big deal, but at the very least it's odd, and it's a nasty
gotcha with forcing capacity==min. So I'm ok with this patch, though it should
yell if min > capacity.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 932abb4fb67e..14e807501229 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -388,7 +388,7 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity,
 		return 0;
 
 	if (unlikely(!mc->objects)) {
-		if (WARN_ON_ONCE(!capacity))
+		if (WARN_ON_ONCE(!capacity || min > capacity))
 			return -EIO;
 
 		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 53ae2c0640bc..f443ed845f85 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -764,7 +764,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 {
 	phys_addr_t addr;
 	int ret = 0;
-	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
 	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
 				     KVM_PGTABLE_PROT_R |
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index f80a34fbf102..4d95ebe4114f 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -347,10 +347,7 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 	int ret = 0;
 	unsigned long pfn;
 	phys_addr_t addr, end;
-	struct kvm_mmu_memory_cache pcache;
-
-	memset(&pcache, 0, sizeof(pcache));
-	pcache.gfp_zero = __GFP_ZERO;
+	struct kvm_mmu_memory_cache pcache = { .gfp_zero = __GFP_ZERO };
 
 	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
 	pfn = __phys_to_pfn(hpa);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index ac1ebb37a0ff..68529884eaf8 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -83,12 +83,16 @@ struct gfn_to_pfn_cache {
  * MMU flows is problematic, as is triggering reclaim, I/O, etc... while
  * holding MMU locks. Note, these caches act more like prefetch buffers than
  * classical caches, i.e. objects are not returned to the cache on being freed.
+ *
+ * The @capacity field and @objects array are lazily initialized when the cache
+ * is topped up (__kvm_mmu_topup_memory_cache()).
  */
 struct kvm_mmu_memory_cache {
 	int nobjs;
 	gfp_t gfp_zero;
 	struct kmem_cache *kmem_cache;
-	void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
+	int capacity;
+	void **objects;
 };
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e089db822c12..5e2e75014256 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 	return (void *)__get_free_page(gfp_flags);
 }
 
-int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 {
+	gfp_t gfp = GFP_KERNEL_ACCOUNT;
 	void *obj;
 
 	if (mc->nobjs >= min)
 		return 0;
-	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
-		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+
+	if (unlikely(!mc->objects)) {
+		if (WARN_ON_ONCE(!capacity))
+			return -EIO;
+
+		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+		if (!mc->objects)
+			return -ENOMEM;
+
+		mc->capacity = capacity;
+	}
+
+	/* It is illegal to request a different capacity across topups. */
+	if (WARN_ON_ONCE(mc->capacity != capacity))
+		return -EIO;
+
+	while (mc->nobjs < mc->capacity) {
+		obj = mmu_memory_cache_alloc_obj(mc, gfp);
 		if (!obj)
 			return mc->nobjs >= min ? 0 : -ENOMEM;
 		mc->objects[mc->nobjs++] = obj;
@@ -384,6 +401,11 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 	return 0;
 }
 
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
+}
+
 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 {
 	return mc->nobjs;
@@ -397,6 +419,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 	else
 		free_page((unsigned long)mc->objects[--mc->nobjs]);
 	}
+
+	kvfree(mc->objects);
+
+	mc->objects = NULL;
+	mc->capacity = 0;
 }
 
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
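As a recap, a minimal lifecycle sketch of the API this patch leaves behind (the min value of 5 is illustrative and error handling is trimmed):

	struct kvm_mmu_memory_cache mc = { .gfp_zero = __GFP_ZERO };

	/* The first topup lazily sizes the objects array to the arch default. */
	r = kvm_mmu_topup_memory_cache(&mc, 5);

	/* Consume with the MMU lock held; draws from the prefilled array. */
	obj = kvm_mmu_memory_cache_alloc(&mc);

	/* Teardown frees leftover objects and the array, and resets capacity. */
	kvm_mmu_free_memory_cache(&mc);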