
[2/5] KVM: arm64: Build MPIDR to vcpu index cache at runtime

Message ID 20230907100931.1186690-3-maz@kernel.org (mailing list archive)
State New, archived
Series [1/5] KVM: arm64: Simplify kvm_vcpu_get_mpidr_aff()

Commit Message

Marc Zyngier Sept. 7, 2023, 10:09 a.m. UTC
The MPIDR_EL1 register contains a unique value that identifies
the CPU. The only problem with it is that it is stupidly large
(32 bits, once the useless stuff is removed).

Trying to obtain a vcpu from an MPIDR value is a fairly common,
yet costly operation: we iterate over all the vcpus until we
find the correct one. While this is cheap for small VMs, it is
pretty expensive on large ones, especially if you are trying to
get to the one that's at the end of the list...
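
For reference, the lookup in question (kvm_mpidr_to_vcpu()) is
currently nothing more than a linear scan, roughly:

struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
{
	struct kvm_vcpu *vcpu = NULL;
	unsigned long i;

	mpidr &= MPIDR_HWID_BITMASK;
	/* O(nr_vcpus) walk, restarted on every single translation */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
			return vcpu;
	}
	return NULL;
}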

In order to help with this, it is important to realise that
the MPIDR values are actually structured, and that implementations
tend to use a small number of significant bits in the 32-bit space.

We can use this fact to our advantage by computing a small hash
table that uses the "compression" of the significant MPIDR bits
as an index, giving us the vcpu index as a result.
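
As a made-up illustration: if the vcpus' MPIDR values only ever vary in
Aff0[3:0] and Aff1[1:0], only 6 bits are significant, and a 64-entry
table is enough. A standalone userspace sketch of the extraction (the
same idea as the kvm_mpidr_index() helper in this patch, values purely
illustrative):

#include <stdio.h>
#include <stdint.h>

/* Gather the bits of @val selected by @mask into a dense index (PEXT-style) */
static uint16_t compress(uint64_t mask, uint64_t val)
{
	uint16_t out = 0;
	int bit_idx = 0;

	for (int bit = 0; bit < 64; bit++) {
		if (mask & (1ULL << bit))
			out |= ((val >> bit) & 1) << bit_idx++;
	}
	return out;
}

int main(void)
{
	uint64_t mask = 0x30f;	/* Aff0[3:0] and Aff1[1:0] significant */

	/* affinity 0x103 (Aff1 = 1, Aff0 = 3) -> index 0b010011 = 19 */
	printf("index = %u\n", compress(mask, 0x103));
	return 0;
}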

Given that the MPIDR values can be supplied by userspace, and
that an evil VMM could decide to make *all* bits significant,
resulting in a 4G-entry table, we only use this method if the
resulting table fits in a single page. Otherwise, we fall back
to the good old iterative method.
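
For scale, a quick back-of-the-envelope check (assuming 4KiB pages and
u16 entries, as in this patch):

#include <stdio.h>

int main(void)
{
	/* 8-byte mask header + 2^nbits * sizeof(u16) entries vs one 4KiB page */
	for (int nbits = 8; nbits <= 12; nbits++) {
		unsigned long sz = 8 + (2UL << nbits);
		printf("%2d significant bits -> %5lu bytes%s\n",
		       nbits, sz, sz <= 4096 ? "" : " (iterative fallback)");
	}
	return 0;
}

Anything past 10 significant bits already busts the page budget.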

Nothing uses that table just yet, but keep your eyes peeled.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h | 28 ++++++++++++++++
 arch/arm64/kvm/arm.c              | 54 +++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

Comments

Joey Gouly Sept. 7, 2023, 3:29 p.m. UTC | #1
On Thu, Sep 07, 2023 at 11:09:28AM +0100, Marc Zyngier wrote:
> The MPIDR_EL1 register contains a unique value that identifies
> the CPU. The only problem with it is that it is stupidly large
> (32 bits, once the useless stuff is removed).
> 
> Trying to obtain a vcpu from an MPIDR value is a fairly common,
> yet costly operation: we iterate over all the vcpus until we
> find the correct one. While this is cheap for small VMs, it is
> pretty expensive on large ones, especially if you are trying to
> get to the one that's at the end of the list...
> 
> In order to help with this, it is important to realise that
> the MPIDR values are actually structured, and that implementations
> tend to use a small number of significant bits in the 32-bit space.
> 
> We can use this fact to our advantage by computing a small hash
> table that uses the "compression" of the significant MPIDR bits
> as an index, giving us the vcpu index as a result.
> 
> Given that the MPIDR values can be supplied by userspace, and
> that an evil VMM could decide to make *all* bits significant,
> resulting in a 4G-entry table, we only use this method if the
> resulting table fits in a single page. Otherwise, we fall back
> to the good old iterative method.
> 
> Nothing uses that table just yet, but keep your eyes peeled.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/include/asm/kvm_host.h | 28 ++++++++++++++++
>  arch/arm64/kvm/arm.c              | 54 +++++++++++++++++++++++++++++++
>  2 files changed, 82 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index af06ccb7ee34..361aaf66ac32 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -202,6 +202,31 @@ struct kvm_protected_vm {
>  	struct kvm_hyp_memcache teardown_mc;
>  };
>  
> +struct kvm_mpidr_data {
> +	u64			mpidr_mask;
> +	DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx);
> +};
> +
> +static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr)
> +{
> +	unsigned long mask = data->mpidr_mask;
> +	u64 aff = mpidr & MPIDR_HWID_BITMASK;
> +	int nbits, bit, bit_idx = 0;
> +	u16 vcpu_idx = 0;
> +
> +	/*
> +	 * If this looks like RISC-V's BEXT or x86's PEXT
> +	 * instructions, it isn't by accident.
> +	 */
> +	nbits = fls(mask);
> +	for_each_set_bit(bit, &mask, nbits) {
> +		vcpu_idx |= (aff & BIT(bit)) >> (bit - bit_idx);
> +		bit_idx++;
> +	}
> +
> +	return vcpu_idx;
> +}
> +
>  struct kvm_arch {
>  	struct kvm_s2_mmu mmu;
>  
> @@ -248,6 +273,9 @@ struct kvm_arch {
>  	/* VM-wide vCPU feature set */
>  	DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES);
>  
> +	/* MPIDR to vcpu index mapping, optional */
> +	struct kvm_mpidr_data *mpidr_data;
> +
>  	/*
>  	 * VM-wide PMU filter, implemented as a bitmap and big enough for
>  	 * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 4866b3f7b4ea..30ce394c09d4 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -205,6 +205,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>  	if (is_protected_kvm_enabled())
>  		pkvm_destroy_hyp_vm(kvm);
>  
> +	kfree(kvm->arch.mpidr_data);
>  	kvm_destroy_vcpus(kvm);
>  
>  	kvm_unshare_hyp(kvm, kvm + 1);
> @@ -578,6 +579,57 @@ static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
>  	return vcpu_get_flag(vcpu, VCPU_INITIALIZED);
>  }
>  
> +static void kvm_init_mpidr_data(struct kvm *kvm)
> +{
> +	struct kvm_mpidr_data *data = NULL;
> +	unsigned long c, mask, nr_entries;
> +	u64 aff_set = 0, aff_clr = ~0UL;
> +	struct kvm_vcpu *vcpu;
> +
> +	mutex_lock(&kvm->arch.config_lock);
> +
> +	if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1)
> +		goto out;
> +
> +	kvm_for_each_vcpu(c, vcpu, kvm) {
> +		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
> +		aff_set |= aff;
> +		aff_clr &= aff;
> +	}
> +
> +	/*
> +	 * A significant bit can be either 0 or 1, and will only appear in
> +	 * aff_set. Use aff_clr to weed out the useless stuff.
> +	 */
> +	mask = aff_set ^ aff_clr;
> +	nr_entries = BIT_ULL(hweight_long(mask));
> +
> +	/*
> +	 * Don't let userspace fool us. If we need more than a single page
> +	 * to describe the compressed MPIDR array, just fall back to the
> +	 * iterative method. Single vcpu VMs do not need this either.
> +	 */
> +	if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
> +		data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
> +			       GFP_KERNEL_ACCOUNT);
> +
> +	if (!data)
> +		goto out;

Probably not a big deal, but if the data doesn't fit, every vCPU will run this
function up to this point (if the data fits, or there's only one vCPU, we bail
out earlier).

> +
> +	data->mpidr_mask = mask;
> +
> +	kvm_for_each_vcpu(c, vcpu, kvm) {
> +		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
> +		u16 vcpu_idx = kvm_mpidr_index(data, aff);
> +
> +		data->cmpidr_to_idx[vcpu_idx] = c;
> +	}
> +
> +	kvm->arch.mpidr_data = data;
> +out:
> +	mutex_unlock(&kvm->arch.config_lock);
> +}
> +
>  /*
>   * Handle both the initialisation that is being done when the vcpu is
>   * run for the first time, as well as the updates that must be
> @@ -601,6 +653,8 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
>  	if (likely(vcpu_has_run_once(vcpu)))
>  		return 0;
>  
> +	kvm_init_mpidr_data(kvm);
> +
>  	kvm_arm_vcpu_init_debug(vcpu);
>  
>  	if (likely(irqchip_in_kernel(kvm))) {

Reviewed-by: Joey Gouly <joey.gouly@arm.com>
Marc Zyngier Sept. 7, 2023, 6:15 p.m. UTC | #2
On Thu, 07 Sep 2023 16:29:18 +0100,
Joey Gouly <joey.gouly@arm.com> wrote:
> 
> On Thu, Sep 07, 2023 at 11:09:28AM +0100, Marc Zyngier wrote:

[...]

> > @@ -578,6 +579,57 @@ static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
> >  	return vcpu_get_flag(vcpu, VCPU_INITIALIZED);
> >  }
> >  
> > +static void kvm_init_mpidr_data(struct kvm *kvm)
> > +{
> > +	struct kvm_mpidr_data *data = NULL;
> > +	unsigned long c, mask, nr_entries;
> > +	u64 aff_set = 0, aff_clr = ~0UL;
> > +	struct kvm_vcpu *vcpu;
> > +
> > +	mutex_lock(&kvm->arch.config_lock);
> > +
> > +	if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1)
> > +		goto out;
> > +
> > +	kvm_for_each_vcpu(c, vcpu, kvm) {
> > +		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
> > +		aff_set |= aff;
> > +		aff_clr &= aff;
> > +	}
> > +
> > +	/*
> > +	 * A significant bit can be either 0 or 1, and will only appear in
> > +	 * aff_set. Use aff_clr to weed out the useless stuff.
> > +	 */
> > +	mask = aff_set ^ aff_clr;
> > +	nr_entries = BIT_ULL(hweight_long(mask));
> > +
> > +	/*
> > +	 * Don't let userspace fool us. If we need more than a single page
> > +	 * to describe the compressed MPIDR array, just fall back to the
> > +	 * iterative method. Single vcpu VMs do not need this either.
> > +	 */
> > +	if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
> > +		data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
> > +			       GFP_KERNEL_ACCOUNT);
> > +
> > +	if (!data)
> > +		goto out;
> 
> Probably not a big deal, but if the data doesn't fit, every vCPU will run this
> function up to this point (if the data fits, or there's only one vCPU, we bail
> out earlier).

Yeah, I thought about that when writing this code, and applied the
following reasoning:

- this code is only run once per vcpu

- being able to remember that we cannot allocate the hash table
  requires at least an extra flag or a special value for the pointer

- this sequence is pretty quick (one read/OR/AND per scanned vcpu, so
  nr_vcpu^2 operations in total), and even if you have 512 vcpus, it
  isn't *that* much stuff given that it is spread across the vcpus

Now, if someone can actually measure a significant boot-time speed-up,
I'll happily add that flag.
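
For the record, the pointer-as-flag variant would be something along
these lines (purely illustrative, not part of this series):

/* Remember a failed sizing so that later vcpus skip the scan entirely */
#define KVM_MPIDR_DATA_TOO_BIG	ERR_PTR(-E2BIG)

	if (!data) {
		/* the existing non-NULL check at the top then bails out early */
		kvm->arch.mpidr_data = KVM_MPIDR_DATA_TOO_BIG;
		goto out;
	}

with IS_ERR() checks added to every consumer of mpidr_data, including
the kfree() in kvm_arch_destroy_vm(), which must not be handed an
ERR_PTR() value.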

[...]

> Reviewed-by: Joey Gouly <joey.gouly@arm.com>

Thanks!

	M.

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index af06ccb7ee34..361aaf66ac32 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -202,6 +202,31 @@ struct kvm_protected_vm {
 	struct kvm_hyp_memcache teardown_mc;
 };
 
+struct kvm_mpidr_data {
+	u64			mpidr_mask;
+	DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx);
+};
+
+static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr)
+{
+	unsigned long mask = data->mpidr_mask;
+	u64 aff = mpidr & MPIDR_HWID_BITMASK;
+	int nbits, bit, bit_idx = 0;
+	u16 vcpu_idx = 0;
+
+	/*
+	 * If this looks like RISC-V's BEXT or x86's PEXT
+	 * instructions, it isn't by accident.
+	 */
+	nbits = fls(mask);
+	for_each_set_bit(bit, &mask, nbits) {
+		vcpu_idx |= (aff & BIT(bit)) >> (bit - bit_idx);
+		bit_idx++;
+	}
+
+	return vcpu_idx;
+}
+
 struct kvm_arch {
 	struct kvm_s2_mmu mmu;
 
@@ -248,6 +273,9 @@ struct kvm_arch {
 	/* VM-wide vCPU feature set */
 	DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES);
 
+	/* MPIDR to vcpu index mapping, optional */
+	struct kvm_mpidr_data *mpidr_data;
+
 	/*
 	 * VM-wide PMU filter, implemented as a bitmap and big enough for
 	 * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 4866b3f7b4ea..30ce394c09d4 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -205,6 +205,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	if (is_protected_kvm_enabled())
 		pkvm_destroy_hyp_vm(kvm);
 
+	kfree(kvm->arch.mpidr_data);
 	kvm_destroy_vcpus(kvm);
 
 	kvm_unshare_hyp(kvm, kvm + 1);
@@ -578,6 +579,57 @@ static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
 	return vcpu_get_flag(vcpu, VCPU_INITIALIZED);
 }
 
+static void kvm_init_mpidr_data(struct kvm *kvm)
+{
+	struct kvm_mpidr_data *data = NULL;
+	unsigned long c, mask, nr_entries;
+	u64 aff_set = 0, aff_clr = ~0UL;
+	struct kvm_vcpu *vcpu;
+
+	mutex_lock(&kvm->arch.config_lock);
+
+	if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1)
+		goto out;
+
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
+		aff_set |= aff;
+		aff_clr &= aff;
+	}
+
+	/*
+	 * A significant bit can be either 0 or 1, and will only appear in
+	 * aff_set. Use aff_clr to weed out the useless stuff.
+	 */
+	mask = aff_set ^ aff_clr;
+	nr_entries = BIT_ULL(hweight_long(mask));
+
+	/*
+	 * Don't let userspace fool us. If we need more than a single page
+	 * to describe the compressed MPIDR array, just fall back to the
+	 * iterative method. Single vcpu VMs do not need this either.
+	 */
+	if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
+		data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
+			       GFP_KERNEL_ACCOUNT);
+
+	if (!data)
+		goto out;
+
+	data->mpidr_mask = mask;
+
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
+		u16 vcpu_idx = kvm_mpidr_index(data, aff);
+
+		data->cmpidr_to_idx[vcpu_idx] = c;
+	}
+
+	kvm->arch.mpidr_data = data;
+out:
+	mutex_unlock(&kvm->arch.config_lock);
+}
+
 /*
  * Handle both the initialisation that is being done when the vcpu is
  * run for the first time, as well as the updates that must be
@@ -601,6 +653,8 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 	if (likely(vcpu_has_run_once(vcpu)))
 		return 0;
 
+	kvm_init_mpidr_data(kvm);
+
 	kvm_arm_vcpu_init_debug(vcpu);
 
 	if (likely(irqchip_in_kernel(kvm))) {
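
As a hint of how the table is likely to be consumed by a later patch in
the series (a sketch, not code from this patch): the lookup indexes the
table, double-checks the hit, and keeps the linear scan as the slow path:

struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
{
	struct kvm_vcpu *vcpu = NULL;
	unsigned long i;

	mpidr &= MPIDR_HWID_BITMASK;

	if (kvm->arch.mpidr_data) {
		u16 idx = kvm_mpidr_index(kvm->arch.mpidr_data, mpidr);

		vcpu = kvm_get_vcpu(kvm,
				    kvm->arch.mpidr_data->cmpidr_to_idx[idx]);
		/* unused slots point at vcpu 0: verify the affinity matches */
		if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
			vcpu = NULL;

		return vcpu;
	}

	kvm_for_each_vcpu(i, vcpu, kvm)
		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
			return vcpu;

	return NULL;
}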