diff mbox series

[v2,3/3] hvf: arm: Implement and use hvf_get_physical_address_range

Message ID 20240828111552.93482-4-danny_canter@apple.com (mailing list archive)
State New
Headers show
Series hvf: arm: Support creating VMs with 64+GB of RAM on macOS 15+ | expand

Commit Message

Danny Canter Aug. 28, 2024, 11:15 a.m. UTC
This patch's main focus is to use the previously added
hvf_get_physical_address_range to inform VM creation
about the IPA size we need for the VM, so we can extend
the default 36b IPA size and support VMs with 64+GB of
RAM. This is done by freezing the memory map, computing
the highest GPA and then (depending on if the platform
supports an IPA size that large) telling the kernel to
use a size >= for the VM. In pursuit of this a couple of
things related to how we handle the physical address range
we expose to guests were altered, but for an explanation of
what we were doing:

Today, to get the IPA size we were reading id_aa64mmfr0_el1's
PARange field from a newly made vcpu. Unfortunately, HVF just
returns the hosts PARange directly for the initial value and
not the IPA size that will actually back the VM, so we believe
we have much more address space than we actually do today it seems.

Starting in macOS 13.0 some APIs were introduced to be able to
query the maximum IPA size the kernel supports, and to set the IPA
size for a given VM. However, this still has a couple of issues
on < macOS 15. Up until macOS 15 (and if the hardware supported
it) the max IPA size was 39 bits which is not a valid PARange
value, so we can't clamp down what we advertise in the vcpu's
id_aa64mmfr0_el1 to our IPA size. Starting in macOS 15 however,
the maximum IPA size is 40 bits (if it's supported in the hardware
as well) which is also a valid PARange value so we can set our IPA
size to the maximum as well as clamp down the PARange we advertise
to the guest. This allows VMs with 64+ GB of RAM and should fix the
oddness of the PARange situation as well.

Signed-off-by: Danny Canter <danny_canter@apple.com>
---
 accel/hvf/hvf-accel-ops.c | 12 ++++++++-
 hw/arm/virt.c             | 31 +++++++++++++++++++++-
 target/arm/hvf/hvf.c      | 56 ++++++++++++++++++++++++++++++++++++++-
 target/arm/hvf_arm.h      | 19 +++++++++++++
 target/arm/internals.h    | 19 +++++++++++++
 target/arm/ptw.c          | 15 +++++++++++
 6 files changed, 149 insertions(+), 3 deletions(-)

Comments

Peter Maydell Sept. 6, 2024, 3:31 p.m. UTC | #1
On Wed, 28 Aug 2024 at 12:16, Danny Canter <danny_canter@apple.com> wrote:
>
> This patch's main focus is to use the previously added
> hvf_get_physical_address_range to inform VM creation
> about the IPA size we need for the VM, so we can extend
> the default 36b IPA size and support VMs with 64+GB of
> RAM. This is done by freezing the memory map, computing
> the highest GPA and then (depending on if the platform
> supports an IPA size that large) telling the kernel to
> use a size >= for the VM. In pursuit of this a couple of
> things related to how we handle the physical address range
> we expose to guests were altered, but for an explanation of
> what we were doing:
>
> Today, to get the IPA size we were reading id_aa64mmfr0_el1's
> PARange field from a newly made vcpu. Unfortunately, HVF just
> returns the hosts PARange directly for the initial value and
> not the IPA size that will actually back the VM, so we believe
> we have much more address space than we actually do today it seems.
>
> Starting in macOS 13.0 some APIs were introduced to be able to
> query the maximum IPA size the kernel supports, and to set the IPA
> size for a given VM. However, this still has a couple of issues
> on < macOS 15. Up until macOS 15 (and if the hardware supported
> it) the max IPA size was 39 bits which is not a valid PARange
> value, so we can't clamp down what we advertise in the vcpu's
> id_aa64mmfr0_el1 to our IPA size. Starting in macOS 15 however,
> the maximum IPA size is 40 bits (if it's supported in the hardware
> as well) which is also a valid PARange value so we can set our IPA
> size to the maximum as well as clamp down the PARange we advertise
> to the guest. This allows VMs with 64+ GB of RAM and should fix the
> oddness of the PARange situation as well.
>
> Signed-off-by: Danny Canter <danny_canter@apple.com>

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

thanks
-- PMM
diff mbox series

Patch

diff --git a/accel/hvf/hvf-accel-ops.c b/accel/hvf/hvf-accel-ops.c
index dbebf209f4..d60874d3e6 100644
--- a/accel/hvf/hvf-accel-ops.c
+++ b/accel/hvf/hvf-accel-ops.c
@@ -53,6 +53,7 @@ 
 #include "exec/address-spaces.h"
 #include "exec/exec-all.h"
 #include "gdbstub/enums.h"
+#include "hw/boards.h"
 #include "sysemu/cpus.h"
 #include "sysemu/hvf.h"
 #include "sysemu/hvf_int.h"
@@ -319,8 +320,17 @@  static int hvf_accel_init(MachineState *ms)
     int x;
     hv_return_t ret;
     HVFState *s;
+    int pa_range = 36;
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
 
-    ret = hvf_arch_vm_create(ms, 0);
+    if (mc->hvf_get_physical_address_range) {
+        pa_range = mc->hvf_get_physical_address_range(ms);
+        if (pa_range < 0) {
+            return -EINVAL;
+        }
+    }
+
+    ret = hvf_arch_vm_create(ms, (uint32_t)pa_range);
     assert_hvf_ok(ret);
 
     s = g_new0(HVFState, 1);
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 62ee5f849b..b39c7924a0 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -66,6 +66,7 @@ 
 #include "hw/intc/arm_gicv3_its_common.h"
 #include "hw/irq.h"
 #include "kvm_arm.h"
+#include "hvf_arm.h"
 #include "hw/firmware/smbios.h"
 #include "qapi/visitor.h"
 #include "qapi/qapi-visit-common.h"
@@ -3030,7 +3031,35 @@  static int virt_kvm_type(MachineState *ms, const char *type_str)
 
 static int virt_hvf_get_physical_address_range(MachineState *ms)
 {
-    return 0;
+    VirtMachineState *vms = VIRT_MACHINE(ms);
+
+    int default_ipa_size = hvf_arm_get_default_ipa_bit_size();
+    int max_ipa_size = hvf_arm_get_max_ipa_bit_size();
+
+    /* We freeze the memory map to compute the highest gpa */
+    virt_set_memmap(vms, max_ipa_size);
+
+    int requested_ipa_size = 64 - clz64(vms->highest_gpa);
+
+    /*
+     * If we're <= the default IPA size just use the default.
+     * If we're above the default but below the maximum, round up to
+     * the maximum. hvf_arm_get_max_ipa_bit_size() conveniently only
+     * returns values that are valid ARM PARange values.
+     */
+    if (requested_ipa_size <= default_ipa_size) {
+        requested_ipa_size = default_ipa_size;
+    } else if (requested_ipa_size <= max_ipa_size) {
+        requested_ipa_size = max_ipa_size;
+    } else {
+        error_report("-m and ,maxmem option values "
+                     "require an IPA range (%d bits) larger than "
+                     "the one supported by the host (%d bits)",
+                     requested_ipa_size, max_ipa_size);
+        return -1;
+    }
+
+    return requested_ipa_size;
 }
 
 static void virt_machine_class_init(ObjectClass *oc, void *data)
diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
index 19964d241e..6cea483d42 100644
--- a/target/arm/hvf/hvf.c
+++ b/target/arm/hvf/hvf.c
@@ -22,6 +22,7 @@ 
 #include <mach/mach_time.h>
 
 #include "exec/address-spaces.h"
+#include "hw/boards.h"
 #include "hw/irq.h"
 #include "qemu/main-loop.h"
 #include "sysemu/cpus.h"
@@ -297,6 +298,8 @@  void hvf_arm_init_debug(void)
 
 static void hvf_wfi(CPUState *cpu);
 
+static uint32_t chosen_ipa_bit_size;
+
 typedef struct HVFVTimer {
     /* Vtimer value during migration and paused state */
     uint64_t vtimer_val;
@@ -839,6 +842,16 @@  static uint64_t hvf_get_reg(CPUState *cpu, int rt)
     return val;
 }
 
+static void clamp_id_aa64mmfr0_parange_to_ipa_size(uint64_t *id_aa64mmfr0)
+{
+    uint32_t ipa_size = chosen_ipa_bit_size ?
+            chosen_ipa_bit_size : hvf_arm_get_max_ipa_bit_size();
+
+    /* Clamp down the PARange to the IPA size the kernel supports. */
+    uint8_t index = round_down_to_parange_index(ipa_size);
+    *id_aa64mmfr0 = (*id_aa64mmfr0 & ~R_ID_AA64MMFR0_PARANGE_MASK) | index;
+}
+
 static bool hvf_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
 {
     ARMISARegisters host_isar = {};
@@ -882,6 +895,8 @@  static bool hvf_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
     r |= hv_vcpu_get_sys_reg(fd, HV_SYS_REG_MIDR_EL1, &ahcf->midr);
     r |= hv_vcpu_destroy(fd);
 
+    clamp_id_aa64mmfr0_parange_to_ipa_size(&host_isar.id_aa64mmfr0);
+
     ahcf->isar = host_isar;
 
     /*
@@ -904,6 +919,30 @@  static bool hvf_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
     return r == HV_SUCCESS;
 }
 
+uint32_t hvf_arm_get_default_ipa_bit_size(void)
+{
+    uint32_t default_ipa_size;
+    hv_return_t ret = hv_vm_config_get_default_ipa_size(&default_ipa_size);
+    assert_hvf_ok(ret);
+
+    return default_ipa_size;
+}
+
+uint32_t hvf_arm_get_max_ipa_bit_size(void)
+{
+    uint32_t max_ipa_size;
+    hv_return_t ret = hv_vm_config_get_max_ipa_size(&max_ipa_size);
+    assert_hvf_ok(ret);
+
+    /*
+     * We clamp any IPA size we want to back the VM with to a valid PARange
+     * value so the guest doesn't try and map memory outside of the valid range.
+     * This logic just clamps the passed in IPA bit size to the first valid
+     * PARange value <= to it.
+     */
+    return round_down_to_parange_bit_size(max_ipa_size);
+}
+
 void hvf_arm_set_cpu_features_from_host(ARMCPU *cpu)
 {
     if (!arm_host_cpu_features.dtb_compatible) {
@@ -931,8 +970,18 @@  void hvf_arch_vcpu_destroy(CPUState *cpu)
 
 hv_return_t hvf_arch_vm_create(MachineState *ms, uint32_t pa_range)
 {
+    hv_return_t ret;
     hv_vm_config_t config = hv_vm_config_create();
-    hv_return_t ret = hv_vm_create(config);
+
+    ret = hv_vm_config_set_ipa_size(config, pa_range);
+    if (ret != HV_SUCCESS) {
+        goto cleanup;
+    }
+    chosen_ipa_bit_size = pa_range;
+
+    ret = hv_vm_create(config);
+
+cleanup:
     os_release(config);
 
     return ret;
@@ -1004,6 +1053,11 @@  int hvf_arch_init_vcpu(CPUState *cpu)
                               &arm_cpu->isar.id_aa64mmfr0);
     assert_hvf_ok(ret);
 
+    clamp_id_aa64mmfr0_parange_to_ipa_size(&arm_cpu->isar.id_aa64mmfr0);
+    ret = hv_vcpu_set_sys_reg(cpu->accel->fd, HV_SYS_REG_ID_AA64MMFR0_EL1,
+                              arm_cpu->isar.id_aa64mmfr0);
+    assert_hvf_ok(ret);
+
     return 0;
 }
 
diff --git a/target/arm/hvf_arm.h b/target/arm/hvf_arm.h
index e848c1d27d..26c717b382 100644
--- a/target/arm/hvf_arm.h
+++ b/target/arm/hvf_arm.h
@@ -22,4 +22,23 @@  void hvf_arm_init_debug(void);
 
 void hvf_arm_set_cpu_features_from_host(ARMCPU *cpu);
 
+#ifdef CONFIG_HVF
+
+uint32_t hvf_arm_get_default_ipa_bit_size(void);
+uint32_t hvf_arm_get_max_ipa_bit_size(void);
+
+#else
+
+static inline uint32_t hvf_arm_get_default_ipa_bit_size(void)
+{
+    return 0;
+}
+
+static inline uint32_t hvf_arm_get_max_ipa_bit_size(void)
+{
+    return 0;
+}
+
+#endif
+
 #endif
diff --git a/target/arm/internals.h b/target/arm/internals.h
index 203a2dae14..c5d7b0b492 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -450,6 +450,25 @@  static inline void update_spsel(CPUARMState *env, uint32_t imm)
  */
 unsigned int arm_pamax(ARMCPU *cpu);
 
+/*
+ * round_down_to_parange_index
+ * @bit_size: uint8_t
+ *
+ * Rounds down the bit_size supplied to the first supported ARM physical
+ * address range and returns the index for this. The index is intended to
+ * be used to set ID_AA64MMFR0_EL1's PARANGE bits.
+ */
+uint8_t round_down_to_parange_index(uint8_t bit_size);
+
+/*
+ * round_down_to_parange_bit_size
+ * @bit_size: uint8_t
+ *
+ * Rounds down the bit_size supplied to the first supported ARM physical
+ * address range bit size and returns this.
+ */
+uint8_t round_down_to_parange_bit_size(uint8_t bit_size);
+
 /* Return true if extended addresses are enabled.
  * This is always the case if our translation regime is 64 bit,
  * but depends on TTBCR.EAE for 32 bit.
diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 278004661b..defd6b84de 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -96,6 +96,21 @@  static const uint8_t pamax_map[] = {
     [6] = 52,
 };
 
+uint8_t round_down_to_parange_index(uint8_t bit_size)
+{
+    for (int i = ARRAY_SIZE(pamax_map) - 1; i >= 0; i--) {
+        if (pamax_map[i] <= bit_size) {
+            return i;
+        }
+    }
+    g_assert_not_reached();
+}
+
+uint8_t round_down_to_parange_bit_size(uint8_t bit_size)
+{
+    return pamax_map[round_down_to_parange_index(bit_size)];
+}
+
 /*
  * The cpu-specific constant value of PAMax; also used by hw/arm/virt.
  * Note that machvirt_init calls this on a CPU that is inited but not realized!