diff mbox series

[RESEND,v3,2/5] spapr: move NUMA data init to post-CAS

Message ID 20210825143943.529733-3-danielhb413@gmail.com (mailing list archive)
State New, archived
Headers show
Series pSeries FORM2 affinity support | expand

Commit Message

Daniel Henrique Barboza Aug. 25, 2021, 2:39 p.m. UTC
The pSeries machine will support a new NUMA affinity form, FORM2.
This new FORM will be negotiated via ibm,architecture-vec5 during
CAS. All artifacts and assumptions that are currently on use for
FORM1 affinity will be deprecated in a guest that chooses to use
FORM2. This means that we're going to wait until CAS to determine
whether we're going to use FORM1 and FORM2.

This patch does that by moving all NUMA data init functions to post-CAS
time. spapr_numa_associativity_init() is moved from spapr_machine_init()
to do_client_architecture_support(). Straightforward change since the
initialization of spapr->numa_assoc_array is transparent to the guest.

spapr_numa_write_rtas_dt() is more complex. The function is called from
spapr_dt_rtas(), which in turned is called by spapr_build_fdt().
spapr_build_fdt() is called in two places: spapr_machine_reset() and
do_client_architecture_support(). This means that we're writing RTAS
nodes with NUMA artifacts without going through CAS first, and then
writing it again post CAS. This is not an issue because, at this moment,
we always write the same FORM1 NUMA affinity properties in the DT.
With the upcoming FORM2 support, we're now reliant on guest choice to
decide what to write.

The proposed solution is to change spapr_numa_write_rtas_dt() to not
write the DT until we're post-CAS. This is a benign guest visible change
(a well behaved guest wouldn't bother to read NUMA properties before CAs),
but to be on the safe side, let's wrap it with a machine class flag to skip
this logic unless we're running with the latest machine type. This also
means that FORM2 support will not be available for older machine types.

Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
---
 hw/ppc/spapr.c         |  6 +++---
 hw/ppc/spapr_hcall.c   |  4 ++++
 hw/ppc/spapr_numa.c    | 11 +++++++++++
 include/hw/ppc/spapr.h |  1 +
 4 files changed, 19 insertions(+), 3 deletions(-)

Comments

Greg Kurz Aug. 25, 2021, 4:46 p.m. UTC | #1
On Wed, 25 Aug 2021 11:39:40 -0300
Daniel Henrique Barboza <danielhb413@gmail.com> wrote:

> The pSeries machine will support a new NUMA affinity form, FORM2.
> This new FORM will be negotiated via ibm,architecture-vec5 during
> CAS. All artifacts and assumptions that are currently on use for
> FORM1 affinity will be deprecated in a guest that chooses to use
> FORM2. This means that we're going to wait until CAS to determine
> whether we're going to use FORM1 and FORM2.
> 
> This patch does that by moving all NUMA data init functions to post-CAS
> time. spapr_numa_associativity_init() is moved from spapr_machine_init()
> to do_client_architecture_support(). Straightforward change since the
> initialization of spapr->numa_assoc_array is transparent to the guest.
> 
> spapr_numa_write_rtas_dt() is more complex. The function is called from
> spapr_dt_rtas(), which in turned is called by spapr_build_fdt().

It seems some other functions like spapr_numa_write_associativity_dt()
or spapr_numa_get_vcpu_assoc() could also be affected by the delayed call
to spapr_numa_associativity_init().

> spapr_build_fdt() is called in two places: spapr_machine_reset() and
> do_client_architecture_support(). This means that we're writing RTAS
> nodes with NUMA artifacts without going through CAS first, and then
> writing it again post CAS. This is not an issue because, at this moment,
> we always write the same FORM1 NUMA affinity properties in the DT.
> With the upcoming FORM2 support, we're now reliant on guest choice to
> decide what to write.
> 
> The proposed solution is to change spapr_numa_write_rtas_dt() to not
> write the DT until we're post-CAS. This is a benign guest visible change
> (a well behaved guest wouldn't bother to read NUMA properties before CAs),
> but to be on the safe side, let's wrap it with a machine class flag to skip
> this logic unless we're running with the latest machine type. This also
> means that FORM2 support will not be available for older machine types.
> 
> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> ---
>  hw/ppc/spapr.c         |  6 +++---
>  hw/ppc/spapr_hcall.c   |  4 ++++
>  hw/ppc/spapr_numa.c    | 11 +++++++++++
>  include/hw/ppc/spapr.h |  1 +
>  4 files changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 5459f9a7e9..d8363bda88 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2809,9 +2809,6 @@ static void spapr_machine_init(MachineState *machine)
>  
>      spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
>  
> -    /* Init numa_assoc_array */
> -    spapr_numa_associativity_init(spapr, machine);
> -
>      if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
>          ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>                                spapr->max_compat_pvr)) {
> @@ -4709,8 +4706,11 @@ DEFINE_SPAPR_MACHINE(6_1, "6.1", true);
>   */
>  static void spapr_machine_6_0_class_options(MachineClass *mc)
>  {
> +    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
> +
>      spapr_machine_6_1_class_options(mc);
>      compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
> +    smc->pre_6_1_numa_affinity = true;

Versions should be bumped now that 6.1 is released.

>  }
>  
>  DEFINE_SPAPR_MACHINE(6_0, "6.0", false);
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 0e9a5b2e40..1cc88716c1 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -11,6 +11,7 @@
>  #include "helper_regs.h"
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_cpu_core.h"
> +#include "hw/ppc/spapr_numa.h"
>  #include "mmu-hash64.h"
>  #include "cpu-models.h"
>  #include "trace.h"
> @@ -1197,6 +1198,9 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>      spapr->cas_pre_isa3_guest = !spapr_ovec_test(ov1_guest, OV1_PPC_3_00);
>      spapr_ovec_cleanup(ov1_guest);
>  
> +    /* Init numa_assoc_array */
> +    spapr_numa_associativity_init(spapr, MACHINE(spapr));
> +
>      /*
>       * Ensure the guest asks for an interrupt mode we support;
>       * otherwise terminate the boot.
> diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
> index 04a86f9b5b..b0bd056546 100644
> --- a/hw/ppc/spapr_numa.c
> +++ b/hw/ppc/spapr_numa.c
> @@ -379,6 +379,17 @@ static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr,
>   */
>  void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
>  {
> +    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
> +
> +    /*
> +     * pre-6.1 machine types were writing RTAS information before

pre-6.2

> +     * CAS. Check if that's case before changing their existing
> +     * behavior.
> +     */
> +    if (spapr_ovec_empty(spapr->ov5_cas) && !smc->pre_6_1_numa_affinity) {

Checking emptiness of spapr->ov5_cas is a hack since the guest
could have cleared all the bits... I'm not a big fan of checks
for pre/post CAS actually even if I had to add one for memory
hot unplug support some time back.

I'm not sure about the motivation for this patch. Is it *just* to
avoid the supposedly useless generation of FORM1 properties in
the initial DT ? If yes, this isn't a hot path so I don't think
it's worth the pain. We can start with FORM1 and switch to FORM2
if the guest requests so during CAS, no ?

> +        return;
> +    }
> +
>      spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas);
>  }
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 637652ad16..068a29535a 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -145,6 +145,7 @@ struct SpaprMachineClass {
>      hwaddr rma_limit;          /* clamp the RMA to this size */
>      bool pre_5_1_assoc_refpoints;
>      bool pre_5_2_numa_associativity;
> +    bool pre_6_1_numa_affinity;
>  
>      bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
>                            uint64_t *buid, hwaddr *pio,
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 5459f9a7e9..d8363bda88 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2809,9 +2809,6 @@  static void spapr_machine_init(MachineState *machine)
 
     spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
 
-    /* Init numa_assoc_array */
-    spapr_numa_associativity_init(spapr, machine);
-
     if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                               spapr->max_compat_pvr)) {
@@ -4709,8 +4706,11 @@  DEFINE_SPAPR_MACHINE(6_1, "6.1", true);
  */
 static void spapr_machine_6_0_class_options(MachineClass *mc)
 {
+    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+
     spapr_machine_6_1_class_options(mc);
     compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
+    smc->pre_6_1_numa_affinity = true;
 }
 
 DEFINE_SPAPR_MACHINE(6_0, "6.0", false);
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 0e9a5b2e40..1cc88716c1 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -11,6 +11,7 @@ 
 #include "helper_regs.h"
 #include "hw/ppc/spapr.h"
 #include "hw/ppc/spapr_cpu_core.h"
+#include "hw/ppc/spapr_numa.h"
 #include "mmu-hash64.h"
 #include "cpu-models.h"
 #include "trace.h"
@@ -1197,6 +1198,9 @@  target_ulong do_client_architecture_support(PowerPCCPU *cpu,
     spapr->cas_pre_isa3_guest = !spapr_ovec_test(ov1_guest, OV1_PPC_3_00);
     spapr_ovec_cleanup(ov1_guest);
 
+    /* Init numa_assoc_array */
+    spapr_numa_associativity_init(spapr, MACHINE(spapr));
+
     /*
      * Ensure the guest asks for an interrupt mode we support;
      * otherwise terminate the boot.
diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
index 04a86f9b5b..b0bd056546 100644
--- a/hw/ppc/spapr_numa.c
+++ b/hw/ppc/spapr_numa.c
@@ -379,6 +379,17 @@  static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr,
  */
 void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
 {
+    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+
+    /*
+     * pre-6.1 machine types were writing RTAS information before
+     * CAS. Check if that's case before changing their existing
+     * behavior.
+     */
+    if (spapr_ovec_empty(spapr->ov5_cas) && !smc->pre_6_1_numa_affinity) {
+        return;
+    }
+
     spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas);
 }
 
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 637652ad16..068a29535a 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -145,6 +145,7 @@  struct SpaprMachineClass {
     hwaddr rma_limit;          /* clamp the RMA to this size */
     bool pre_5_1_assoc_refpoints;
     bool pre_5_2_numa_associativity;
+    bool pre_6_1_numa_affinity;
 
     bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
                           uint64_t *buid, hwaddr *pio,