Message ID | 20200716225655.24289-1-arbab@linux.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v4] spapr: Add a new level of NUMA for GPUs | expand |
On Thu, Jul 16, 2020 at 05:56:55PM -0500, Reza Arbab wrote:
> NUMA nodes corresponding to GPU memory currently have the same
> affinity/distance as normal memory nodes. Add a third NUMA associativity
> reference point enabling us to give GPU nodes more distance.
>
> This is guest visible information, which shouldn't change under a
> running guest across migration between different qemu versions, so make
> the change effective only in new (pseries > 5.0) machine types.
>
> Before, `numactl -H` output in a guest with 4 GPUs (nodes 2-5):
>
> node distances:
> node   0   1   2   3   4   5
>   0:  10  40  40  40  40  40
>   1:  40  10  40  40  40  40
>   2:  40  40  10  40  40  40
>   3:  40  40  40  10  40  40
>   4:  40  40  40  40  10  40
>   5:  40  40  40  40  40  10
>
> After:
>
> node distances:
> node   0   1   2   3   4   5
>   0:  10  40  80  80  80  80
>   1:  40  10  80  80  80  80
>   2:  80  80  10  80  80  80
>   3:  80  80  80  10  80  80
>   4:  80  80  80  80  10  80
>   5:  80  80  80  80  80  10
>
> These are the same distances as on the host, mirroring the change made
> to host firmware in skiboot commit f845a648b8cb ("numa/associativity:
> Add a new level of NUMA for GPU's").

Applied to ppc-for-5.1.

> > Signed-off-by: Reza Arbab <arbab@linux.ibm.com> > --- > v4: > * Use nvslot->numa_id for distinction at all levels of ibm,associativity > * Use ARRAY_SIZE(refpoints) > * Rebase > > v3: > * Squash into one patch > * Add PHB compat property > --- > hw/ppc/spapr.c | 21 +++++++++++++++++++-- > hw/ppc/spapr_pci.c | 2 ++ > hw/ppc/spapr_pci_nvlink2.c | 13 ++++++++++--- > include/hw/pci-host/spapr.h | 1 + > include/hw/ppc/spapr.h | 1 + > 5 files changed, 33 insertions(+), 5 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 299908cc7396..0ae293ec9431 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -890,10 +890,16 @@ static int spapr_dt_rng(void *fdt) > static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) > { > MachineState *ms = MACHINE(spapr); > + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms); > int rtas; > GString *hypertas = g_string_sized_new(256); > GString *qemu_hypertas = g_string_sized_new(256); > - uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) }; > + uint32_t refpoints[] = { > + cpu_to_be32(0x4), > + cpu_to_be32(0x4), > + cpu_to_be32(0x2), > + }; > + uint32_t nr_refpoints = ARRAY_SIZE(refpoints); > uint64_t max_device_addr = MACHINE(spapr)->device_memory->base + > memory_region_size(&MACHINE(spapr)->device_memory->mr); > uint32_t lrdr_capacity[] = { > @@ -945,8 +951,12 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) > qemu_hypertas->str, qemu_hypertas->len)); > g_string_free(qemu_hypertas, TRUE); > > + if (smc->pre_5_1_assoc_refpoints) { > + nr_refpoints = 2; > + } > + > _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", > - refpoints, sizeof(refpoints))); > + refpoints, nr_refpoints * sizeof(refpoints[0]))); > > _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", > maxdomains, sizeof(maxdomains))); > @@ -4584,9 +4594,16 @@ DEFINE_SPAPR_MACHINE(5_1, "5.1", true); > */ > static void spapr_machine_5_0_class_options(MachineClass *mc) > { > + SpaprMachineClass *smc 
= SPAPR_MACHINE_CLASS(mc); > + static GlobalProperty compat[] = { > + { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" }, > + }; > + > spapr_machine_5_1_class_options(mc); > compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len); > + compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); > mc->numa_mem_supported = true; > + smc->pre_5_1_assoc_refpoints = true; > } > > DEFINE_SPAPR_MACHINE(5_0, "5.0", false); > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 2a6a48744aaa..16739334e35f 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -2035,6 +2035,8 @@ static Property spapr_phb_properties[] = { > pcie_ecs, true), > DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0), > DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0), > + DEFINE_PROP_BOOL("pre-5.1-associativity", SpaprPhbState, > + pre_5_1_assoc, false), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c > index dd8cd6db9654..76ae77ebc851 100644 > --- a/hw/ppc/spapr_pci_nvlink2.c > +++ b/hw/ppc/spapr_pci_nvlink2.c > @@ -362,9 +362,9 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) > &error_abort); > uint32_t associativity[] = { > cpu_to_be32(0x4), > - SPAPR_GPU_NUMA_ID, > - SPAPR_GPU_NUMA_ID, > - SPAPR_GPU_NUMA_ID, > + cpu_to_be32(nvslot->numa_id), > + cpu_to_be32(nvslot->numa_id), > + cpu_to_be32(nvslot->numa_id), > cpu_to_be32(nvslot->numa_id) > }; > uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL); > @@ -375,6 +375,13 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) > _FDT(off); > _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); > _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg)))); > + > + if (sphb->pre_5_1_assoc) { > + associativity[1] = SPAPR_GPU_NUMA_ID; > + associativity[2] = SPAPR_GPU_NUMA_ID; > + associativity[3] = SPAPR_GPU_NUMA_ID; > + } > + > _FDT((fdt_setprop(fdt, off, 
"ibm,associativity", associativity, > sizeof(associativity)))); > > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 8877ff51fbf7..600eb55c3488 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -94,6 +94,7 @@ struct SpaprPhbState { > hwaddr nv2_gpa_win_addr; > hwaddr nv2_atsd_win_addr; > SpaprPhbPciNvGpuConfig *nvgpus; > + bool pre_5_1_assoc; > }; > > #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index c421410e3fb8..3134d339e8fe 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -129,6 +129,7 @@ struct SpaprMachineClass { > bool linux_pci_probe; > bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ > hwaddr rma_limit; /* clamp the RMA to this size */ > + bool pre_5_1_assoc_refpoints; > > void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, > uint64_t *buid, hwaddr *pio,
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 299908cc7396..0ae293ec9431 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -890,10 +890,16 @@ static int spapr_dt_rng(void *fdt) static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) { MachineState *ms = MACHINE(spapr); + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms); int rtas; GString *hypertas = g_string_sized_new(256); GString *qemu_hypertas = g_string_sized_new(256); - uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) }; + uint32_t refpoints[] = { + cpu_to_be32(0x4), + cpu_to_be32(0x4), + cpu_to_be32(0x2), + }; + uint32_t nr_refpoints = ARRAY_SIZE(refpoints); uint64_t max_device_addr = MACHINE(spapr)->device_memory->base + memory_region_size(&MACHINE(spapr)->device_memory->mr); uint32_t lrdr_capacity[] = { @@ -945,8 +951,12 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) qemu_hypertas->str, qemu_hypertas->len)); g_string_free(qemu_hypertas, TRUE); + if (smc->pre_5_1_assoc_refpoints) { + nr_refpoints = 2; + } + _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", - refpoints, sizeof(refpoints))); + refpoints, nr_refpoints * sizeof(refpoints[0]))); _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", maxdomains, sizeof(maxdomains))); @@ -4584,9 +4594,16 @@ DEFINE_SPAPR_MACHINE(5_1, "5.1", true); */ static void spapr_machine_5_0_class_options(MachineClass *mc) { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + static GlobalProperty compat[] = { + { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" }, + }; + spapr_machine_5_1_class_options(mc); compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len); + compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); mc->numa_mem_supported = true; + smc->pre_5_1_assoc_refpoints = true; } DEFINE_SPAPR_MACHINE(5_0, "5.0", false); diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 2a6a48744aaa..16739334e35f 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ 
-2035,6 +2035,8 @@ static Property spapr_phb_properties[] = { pcie_ecs, true), DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0), DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0), + DEFINE_PROP_BOOL("pre-5.1-associativity", SpaprPhbState, + pre_5_1_assoc, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c index dd8cd6db9654..76ae77ebc851 100644 --- a/hw/ppc/spapr_pci_nvlink2.c +++ b/hw/ppc/spapr_pci_nvlink2.c @@ -362,9 +362,9 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) &error_abort); uint32_t associativity[] = { cpu_to_be32(0x4), - SPAPR_GPU_NUMA_ID, - SPAPR_GPU_NUMA_ID, - SPAPR_GPU_NUMA_ID, + cpu_to_be32(nvslot->numa_id), + cpu_to_be32(nvslot->numa_id), + cpu_to_be32(nvslot->numa_id), cpu_to_be32(nvslot->numa_id) }; uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL); @@ -375,6 +375,13 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) _FDT(off); _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg)))); + + if (sphb->pre_5_1_assoc) { + associativity[1] = SPAPR_GPU_NUMA_ID; + associativity[2] = SPAPR_GPU_NUMA_ID; + associativity[3] = SPAPR_GPU_NUMA_ID; + } + _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity, sizeof(associativity)))); diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 8877ff51fbf7..600eb55c3488 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -94,6 +94,7 @@ struct SpaprPhbState { hwaddr nv2_gpa_win_addr; hwaddr nv2_atsd_win_addr; SpaprPhbPciNvGpuConfig *nvgpus; + bool pre_5_1_assoc; }; #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index c421410e3fb8..3134d339e8fe 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -129,6 +129,7 @@ struct SpaprMachineClass { bool linux_pci_probe; bool 
smp_threads_vsmt; /* set VSMT to smp_threads by default */ hwaddr rma_limit; /* clamp the RMA to this size */ + bool pre_5_1_assoc_refpoints; void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, uint64_t *buid, hwaddr *pio,
NUMA nodes corresponding to GPU memory currently have the same
affinity/distance as normal memory nodes. Add a third NUMA associativity
reference point enabling us to give GPU nodes more distance.

This is guest visible information, which shouldn't change under a
running guest across migration between different qemu versions, so make
the change effective only in new (pseries > 5.0) machine types.

Before, `numactl -H` output in a guest with 4 GPUs (nodes 2-5):

node distances:
node   0   1   2   3   4   5
  0:  10  40  40  40  40  40
  1:  40  10  40  40  40  40
  2:  40  40  10  40  40  40
  3:  40  40  40  10  40  40
  4:  40  40  40  40  10  40
  5:  40  40  40  40  40  10

After:

node distances:
node   0   1   2   3   4   5
  0:  10  40  80  80  80  80
  1:  40  10  80  80  80  80
  2:  80  80  10  80  80  80
  3:  80  80  80  10  80  80
  4:  80  80  80  80  10  80
  5:  80  80  80  80  80  10

These are the same distances as on the host, mirroring the change made
to host firmware in skiboot commit f845a648b8cb ("numa/associativity:
Add a new level of NUMA for GPU's").

Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
---
v4:
* Use nvslot->numa_id for distinction at all levels of ibm,associativity
* Use ARRAY_SIZE(refpoints)
* Rebase

v3:
* Squash into one patch
* Add PHB compat property
---
 hw/ppc/spapr.c              | 21 +++++++++++++++++++--
 hw/ppc/spapr_pci.c          |  2 ++
 hw/ppc/spapr_pci_nvlink2.c  | 13 ++++++++++---
 include/hw/pci-host/spapr.h |  1 +
 include/hw/ppc/spapr.h      |  1 +
 5 files changed, 33 insertions(+), 5 deletions(-)