
[RFC,v2] spapr: Support ibm,dynamic-memory-v2 property

Message ID 20180409062538.1095-1-bharata@linux.vnet.ibm.com (mailing list archive)
State New, archived

Commit Message

Bharata B Rao April 9, 2018, 6:25 a.m. UTC
The new property ibm,dynamic-memory-v2 allows memory to be represented
in a more compact manner in device tree.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
v1: https://lists.nongnu.org/archive/html/qemu-devel/2018-03/msg01788.html
Changes in v1:
- Minor cleanups in the error paths
- Rebased on top of ppc-for-2.13

 docs/specs/ppc-spapr-hotplug.txt |  19 +++
 hw/ppc/spapr.c                   | 257 ++++++++++++++++++++++++++++++++-------
 include/hw/ppc/spapr.h           |   1 +
 include/hw/ppc/spapr_ovec.h      |   1 +
 4 files changed, 233 insertions(+), 45 deletions(-)

Comments

David Gibson April 10, 2018, 3:02 a.m. UTC | #1
On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
> The new property ibm,dynamic-memory-v2 allows memory to be represented
> in a more compact manner in device tree.

I still need to look at this in more detail, but to start with:
what's the rationale for this new format?

It's more compact, but why do we care?  The embedded people always
whinge about the size of the device tree, but I didn't think that was
really a concern with PAPR.

> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> ---
> v1: https://lists.nongnu.org/archive/html/qemu-devel/2018-03/msg01788.html
> Changes in v1:
> - Minor cleanups in the error paths
> - Rebased on top of ppc-for-2.13
> 
>  docs/specs/ppc-spapr-hotplug.txt |  19 +++
>  hw/ppc/spapr.c                   | 257 ++++++++++++++++++++++++++++++++-------
>  include/hw/ppc/spapr.h           |   1 +
>  include/hw/ppc/spapr_ovec.h      |   1 +
>  4 files changed, 233 insertions(+), 45 deletions(-)
> 
> diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt
> index f57e2a09c6..cc7833108e 100644
> --- a/docs/specs/ppc-spapr-hotplug.txt
> +++ b/docs/specs/ppc-spapr-hotplug.txt
> @@ -387,4 +387,23 @@ Each LMB list entry consists of the following elements:
>  - A 32bit flags word. The bit at bit position 0x00000008 defines whether
>    the LMB is assigned to the the partition as of boot time.
>  
> +ibm,dynamic-memory-v2
> +
> +This property describes the dynamically reconfigurable memory. This is
> +an alternate and newer way to describe dynamically reconfigurable memory.
> +It is a property encoded array that has an integer N (the number of
> +LMB set entries) followed by N LMB set entries. There is an LMB set entry
> +for each sequential group of LMBs that share common attributes.
> +
> +Each LMB set entry consists of the following elements:
> +
> +- Number of sequential LMBs in the entry represented by a 32bit integer.
> +- Logical address of the first LMB in the set encoded as a 64bit integer.
> +- DRC index of the first LMB in the set.
> +- Associativity list index that is used as an index into
> +  ibm,associativity-lookup-arrays property described earlier. This
> +  is used to retrieve the right associativity list to be used for all
> +  the LMBs in this set.
> +- A 32bit flags word that applies to all the LMBs in the set.
> +
>  [1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3ffadd6ac7..4a24fac38c 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -669,63 +669,139 @@ static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
>      return -1;
>  }
>  
> -/*
> - * Adds ibm,dynamic-reconfiguration-memory node.
> - * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
> - * of this device tree node.
> - */
> -static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> +struct of_drconf_cell_v2 {
> +     uint32_t seq_lmbs;
> +     uint64_t base_addr;
> +     uint32_t drc_index;
> +     uint32_t aa_index;
> +     uint32_t flags;
> +} __attribute__((packed));
> +
> +#define SPAPR_DRCONF_CELL_SIZE 6
> +
> +/* ibm,dynamic-memory-v2 */
> +static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> +                                   int offset, MemoryDeviceInfoList *dimms)
>  {
> -    MachineState *machine = MACHINE(spapr);
> -    int ret, i, offset;
> -    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> -    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
> -    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
> -    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
> -                       memory_region_size(&spapr->hotplug_memory.mr)) /
> -                       lmb_size;
>      uint32_t *int_buf, *cur_index, buf_len;
> -    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
> -    MemoryDeviceInfoList *dimms = NULL;
> +    int ret;
> +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint64_t addr, cur_addr, size;
> +    uint32_t nr_boot_lmbs = (spapr->hotplug_memory.base / lmb_size);
> +    uint64_t mem_end = spapr->hotplug_memory.base +
> +                       memory_region_size(&spapr->hotplug_memory.mr);
> +    uint32_t node, nr_entries = 0;
> +    sPAPRDRConnector *drc;
> +    typedef struct drconf_cell_queue {
> +        struct of_drconf_cell_v2 cell;
> +        QSIMPLEQ_ENTRY(drconf_cell_queue) entry;
> +    } drconf_cell_queue;
> +    QSIMPLEQ_HEAD(, drconf_cell_queue) drconf_queue
> +        = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
> +    drconf_cell_queue *elem, *next;
> +    MemoryDeviceInfoList *info;
>  
> -    /*
> -     * Don't create the node if there is no hotpluggable memory
> -     */
> -    if (machine->ram_size == machine->maxram_size) {
> -        return 0;
> -    }
> +    /* Entry to cover RAM and the gap area */
> +    elem = g_malloc0(sizeof(drconf_cell_queue));
> +    elem->cell.seq_lmbs = cpu_to_be32(nr_boot_lmbs);
> +    elem->cell.base_addr = cpu_to_be64(0);
> +    elem->cell.drc_index = cpu_to_be32(0);
> +    elem->cell.aa_index = cpu_to_be32(-1);
> +    elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
> +                                   SPAPR_LMB_FLAGS_DRC_INVALID);
> +    QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +    nr_entries++;
> +
> +    cur_addr = spapr->hotplug_memory.base;
> +    for (info = dimms; info; info = info->next) {
> +        PCDIMMDeviceInfo *di = info->value->u.dimm.data;
> +
> +        addr = di->addr;
> +        size = di->size;
> +        node = di->node;
> +
> +        /* Entry for hot-pluggable area */
> +        if (cur_addr < addr) {
> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
> +            g_assert(drc);
>  
> -    /*
> -     * Allocate enough buffer size to fit in ibm,dynamic-memory
> -     * or ibm,associativity-lookup-arrays
> -     */
> -    buf_len = MAX(nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1, nr_nodes * 4 + 2)
> -              * sizeof(uint32_t);
> -    cur_index = int_buf = g_malloc0(buf_len);
> +            elem = g_malloc0(sizeof(drconf_cell_queue));
> +            elem->cell.seq_lmbs = cpu_to_be32((addr - cur_addr) / lmb_size);
> +            elem->cell.base_addr = cpu_to_be64(cur_addr);
> +            elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> +            elem->cell.aa_index = cpu_to_be32(-1);
> +            elem->cell.flags = cpu_to_be32(0);
> +            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +            nr_entries++;
> +        }
>  
> -    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
> +        /* Entry for DIMM */
> +        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> +        g_assert(drc);
>  
> -    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
> -                    sizeof(prop_lmb_size));
> -    if (ret < 0) {
> -        goto out;
> +        elem = g_malloc0(sizeof(drconf_cell_queue));
> +        elem->cell.seq_lmbs = cpu_to_be32(size / lmb_size);
> +        elem->cell.base_addr = cpu_to_be64(addr);
> +        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> +        elem->cell.aa_index = cpu_to_be32(node);
> +        elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
> +        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +        nr_entries++;
> +        cur_addr = addr + size;
>      }
>  
> -    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
> -    if (ret < 0) {
> -        goto out;
> +    /* Entry for remaining hotpluggable area */
> +    if (cur_addr < mem_end) {
> +        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
> +        g_assert(drc);
> +
> +        elem = g_malloc0(sizeof(drconf_cell_queue));
> +        elem->cell.seq_lmbs = cpu_to_be32((mem_end - cur_addr) / lmb_size);
> +        elem->cell.base_addr = cpu_to_be64(cur_addr);
> +        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> +        elem->cell.aa_index = cpu_to_be32(-1);
> +        elem->cell.flags = cpu_to_be32(0);
> +        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +        nr_entries++;
>      }
>  
> -    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
> -    if (ret < 0) {
> -        goto out;
> +    buf_len = (nr_entries * SPAPR_DRCONF_CELL_SIZE + 1) * sizeof(uint32_t);
> +    int_buf = cur_index = g_malloc0(buf_len);
> +    int_buf[0] = cpu_to_be32(nr_entries);
> +    cur_index++;
> +    QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
> +        memcpy(cur_index, &elem->cell,
> +               SPAPR_DRCONF_CELL_SIZE * sizeof(uint32_t));
> +        cur_index += SPAPR_DRCONF_CELL_SIZE;
> +        QSIMPLEQ_REMOVE(&drconf_queue, elem, drconf_cell_queue, entry);
> +        g_free(elem);
>      }
>  
> -    if (hotplug_lmb_start) {
> -        dimms = qmp_pc_dimm_device_list();
> +    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
> +    g_free(int_buf);
> +    if (ret < 0) {
> +        return -1;
>      }
> +    return 0;
> +}
>  
> -    /* ibm,dynamic-memory */
> +/* ibm,dynamic-memory */
> +static int spapr_populate_drmem_v1(sPAPRMachineState *spapr, void *fdt,
> +                                   int offset, MemoryDeviceInfoList *dimms)
> +{
> +    int i, ret;
> +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
> +    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
> +                       memory_region_size(&spapr->hotplug_memory.mr)) /
> +                       lmb_size;
> +    uint32_t *int_buf, *cur_index, buf_len;
> +
> +    /*
> +     * Allocate enough buffer size to fit in ibm,dynamic-memory
> +     */
> +    buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
> +    cur_index = int_buf = g_malloc0(buf_len);
>      int_buf[0] = cpu_to_be32(nr_lmbs);
>      cur_index++;
>      for (i = 0; i < nr_lmbs; i++) {
> @@ -765,13 +841,71 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
>  
>          cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
>      }
> -    qapi_free_MemoryDeviceInfoList(dimms);
>      ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
> +    g_free(int_buf);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +    return 0;
> +}
> +
> +/*
> + * Adds ibm,dynamic-reconfiguration-memory node.
> + * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
> + * of this device tree node.
> + */
> +static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> +{
> +    MachineState *machine = MACHINE(spapr);
> +    int ret, i, offset;
> +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
> +    uint32_t *int_buf, *cur_index, buf_len;
> +    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
> +    MemoryDeviceInfoList *dimms = NULL;
> +
> +    /*
> +     * Don't create the node if there is no hotpluggable memory
> +     */
> +    if (machine->ram_size == machine->maxram_size) {
> +        return 0;
> +    }
> +
> +    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
> +
> +    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
> +                    sizeof(prop_lmb_size));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
> +    dimms = qmp_pc_dimm_device_list();
> +    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
> +        ret = spapr_populate_drmem_v2(spapr, fdt, offset, dimms);
> +    } else {
> +        ret = spapr_populate_drmem_v1(spapr, fdt, offset, dimms);
> +    }
> +    qapi_free_MemoryDeviceInfoList(dimms);
> +
>      if (ret < 0) {
>          goto out;
>      }
>  
>      /* ibm,associativity-lookup-arrays */
> +    buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t);
> +    cur_index = int_buf = g_malloc0(buf_len);
> +
>      cur_index = int_buf;
>      int_buf[0] = cpu_to_be32(nr_nodes);
>      int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */
> @@ -788,8 +922,9 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
>      }
>      ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
>              (cur_index - int_buf) * sizeof(uint32_t));
> -out:
>      g_free(int_buf);
> +
> +out:
>      return ret;
>  }
>  
> @@ -2500,6 +2635,11 @@ static void spapr_machine_init(MachineState *machine)
>          spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
>      }
>  
> +    /* advertise support for ibm,dynamic-memory-v2 */
> +    if (spapr->use_ibm_dynamic_memory_v2) {
> +        spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
> +    }
> +
>      /* init CPUs */
>      spapr_init_cpus(spapr);
>  
> @@ -2907,12 +3047,27 @@ static void spapr_set_vsmt(Object *obj, Visitor *v, const char *name,
>      visit_type_uint32(v, name, (uint32_t *)opaque, errp);
>  }
>  
> +static bool spapr_get_drmem_v2(Object *obj, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    return spapr->use_ibm_dynamic_memory_v2;
> +}
> +
> +static void spapr_set_drmem_v2(Object *obj, bool value, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    spapr->use_ibm_dynamic_memory_v2 = value;
> +}
> +
>  static void spapr_instance_init(Object *obj)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>  
>      spapr->htab_fd = -1;
>      spapr->use_hotplug_event_source = true;
> +    spapr->use_ibm_dynamic_memory_v2 = true;
>      object_property_add_str(obj, "kvm-type",
>                              spapr_get_kvm_type, spapr_set_kvm_type, NULL);
>      object_property_set_description(obj, "kvm-type",
> @@ -2927,6 +3082,15 @@ static void spapr_instance_init(Object *obj)
>                                      " place of standard EPOW events when possible"
>                                      " (required for memory hot-unplug support)",
>                                      NULL);
> +    object_property_add_bool(obj, "drmem-v2",
> +                             spapr_get_drmem_v2,
> +                             spapr_set_drmem_v2,
> +                             NULL);
> +    object_property_set_description(obj, "ibm-dynamic-memory-v2",
> +                                    "Use ibm-dynamic-memory-v2 representation"
> +                                    " in place of ibm-dynamic-memory when"
> +                                    " possible",
> +                                    NULL);
>  
>      ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
>                              "Maximum permitted CPU compatibility mode",
> @@ -4042,7 +4206,10 @@ DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);
>  
>  static void spapr_machine_2_11_instance_options(MachineState *machine)
>  {
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
> +
>      spapr_machine_2_12_instance_options(machine);
> +    spapr->use_ibm_dynamic_memory_v2 = false;
>  }
>  
>  static void spapr_machine_2_11_class_options(MachineClass *mc)
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index d60b7c6d7a..5e044c44af 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -149,6 +149,7 @@ struct sPAPRMachineState {
>      sPAPROptionVector *ov5;         /* QEMU-supported option vectors */
>      sPAPROptionVector *ov5_cas;     /* negotiated (via CAS) option vectors */
>      uint32_t max_compat_pvr;
> +    bool use_ibm_dynamic_memory_v2;
>  
>      /* Migration state */
>      int htab_save_index;
> diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
> index bf25e5d954..0f2d8d715d 100644
> --- a/include/hw/ppc/spapr_ovec.h
> +++ b/include/hw/ppc/spapr_ovec.h
> @@ -51,6 +51,7 @@ typedef struct sPAPROptionVector sPAPROptionVector;
>  #define OV5_FORM1_AFFINITY      OV_BIT(5, 0)
>  #define OV5_HP_EVT              OV_BIT(6, 5)
>  #define OV5_HPT_RESIZE          OV_BIT(6, 7)
> +#define OV5_DRMEM_V2            OV_BIT(22, 0)
>  #define OV5_XIVE_BOTH           OV_BIT(23, 0)
>  #define OV5_XIVE_EXPLOIT        OV_BIT(23, 1) /* 1=exploitation 0=legacy */
>
Bharata B Rao April 10, 2018, 4:15 a.m. UTC | #2
On Tue, Apr 10, 2018 at 01:02:45PM +1000, David Gibson wrote:
> On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
> > The new property ibm,dynamic-memory-v2 allows memory to be represented
> > in a more compact manner in device tree.
> 
> I still need to look at this in more detail, but to start with:
> what's the rationale for this new format?
> 
> It's more compact, but why do we care?  The embedded people always
> whinge about the size of the device tree, but I didn't think that was
> really a concern with PAPR.

Here's a real example of how this has affected us earlier:

SLOF's CAS FDT buffer size was initially 32K, was changed to 64k to
support 1TB guest memory and again changed to 2MB to support 16TB guest
memory.

With ibm,dynamic-memory-v2 we are less likely to hit such scenarios.

Also, theoretically it should be more efficient in the guest kernel
to handle LMB-sets than individual LMBs.

We aren't there yet, but I believe grouping of LMBs should eventually
help us do memory hotplug at set (or DIMM) granularity rather than at
individual LMB granularity (again, a theoretical possibility).

Regards,
Bharata.
Alexey Kardashevskiy April 11, 2018, 3:56 a.m. UTC | #3
On 10/4/18 1:02 pm, David Gibson wrote:
> On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
>> The new property ibm,dynamic-memory-v2 allows memory to be represented
>> in a more compact manner in device tree.
> 
> I still need to look at this in more detail, but to start with:
> what's the rationale for this new format?
> 
> It's more compact, but why do we care?  The embedded people always
> whinge about the size of the device tree, but I didn't think that was
> really a concern with PAPR.


Well, booting a guest with 500 PCI devices (let's say emulated e1000 or
virtio-net) creates a >20sec delay in the guest while it is fetching the
device tree in early setup, property by property. I even have a patch for
the pseries guest to get the FDT as a whole blob from SLOF. It is not The
Problem but still annoying.
David Gibson April 11, 2018, 4:15 a.m. UTC | #4
On Wed, Apr 11, 2018 at 01:56:14PM +1000, Alexey Kardashevskiy wrote:
> On 10/4/18 1:02 pm, David Gibson wrote:
> > On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
> >> The new property ibm,dynamic-memory-v2 allows memory to be represented
> >> in a more compact manner in device tree.
> > 
> > I still need to look at this in more detail, but to start with:
> > what's the rationale for this new format?
> > 
> > It's more compact, but why do we care?  The embedded people always
> > whinge about the size of the device tree, but I didn't think that was
> > really a concern with PAPR.
> 
> 
> Well, booting a guest with 500 PCI devices (let's say emulated e1000 or
> virtio-net) creates a >20sec delay in the guest while it is fetching the
> device tree in early setup, property by property. I even have a patch for
> the pseries guest to get the FDT as a whole blob from SLOF. It is not The
> Problem but still annoying.

Right, but 500 PCI devices is not this case.  Sounds like the slowness
there is mostly from having lots of nodes and properties and therefore
lots of individual OF calls.  The old dynamic memory format is still
just one big property, so it shouldn't have nearly the same impact.

Besides, Bharata pretty much convinced me already with his other reasons.
David Gibson April 11, 2018, 4:21 a.m. UTC | #5
On Tue, Apr 10, 2018 at 09:45:21AM +0530, Bharata B Rao wrote:
> On Tue, Apr 10, 2018 at 01:02:45PM +1000, David Gibson wrote:
> > On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
> > > The new property ibm,dynamic-memory-v2 allows memory to be represented
> > > in a more compact manner in device tree.
> > 
> > I still need to look at this in more detail, but to start with:
> > what's the rationale for this new format?
> > 
> > It's more compact, but why do we care?  The embedded people always
> > whinge about the size of the device tree, but I didn't think that was
> > really a concern with PAPR.
> 
> Here's a real example of how this has affected us earlier:
> 
> SLOF's CAS FDT buffer size was initially 32K, was changed to 64k to
> support 1TB guest memory and again changed to 2MB to support 16TB guest
> memory.

Ah.. I hadn't thought of the CAS buffer, that's a legitimate concern.

> With ibm,dynamic-memory-v2 we are less likely to hit such scenarios.
> 
> Also, theoretically it should be more efficient in the guest kernel
> to handle LMB-sets than individual LMBs.
> 
> We aren't there yet, but I believe grouping of LMBs should eventually
> help us do memory hotplug at set (or DIMM) granularity rather than at
> individual LMB granularity (again, a theoretical possibility).

Ok, sounds like it might be useful.
David Gibson April 11, 2018, 4:45 a.m. UTC | #6
On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
> The new property ibm,dynamic-memory-v2 allows memory to be represented
> in a more compact manner in device tree.
> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> ---
> v1: https://lists.nongnu.org/archive/html/qemu-devel/2018-03/msg01788.html
> Changes in v1:
> - Minor cleanups in the error paths
> - Rebased on top of ppc-for-2.13
> 
>  docs/specs/ppc-spapr-hotplug.txt |  19 +++
>  hw/ppc/spapr.c                   | 257 ++++++++++++++++++++++++++++++++-------
>  include/hw/ppc/spapr.h           |   1 +
>  include/hw/ppc/spapr_ovec.h      |   1 +
>  4 files changed, 233 insertions(+), 45 deletions(-)



> diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt
> index f57e2a09c6..cc7833108e 100644
> --- a/docs/specs/ppc-spapr-hotplug.txt
> +++ b/docs/specs/ppc-spapr-hotplug.txt
> @@ -387,4 +387,23 @@ Each LMB list entry consists of the following elements:
>  - A 32bit flags word. The bit at bit position 0x00000008 defines whether
>    the LMB is assigned to the the partition as of boot time.
>  
> +ibm,dynamic-memory-v2
> +
> +This property describes the dynamically reconfigurable memory. This is
> +an alternate and newer way to describe dynamically reconfigurable memory.
> +It is a property encoded array that has an integer N (the number of
> +LMB set entries) followed by N LMB set entries. There is an LMB set entry
> +for each sequential group of LMBs that share common attributes.
> +
> +Each LMB set entry consists of the following elements:
> +
> +- Number of sequential LMBs in the entry represented by a 32bit integer.
> +- Logical address of the first LMB in the set encoded as a 64bit integer.
> +- DRC index of the first LMB in the set.
> +- Associativity list index that is used as an index into
> +  ibm,associativity-lookup-arrays property described earlier. This
> +  is used to retrieve the right associativity list to be used for all
> +  the LMBs in this set.
> +- A 32bit flags word that applies to all the LMBs in the set.
> +
>  [1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3ffadd6ac7..4a24fac38c 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -669,63 +669,139 @@ static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
>      return -1;
>  }
>  
> -/*
> - * Adds ibm,dynamic-reconfiguration-memory node.
> - * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
> - * of this device tree node.
> - */
> -static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> +struct of_drconf_cell_v2 {

qemu convention is to use CamelCase for types.
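
Something like this, perhaps (only a sketch; the name is just a
suggestion):

    typedef struct OfDrconfCellV2 {
        uint32_t seq_lmbs;
        uint64_t base_addr;
        uint32_t drc_index;
        uint32_t aa_index;
        uint32_t flags;
    } __attribute__((packed)) OfDrconfCellV2;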

> +     uint32_t seq_lmbs;
> +     uint64_t base_addr;
> +     uint32_t drc_index;
> +     uint32_t aa_index;
> +     uint32_t flags;
> +} __attribute__((packed));
> +
> +#define SPAPR_DRCONF_CELL_SIZE 6

Define this using a sizeof() for safety.
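
e.g. something along these lines (assuming the struct stays a whole
number of 32bit cells):

    #define SPAPR_DRCONF_CELL_SIZE \
        (sizeof(struct of_drconf_cell_v2) / sizeof(uint32_t))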

> +/* ibm,dynamic-memory-v2 */
> +static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> +                                   int offset, MemoryDeviceInfoList *dimms)
>  {
> -    MachineState *machine = MACHINE(spapr);
> -    int ret, i, offset;
> -    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> -    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
> -    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
> -    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
> -                       memory_region_size(&spapr->hotplug_memory.mr)) /
> -                       lmb_size;
>      uint32_t *int_buf, *cur_index, buf_len;
> -    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
> -    MemoryDeviceInfoList *dimms = NULL;
> +    int ret;
> +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint64_t addr, cur_addr, size;
> +    uint32_t nr_boot_lmbs = (spapr->hotplug_memory.base / lmb_size);
> +    uint64_t mem_end = spapr->hotplug_memory.base +
> +                       memory_region_size(&spapr->hotplug_memory.mr);
> +    uint32_t node, nr_entries = 0;
> +    sPAPRDRConnector *drc;
> +    typedef struct drconf_cell_queue {
> +        struct of_drconf_cell_v2 cell;
> +        QSIMPLEQ_ENTRY(drconf_cell_queue) entry;
> +    } drconf_cell_queue;

Likewise CamelCase here.

> +    QSIMPLEQ_HEAD(, drconf_cell_queue) drconf_queue
> +        = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
> +    drconf_cell_queue *elem, *next;
> +    MemoryDeviceInfoList *info;
>  
> -    /*
> -     * Don't create the node if there is no hotpluggable memory
> -     */
> -    if (machine->ram_size == machine->maxram_size) {
> -        return 0;
> -    }
> +    /* Entry to cover RAM and the gap area */
> +    elem = g_malloc0(sizeof(drconf_cell_queue));

Please use sizeof(*elem) - it's more robust in case you need to change
types around.

> +    elem->cell.seq_lmbs = cpu_to_be32(nr_boot_lmbs);
> +    elem->cell.base_addr = cpu_to_be64(0);
> +    elem->cell.drc_index = cpu_to_be32(0);
> +    elem->cell.aa_index = cpu_to_be32(-1);
> +    elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
> +                                   SPAPR_LMB_FLAGS_DRC_INVALID);

A helper for allocating, populating and queuing a new element might be
useful.
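
Roughly like this (an untested sketch, names are only suggestions; it
assumes the queue element type moves out to file scope, and the caller
still does the QSIMPLEQ_INSERT_TAIL and bumps nr_entries):

    static struct drconf_cell_queue *
    spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
                          uint32_t drc_index, uint32_t aa_index,
                          uint32_t flags)
    {
        struct drconf_cell_queue *elem = g_malloc0(sizeof(*elem));

        elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
        elem->cell.base_addr = cpu_to_be64(base_addr);
        elem->cell.drc_index = cpu_to_be32(drc_index);
        elem->cell.aa_index = cpu_to_be32(aa_index);
        elem->cell.flags = cpu_to_be32(flags);
        return elem;
    }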

> +    QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +    nr_entries++;
> +
> +    cur_addr = spapr->hotplug_memory.base;
> +    for (info = dimms; info; info = info->next) {
> +        PCDIMMDeviceInfo *di = info->value->u.dimm.data;
> +
> +        addr = di->addr;
> +        size = di->size;
> +        node = di->node;
> +
> +        /* Entry for hot-pluggable area */
> +        if (cur_addr < addr) {
> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
> +            g_assert(drc);
>  
> -    /*
> -     * Allocate enough buffer size to fit in ibm,dynamic-memory
> -     * or ibm,associativity-lookup-arrays
> -     */
> -    buf_len = MAX(nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1, nr_nodes * 4 + 2)
> -              * sizeof(uint32_t);
> -    cur_index = int_buf = g_malloc0(buf_len);
> +            elem = g_malloc0(sizeof(drconf_cell_queue));
> +            elem->cell.seq_lmbs = cpu_to_be32((addr - cur_addr) / lmb_size);
> +            elem->cell.base_addr = cpu_to_be64(cur_addr);
> +            elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> +            elem->cell.aa_index = cpu_to_be32(-1);
> +            elem->cell.flags = cpu_to_be32(0);
> +            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +            nr_entries++;
> +        }
>  
> -    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
> +        /* Entry for DIMM */
> +        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> +        g_assert(drc);
>  
> -    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
> -                    sizeof(prop_lmb_size));
> -    if (ret < 0) {
> -        goto out;
> +        elem = g_malloc0(sizeof(drconf_cell_queue));
> +        elem->cell.seq_lmbs = cpu_to_be32(size / lmb_size);
> +        elem->cell.base_addr = cpu_to_be64(addr);
> +        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> +        elem->cell.aa_index = cpu_to_be32(node);
> +        elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
> +        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +        nr_entries++;
> +        cur_addr = addr + size;
>      }
>  
> -    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
> -    if (ret < 0) {
> -        goto out;
> +    /* Entry for remaining hotpluggable area */
> +    if (cur_addr < mem_end) {
> +        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
> +        g_assert(drc);
> +
> +        elem = g_malloc0(sizeof(drconf_cell_queue));
> +        elem->cell.seq_lmbs = cpu_to_be32((mem_end - cur_addr) / lmb_size);
> +        elem->cell.base_addr = cpu_to_be64(cur_addr);
> +        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> +        elem->cell.aa_index = cpu_to_be32(-1);
> +        elem->cell.flags = cpu_to_be32(0);
> +        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> +        nr_entries++;
>      }
>  
> -    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
> -    if (ret < 0) {
> -        goto out;
> +    buf_len = (nr_entries * SPAPR_DRCONF_CELL_SIZE + 1) * sizeof(uint32_t);
> +    int_buf = cur_index = g_malloc0(buf_len);
> +    int_buf[0] = cpu_to_be32(nr_entries);
> +    cur_index++;
> +    QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
> +        memcpy(cur_index, &elem->cell,
> +               SPAPR_DRCONF_CELL_SIZE * sizeof(uint32_t));
> +        cur_index += SPAPR_DRCONF_CELL_SIZE;
> +        QSIMPLEQ_REMOVE(&drconf_queue, elem, drconf_cell_queue, entry);
> +        g_free(elem);
>      }
>  
> -    if (hotplug_lmb_start) {
> -        dimms = qmp_pc_dimm_device_list();
> +    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
> +    g_free(int_buf);
> +    if (ret < 0) {
> +        return -1;
>      }
> +    return 0;
> +}
>  
> -    /* ibm,dynamic-memory */
> +/* ibm,dynamic-memory */
> +static int spapr_populate_drmem_v1(sPAPRMachineState *spapr, void *fdt,
> +                                   int offset, MemoryDeviceInfoList *dimms)
> +{
> +    int i, ret;
> +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
> +    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
> +                       memory_region_size(&spapr->hotplug_memory.mr)) /
> +                       lmb_size;
> +    uint32_t *int_buf, *cur_index, buf_len;
> +
> +    /*
> +     * Allocate enough buffer size to fit in ibm,dynamic-memory
> +     */
> +    buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
> +    cur_index = int_buf = g_malloc0(buf_len);
>      int_buf[0] = cpu_to_be32(nr_lmbs);
>      cur_index++;
>      for (i = 0; i < nr_lmbs; i++) {
> @@ -765,13 +841,71 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
>  
>          cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
>      }
> -    qapi_free_MemoryDeviceInfoList(dimms);
>      ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
> +    g_free(int_buf);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +    return 0;
> +}
> +
> +/*
> + * Adds ibm,dynamic-reconfiguration-memory node.
> + * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
> + * of this device tree node.
> + */
> +static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> +{
> +    MachineState *machine = MACHINE(spapr);
> +    int ret, i, offset;
> +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
> +    uint32_t *int_buf, *cur_index, buf_len;
> +    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
> +    MemoryDeviceInfoList *dimms = NULL;
> +
> +    /*
> +     * Don't create the node if there is no hotpluggable memory
> +     */
> +    if (machine->ram_size == machine->maxram_size) {
> +        return 0;
> +    }
> +
> +    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
> +
> +    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
> +                    sizeof(prop_lmb_size));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
> +    dimms = qmp_pc_dimm_device_list();
> +    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
> +        ret = spapr_populate_drmem_v2(spapr, fdt, offset, dimms);
> +    } else {
> +        ret = spapr_populate_drmem_v1(spapr, fdt, offset, dimms);
> +    }
> +    qapi_free_MemoryDeviceInfoList(dimms);
> +
>      if (ret < 0) {
>          goto out;
>      }
>  
>      /* ibm,associativity-lookup-arrays */
> +    buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t);
> +    cur_index = int_buf = g_malloc0(buf_len);
> +
>      cur_index = int_buf;
>      int_buf[0] = cpu_to_be32(nr_nodes);
>      int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */
> @@ -788,8 +922,9 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
>      }
>      ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
>              (cur_index - int_buf) * sizeof(uint32_t));
> -out:
>      g_free(int_buf);
> +
> +out:

No point to this label if all you do after is a plain return - you can
just use return directly instead of gotos.
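
i.e. the earlier error paths can just become, e.g.:

    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
    if (ret < 0) {
        return ret;
    }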

>      return ret;
>  }
>  
> @@ -2500,6 +2635,11 @@ static void spapr_machine_init(MachineState *machine)
>          spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
>      }
>  
> +    /* advertise support for ibm,dynamic-memory-v2 */
> +    if (spapr->use_ibm_dynamic_memory_v2) {
> +        spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
> +    }
> +
>      /* init CPUs */
>      spapr_init_cpus(spapr);
>  
> @@ -2907,12 +3047,27 @@ static void spapr_set_vsmt(Object *obj, Visitor *v, const char *name,
>      visit_type_uint32(v, name, (uint32_t *)opaque, errp);
>  }
>  
> +static bool spapr_get_drmem_v2(Object *obj, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    return spapr->use_ibm_dynamic_memory_v2;
> +}
> +
> +static void spapr_set_drmem_v2(Object *obj, bool value, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    spapr->use_ibm_dynamic_memory_v2 = value;
> +}
> +
>  static void spapr_instance_init(Object *obj)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>  
>      spapr->htab_fd = -1;
>      spapr->use_hotplug_event_source = true;
> +    spapr->use_ibm_dynamic_memory_v2 = true;
>      object_property_add_str(obj, "kvm-type",
>                              spapr_get_kvm_type, spapr_set_kvm_type, NULL);
>      object_property_set_description(obj, "kvm-type",
> @@ -2927,6 +3082,15 @@ static void spapr_instance_init(Object *obj)
>                                      " place of standard EPOW events when possible"
>                                      " (required for memory hot-unplug support)",
>                                      NULL);
> +    object_property_add_bool(obj, "drmem-v2",
> +                             spapr_get_drmem_v2,
> +                             spapr_set_drmem_v2,
> +                             NULL);
> +    object_property_set_description(obj, "ibm-dynamic-memory-v2",
> +                                    "Use ibm-dynamic-memory-v2 representation"
> +                                    " in place of ibm-dynamic-memory when"
> +                                    " possible",
> +                                    NULL);

I don't really see any point to making this a user configurable
option.  Why not just always enable it if the guest says it can
support it.
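
i.e. the guard could presumably go away and spapr_machine_init() would
always do

    spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);

leaving it to CAS to decide whether the guest actually gets the v2
property.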

>      ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
>                              "Maximum permitted CPU compatibility mode",
> @@ -4042,7 +4206,10 @@ DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);
>  
>  static void spapr_machine_2_11_instance_options(MachineState *machine)
>  {
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
> +
>      spapr_machine_2_12_instance_options(machine);
> +    spapr->use_ibm_dynamic_memory_v2 = false;
>  }
>  
>  static void spapr_machine_2_11_class_options(MachineClass *mc)
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index d60b7c6d7a..5e044c44af 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -149,6 +149,7 @@ struct sPAPRMachineState {
>      sPAPROptionVector *ov5;         /* QEMU-supported option vectors */
>      sPAPROptionVector *ov5_cas;     /* negotiated (via CAS) option vectors */
>      uint32_t max_compat_pvr;
> +    bool use_ibm_dynamic_memory_v2;
>  
>      /* Migration state */
>      int htab_save_index;
> diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
> index bf25e5d954..0f2d8d715d 100644
> --- a/include/hw/ppc/spapr_ovec.h
> +++ b/include/hw/ppc/spapr_ovec.h
> @@ -51,6 +51,7 @@ typedef struct sPAPROptionVector sPAPROptionVector;
>  #define OV5_FORM1_AFFINITY      OV_BIT(5, 0)
>  #define OV5_HP_EVT              OV_BIT(6, 5)
>  #define OV5_HPT_RESIZE          OV_BIT(6, 7)
> +#define OV5_DRMEM_V2            OV_BIT(22, 0)
>  #define OV5_XIVE_BOTH           OV_BIT(23, 0)
>  #define OV5_XIVE_EXPLOIT        OV_BIT(23, 1) /* 1=exploitation 0=legacy */
>
Bharata B Rao April 11, 2018, 5:01 a.m. UTC | #7
On Wed, Apr 11, 2018 at 02:45:58PM +1000, David Gibson wrote:
> On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
> > The new property ibm,dynamic-memory-v2 allows memory to be represented
> > in a more compact manner in device tree.
> > 
> > Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> > ---
> > v1: https://lists.nongnu.org/archive/html/qemu-devel/2018-03/msg01788.html
> > Changes in v1:
> > - Minor cleanups in the error paths
> > - Rebased on top of ppc-for-2.13
> > 
> >  docs/specs/ppc-spapr-hotplug.txt |  19 +++
> >  hw/ppc/spapr.c                   | 257 ++++++++++++++++++++++++++++++++-------
> >  include/hw/ppc/spapr.h           |   1 +
> >  include/hw/ppc/spapr_ovec.h      |   1 +
> >  4 files changed, 233 insertions(+), 45 deletions(-)
> 
> 
> 
> > diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt
> > index f57e2a09c6..cc7833108e 100644
> > --- a/docs/specs/ppc-spapr-hotplug.txt
> > +++ b/docs/specs/ppc-spapr-hotplug.txt
> > @@ -387,4 +387,23 @@ Each LMB list entry consists of the following elements:
> >  - A 32bit flags word. The bit at bit position 0x00000008 defines whether
> >    the LMB is assigned to the the partition as of boot time.
> >  
> > +ibm,dynamic-memory-v2
> > +
> > +This property describes the dynamically reconfigurable memory. This is
> > +an alternate and newer way to describe dynamically reconfigurable memory.
> > +It is a property encoded array that has an integer N (the number of
> > +LMB set entries) followed by N LMB set entries. There is an LMB set entry
> > +for each sequential group of LMBs that share common attributes.
> > +
> > +Each LMB set entry consists of the following elements:
> > +
> > +- Number of sequential LMBs in the entry represented by a 32bit integer.
> > +- Logical address of the first LMB in the set encoded as a 64bit integer.
> > +- DRC index of the first LMB in the set.
> > +- Associativity list index that is used as an index into
> > +  ibm,associativity-lookup-arrays property described earlier. This
> > +  is used to retrieve the right associativity list to be used for all
> > +  the LMBs in this set.
> > +- A 32bit flags word that applies to all the LMBs in the set.
> > +
> >  [1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 3ffadd6ac7..4a24fac38c 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -669,63 +669,139 @@ static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
> >      return -1;
> >  }
> >  
> > -/*
> > - * Adds ibm,dynamic-reconfiguration-memory node.
> > - * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
> > - * of this device tree node.
> > - */
> > -static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> > +struct of_drconf_cell_v2 {
> 
> qemu convention is to use CamelCase for types.

Ok.

> 
> > +     uint32_t seq_lmbs;
> > +     uint64_t base_addr;
> > +     uint32_t drc_index;
> > +     uint32_t aa_index;
> > +     uint32_t flags;
> > +} __attribute__((packed));
> > +
> > +#define SPAPR_DRCONF_CELL_SIZE 6
> 
> Define this using a sizeof() for safety.

Ok.

> 
> > +/* ibm,dynamic-memory-v2 */
> > +static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> > +                                   int offset, MemoryDeviceInfoList *dimms)
> >  {
> > -    MachineState *machine = MACHINE(spapr);
> > -    int ret, i, offset;
> > -    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> > -    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
> > -    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
> > -    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
> > -                       memory_region_size(&spapr->hotplug_memory.mr)) /
> > -                       lmb_size;
> >      uint32_t *int_buf, *cur_index, buf_len;
> > -    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
> > -    MemoryDeviceInfoList *dimms = NULL;
> > +    int ret;
> > +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> > +    uint64_t addr, cur_addr, size;
> > +    uint32_t nr_boot_lmbs = (spapr->hotplug_memory.base / lmb_size);
> > +    uint64_t mem_end = spapr->hotplug_memory.base +
> > +                       memory_region_size(&spapr->hotplug_memory.mr);
> > +    uint32_t node, nr_entries = 0;
> > +    sPAPRDRConnector *drc;
> > +    typedef struct drconf_cell_queue {
> > +        struct of_drconf_cell_v2 cell;
> > +        QSIMPLEQ_ENTRY(drconf_cell_queue) entry;
> > +    } drconf_cell_queue;
> 
> Likewise CamelCase here.

Ok.

> 
> > +    QSIMPLEQ_HEAD(, drconf_cell_queue) drconf_queue
> > +        = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
> > +    drconf_cell_queue *elem, *next;
> > +    MemoryDeviceInfoList *info;
> >  
> > -    /*
> > -     * Don't create the node if there is no hotpluggable memory
> > -     */
> > -    if (machine->ram_size == machine->maxram_size) {
> > -        return 0;
> > -    }
> > +    /* Entry to cover RAM and the gap area */
> > +    elem = g_malloc0(sizeof(drconf_cell_queue));
> 
> Please use sizeof(*elem) - it's more robust in case you need to change
> types around.

Ok.

> 
> > +    elem->cell.seq_lmbs = cpu_to_be32(nr_boot_lmbs);
> > +    elem->cell.base_addr = cpu_to_be64(0);
> > +    elem->cell.drc_index = cpu_to_be32(0);
> > +    elem->cell.aa_index = cpu_to_be32(-1);
> > +    elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
> > +                                   SPAPR_LMB_FLAGS_DRC_INVALID);
> 
> A helper for allocating, populating and queuing a new element might be
> useful.

I almost did that, but it required the list head to be global which I
didn't like. Let me give it another attempt.

> 
> > +    QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> > +    nr_entries++;
> > +
> > +    cur_addr = spapr->hotplug_memory.base;
> > +    for (info = dimms; info; info = info->next) {
> > +        PCDIMMDeviceInfo *di = info->value->u.dimm.data;
> > +
> > +        addr = di->addr;
> > +        size = di->size;
> > +        node = di->node;
> > +
> > +        /* Entry for hot-pluggable area */
> > +        if (cur_addr < addr) {
> > +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
> > +            g_assert(drc);
> >  
> > -    /*
> > -     * Allocate enough buffer size to fit in ibm,dynamic-memory
> > -     * or ibm,associativity-lookup-arrays
> > -     */
> > -    buf_len = MAX(nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1, nr_nodes * 4 + 2)
> > -              * sizeof(uint32_t);
> > -    cur_index = int_buf = g_malloc0(buf_len);
> > +            elem = g_malloc0(sizeof(drconf_cell_queue));
> > +            elem->cell.seq_lmbs = cpu_to_be32((addr - cur_addr) / lmb_size);
> > +            elem->cell.base_addr = cpu_to_be64(cur_addr);
> > +            elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> > +            elem->cell.aa_index = cpu_to_be32(-1);
> > +            elem->cell.flags = cpu_to_be32(0);
> > +            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> > +            nr_entries++;
> > +        }
> >  
> > -    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
> > +        /* Entry for DIMM */
> > +        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> > +        g_assert(drc);
> >  
> > -    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
> > -                    sizeof(prop_lmb_size));
> > -    if (ret < 0) {
> > -        goto out;
> > +        elem = g_malloc0(sizeof(drconf_cell_queue));
> > +        elem->cell.seq_lmbs = cpu_to_be32(size / lmb_size);
> > +        elem->cell.base_addr = cpu_to_be64(addr);
> > +        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> > +        elem->cell.aa_index = cpu_to_be32(node);
> > +        elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
> > +        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> > +        nr_entries++;
> > +        cur_addr = addr + size;
> >      }
> >  
> > -    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
> > -    if (ret < 0) {
> > -        goto out;
> > +    /* Entry for remaining hotpluggable area */
> > +    if (cur_addr < mem_end) {
> > +        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
> > +        g_assert(drc);
> > +
> > +        elem = g_malloc0(sizeof(drconf_cell_queue));
> > +        elem->cell.seq_lmbs = cpu_to_be32((mem_end - cur_addr) / lmb_size);
> > +        elem->cell.base_addr = cpu_to_be64(cur_addr);
> > +        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
> > +        elem->cell.aa_index = cpu_to_be32(-1);
> > +        elem->cell.flags = cpu_to_be32(0);
> > +        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> > +        nr_entries++;
> >      }
> >  
> > -    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
> > -    if (ret < 0) {
> > -        goto out;
> > +    buf_len = (nr_entries * SPAPR_DRCONF_CELL_SIZE + 1) * sizeof(uint32_t);
> > +    int_buf = cur_index = g_malloc0(buf_len);
> > +    int_buf[0] = cpu_to_be32(nr_entries);
> > +    cur_index++;
> > +    QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
> > +        memcpy(cur_index, &elem->cell,
> > +               SPAPR_DRCONF_CELL_SIZE * sizeof(uint32_t));
> > +        cur_index += SPAPR_DRCONF_CELL_SIZE;
> > +        QSIMPLEQ_REMOVE(&drconf_queue, elem, drconf_cell_queue, entry);
> > +        g_free(elem);
> >      }
> >  
> > -    if (hotplug_lmb_start) {
> > -        dimms = qmp_pc_dimm_device_list();
> > +    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
> > +    g_free(int_buf);
> > +    if (ret < 0) {
> > +        return -1;
> >      }
> > +    return 0;
> > +}
> >  
> > -    /* ibm,dynamic-memory */
> > +/* ibm,dynamic-memory */
> > +static int spapr_populate_drmem_v1(sPAPRMachineState *spapr, void *fdt,
> > +                                   int offset, MemoryDeviceInfoList *dimms)
> > +{
> > +    int i, ret;
> > +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> > +    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
> > +    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
> > +                       memory_region_size(&spapr->hotplug_memory.mr)) /
> > +                       lmb_size;
> > +    uint32_t *int_buf, *cur_index, buf_len;
> > +
> > +    /*
> > +     * Allocate enough buffer size to fit in ibm,dynamic-memory
> > +     */
> > +    buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
> > +    cur_index = int_buf = g_malloc0(buf_len);
> >      int_buf[0] = cpu_to_be32(nr_lmbs);
> >      cur_index++;
> >      for (i = 0; i < nr_lmbs; i++) {
> > @@ -765,13 +841,71 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> >  
> >          cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
> >      }
> > -    qapi_free_MemoryDeviceInfoList(dimms);
> >      ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
> > +    g_free(int_buf);
> > +    if (ret < 0) {
> > +        return -1;
> > +    }
> > +    return 0;
> > +}
> > +
> > +/*
> > + * Adds ibm,dynamic-reconfiguration-memory node.
> > + * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
> > + * of this device tree node.
> > + */
> > +static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> > +{
> > +    MachineState *machine = MACHINE(spapr);
> > +    int ret, i, offset;
> > +    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> > +    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
> > +    uint32_t *int_buf, *cur_index, buf_len;
> > +    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
> > +    MemoryDeviceInfoList *dimms = NULL;
> > +
> > +    /*
> > +     * Don't create the node if there is no hotpluggable memory
> > +     */
> > +    if (machine->ram_size == machine->maxram_size) {
> > +        return 0;
> > +    }
> > +
> > +    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
> > +
> > +    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
> > +                    sizeof(prop_lmb_size));
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
> > +    dimms = qmp_pc_dimm_device_list();
> > +    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
> > +        ret = spapr_populate_drmem_v2(spapr, fdt, offset, dimms);
> > +    } else {
> > +        ret = spapr_populate_drmem_v1(spapr, fdt, offset, dimms);
> > +    }
> > +    qapi_free_MemoryDeviceInfoList(dimms);
> > +
> >      if (ret < 0) {
> >          goto out;
> >      }
> >  
> >      /* ibm,associativity-lookup-arrays */
> > +    buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t);
> > +    cur_index = int_buf = g_malloc0(buf_len);
> > +
> >      cur_index = int_buf;
> >      int_buf[0] = cpu_to_be32(nr_nodes);
> >      int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */
> > @@ -788,8 +922,9 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
> >      }
> >      ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
> >              (cur_index - int_buf) * sizeof(uint32_t));
> > -out:
> >      g_free(int_buf);
> > +
> > +out:
> 
> No point to this label if all you do after is a plain return - you can
> just use return directly instead of gotos.

Yes, I cleaned up the error path, which made this label pointless, but
missed removing it.

> 
> >      return ret;
> >  }
> >  
> > @@ -2500,6 +2635,11 @@ static void spapr_machine_init(MachineState *machine)
> >          spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
> >      }
> >  
> > +    /* advertise support for ibm,dynamic-memory-v2 */
> > +    if (spapr->use_ibm_dynamic_memory_v2) {
> > +        spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
> > +    }
> > +
> >      /* init CPUs */
> >      spapr_init_cpus(spapr);
> >  
> > @@ -2907,12 +3047,27 @@ static void spapr_set_vsmt(Object *obj, Visitor *v, const char *name,
> >      visit_type_uint32(v, name, (uint32_t *)opaque, errp);
> >  }
> >  
> > +static bool spapr_get_drmem_v2(Object *obj, Error **errp)
> > +{
> > +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> > +
> > +    return spapr->use_ibm_dynamic_memory_v2;
> > +}
> > +
> > +static void spapr_set_drmem_v2(Object *obj, bool value, Error **errp)
> > +{
> > +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> > +
> > +    spapr->use_ibm_dynamic_memory_v2 = value;
> > +}
> > +
> >  static void spapr_instance_init(Object *obj)
> >  {
> >      sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> >  
> >      spapr->htab_fd = -1;
> >      spapr->use_hotplug_event_source = true;
> > +    spapr->use_ibm_dynamic_memory_v2 = true;
> >      object_property_add_str(obj, "kvm-type",
> >                              spapr_get_kvm_type, spapr_set_kvm_type, NULL);
> >      object_property_set_description(obj, "kvm-type",
> > @@ -2927,6 +3082,15 @@ static void spapr_instance_init(Object *obj)
> >                                      " place of standard EPOW events when possible"
> >                                      " (required for memory hot-unplug support)",
> >                                      NULL);
> > +    object_property_add_bool(obj, "drmem-v2",
> > +                             spapr_get_drmem_v2,
> > +                             spapr_set_drmem_v2,
> > +                             NULL);
> > +    object_property_set_description(obj, "ibm-dynamic-memory-v2",
> > +                                    "Use ibm-dynamic-memory-v2 representation"
> > +                                    " in place of ibm-dynamic-memory when"
> > +                                    " possible",
> > +                                    NULL);
> 
> I don't really see any point to making this a user configurable
> option.  Why not just always enable it if the guest says it can
> support it.

Makes sense. While thinking about it, I wonder why other properties like
modern-hotplug-events remain user configurable.

Thanks for the review, will post next version with your suggested changes.

Regards,
Bharata.
David Gibson April 11, 2018, 5:07 a.m. UTC | #8
On Wed, Apr 11, 2018 at 10:31:28AM +0530, Bharata B Rao wrote:
> On Wed, Apr 11, 2018 at 02:45:58PM +1000, David Gibson wrote:
> > On Mon, Apr 09, 2018 at 11:55:38AM +0530, Bharata B Rao wrote:
[snip]
> > > @@ -2927,6 +3082,15 @@ static void spapr_instance_init(Object *obj)
> > >                                      " place of standard EPOW events when possible"
> > >                                      " (required for memory hot-unplug support)",
> > >                                      NULL);
> > > +    object_property_add_bool(obj, "drmem-v2",
> > > +                             spapr_get_drmem_v2,
> > > +                             spapr_set_drmem_v2,
> > > +                             NULL);
> > > +    object_property_set_description(obj, "ibm-dynamic-memory-v2",
> > > +                                    "Use ibm-dynamic-memory-v2 representation"
> > > +                                    " in place of ibm-dynamic-memory when"
> > > +                                    " possible",
> > > +                                    NULL);
> > 
> > I don't really see any point to making this a user configurable
> > option.  Why not just always enable it if the guest says it can
> > support it.
> 
> Makes sense. While thinking about it, I wonder why other properties like
> modern-hotplug-events remain user configurable.

So, modern-hotplug-events affects the runtime behaviour of the events
subsystem.  That means we have to keep that behaviour consistent
across migration, which means specifically we have to keep the old
behaviour for older machine types.  Generally the easiest way to do
that is to have a compatibility property whose defaults get set by the
machine type.

dynamic-memory-v2 doesn't affect runtime behaviour, only how stuff is
advertised in the device tree, so there's no need to maintain the old
behaviour even for old machine types: a guest started on an old
version will still have the v1 info from when it booted, and that info
will still work after migrating to a new qemu.  Likewise a guest
started on a new version will still have the v2 info and that will
still work on the DRC objects if migrated to an old version; if it
reboots then it will get v1 info, which is fine.

So, for this case we don't need the backwards compat property.
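
To make the distinction concrete, here is a small self-contained model
(plain C, nothing from the QEMU tree; every name here is invented) of the
two kinds of flag: a runtime-behaviour flag that an older machine type has
to flip back to its old default, versus a device-tree-only flag that can
stay enabled because the guest renegotiates it at every boot anyway.

#include <stdbool.h>
#include <stdio.h>

typedef struct MachineOpts {
    bool modern_hotplug_events;  /* runtime behaviour: needs a compat default */
    bool use_drmem_v2;           /* device-tree-only: renegotiated at boot */
} MachineOpts;

static void machine_latest_instance_options(MachineOpts *m)
{
    m->modern_hotplug_events = true;
    m->use_drmem_v2 = true;
}

static void machine_old_instance_options(MachineOpts *m)
{
    machine_latest_instance_options(m);
    /* The old machine type must preserve the old runtime behaviour... */
    m->modern_hotplug_events = false;
    /*
     * ...but clearing use_drmem_v2 would buy nothing: the guest asks for
     * v1 or v2 via CAS on every (re)boot, and migration carries whatever
     * device tree the guest already has.
     */
}

int main(void)
{
    MachineOpts m;

    machine_old_instance_options(&m);
    printf("modern-hotplug-events=%d, drmem-v2=%d\n",
           m.modern_hotplug_events, m.use_drmem_v2);
    return 0;
}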

> Thanks for the review, will post next version with your suggested changes.
> 
> Regards,
> Bharata.
> 
>
diff mbox

Patch

diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt
index f57e2a09c6..cc7833108e 100644
--- a/docs/specs/ppc-spapr-hotplug.txt
+++ b/docs/specs/ppc-spapr-hotplug.txt
@@ -387,4 +387,23 @@  Each LMB list entry consists of the following elements:
 - A 32bit flags word. The bit at bit position 0x00000008 defines whether
   the LMB is assigned to the the partition as of boot time.
 
+ibm,dynamic-memory-v2
+
+This property describes the dynamically reconfigurable memory. This is
+an alternate and newer way to describe dynamically reconfigurable memory.
+It is a property encoded array that has an integer N (the number of
+LMB set entries) followed by N LMB set entries. There is an LMB set entry
+for each sequential group of LMBs that share common attributes.
+
+Each LMB set entry consists of the following elements:
+
+- Number of sequential LMBs in the entry represented by a 32bit integer.
+- Logical address of the first LMB in the set encoded as a 64bit integer.
+- DRC index of the first LMB in the set.
+- Associativity list index that is used as an index into the
+  ibm,associativity-lookup-arrays property described earlier. This
+  is used to retrieve the right associativity list to be used for all
+  the LMBs in this set.
+- A 32bit flags word that applies to all the LMBs in the set.
+
 [1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3ffadd6ac7..4a24fac38c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -669,63 +669,139 @@  static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
     return -1;
 }
 
-/*
- * Adds ibm,dynamic-reconfiguration-memory node.
- * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
- * of this device tree node.
- */
-static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
+struct of_drconf_cell_v2 {
+     uint32_t seq_lmbs;
+     uint64_t base_addr;
+     uint32_t drc_index;
+     uint32_t aa_index;
+     uint32_t flags;
+} __attribute__((packed));
+
+#define SPAPR_DRCONF_CELL_SIZE 6    /* 32-bit cells per LMB set entry */
+
+/* ibm,dynamic-memory-v2 */
+static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
+                                   int offset, MemoryDeviceInfoList *dimms)
 {
-    MachineState *machine = MACHINE(spapr);
-    int ret, i, offset;
-    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
-    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
-    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
-    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
-                       memory_region_size(&spapr->hotplug_memory.mr)) /
-                       lmb_size;
     uint32_t *int_buf, *cur_index, buf_len;
-    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
-    MemoryDeviceInfoList *dimms = NULL;
+    int ret;
+    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
+    uint64_t addr, cur_addr, size;
+    uint32_t nr_boot_lmbs = (spapr->hotplug_memory.base / lmb_size);
+    uint64_t mem_end = spapr->hotplug_memory.base +
+                       memory_region_size(&spapr->hotplug_memory.mr);
+    uint32_t node, nr_entries = 0;
+    sPAPRDRConnector *drc;
+    typedef struct drconf_cell_queue {
+        struct of_drconf_cell_v2 cell;
+        QSIMPLEQ_ENTRY(drconf_cell_queue) entry;
+    } drconf_cell_queue;
+    QSIMPLEQ_HEAD(, drconf_cell_queue) drconf_queue
+        = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
+    drconf_cell_queue *elem, *next;
+    MemoryDeviceInfoList *info;
 
-    /*
-     * Don't create the node if there is no hotpluggable memory
-     */
-    if (machine->ram_size == machine->maxram_size) {
-        return 0;
-    }
+    /* Entry to cover RAM and the gap area */
+    elem = g_malloc0(sizeof(drconf_cell_queue));
+    elem->cell.seq_lmbs = cpu_to_be32(nr_boot_lmbs);
+    elem->cell.base_addr = cpu_to_be64(0);
+    elem->cell.drc_index = cpu_to_be32(0);
+    elem->cell.aa_index = cpu_to_be32(-1);
+    elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
+                                   SPAPR_LMB_FLAGS_DRC_INVALID);
+    QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
+    nr_entries++;
+
+    cur_addr = spapr->hotplug_memory.base;
+    for (info = dimms; info; info = info->next) {
+        PCDIMMDeviceInfo *di = info->value->u.dimm.data;
+
+        addr = di->addr;
+        size = di->size;
+        node = di->node;
+
+        /* Entry for hot-pluggable area */
+        if (cur_addr < addr) {
+            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
+            g_assert(drc);
 
-    /*
-     * Allocate enough buffer size to fit in ibm,dynamic-memory
-     * or ibm,associativity-lookup-arrays
-     */
-    buf_len = MAX(nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1, nr_nodes * 4 + 2)
-              * sizeof(uint32_t);
-    cur_index = int_buf = g_malloc0(buf_len);
+            elem = g_malloc0(sizeof(drconf_cell_queue));
+            elem->cell.seq_lmbs = cpu_to_be32((addr - cur_addr) / lmb_size);
+            elem->cell.base_addr = cpu_to_be64(cur_addr);
+            elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
+            elem->cell.aa_index = cpu_to_be32(-1);
+            elem->cell.flags = cpu_to_be32(0);
+            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
+            nr_entries++;
+        }
 
-    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
+        /* Entry for DIMM */
+        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
+        g_assert(drc);
 
-    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
-                    sizeof(prop_lmb_size));
-    if (ret < 0) {
-        goto out;
+        elem = g_malloc0(sizeof(drconf_cell_queue));
+        elem->cell.seq_lmbs = cpu_to_be32(size / lmb_size);
+        elem->cell.base_addr = cpu_to_be64(addr);
+        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
+        elem->cell.aa_index = cpu_to_be32(node);
+        elem->cell.flags = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
+        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
+        nr_entries++;
+        cur_addr = addr + size;
     }
 
-    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
-    if (ret < 0) {
-        goto out;
+    /* Entry for remaining hotpluggable area */
+    if (cur_addr < mem_end) {
+        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
+        g_assert(drc);
+
+        elem = g_malloc0(sizeof(drconf_cell_queue));
+        elem->cell.seq_lmbs = cpu_to_be32((mem_end - cur_addr) / lmb_size);
+        elem->cell.base_addr = cpu_to_be64(cur_addr);
+        elem->cell.drc_index = cpu_to_be32(spapr_drc_index(drc));
+        elem->cell.aa_index = cpu_to_be32(-1);
+        elem->cell.flags = cpu_to_be32(0);
+        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
+        nr_entries++;
     }
 
-    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
-    if (ret < 0) {
-        goto out;
+    buf_len = (nr_entries * SPAPR_DRCONF_CELL_SIZE + 1) * sizeof(uint32_t);
+    int_buf = cur_index = g_malloc0(buf_len);
+    int_buf[0] = cpu_to_be32(nr_entries);
+    cur_index++;
+    QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
+        memcpy(cur_index, &elem->cell,
+               SPAPR_DRCONF_CELL_SIZE * sizeof(uint32_t));
+        cur_index += SPAPR_DRCONF_CELL_SIZE;
+        QSIMPLEQ_REMOVE(&drconf_queue, elem, drconf_cell_queue, entry);
+        g_free(elem);
     }
 
-    if (hotplug_lmb_start) {
-        dimms = qmp_pc_dimm_device_list();
+    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
+    g_free(int_buf);
+    if (ret < 0) {
+        return -1;
     }
+    return 0;
+}
 
-    /* ibm,dynamic-memory */
+/* ibm,dynamic-memory */
+static int spapr_populate_drmem_v1(sPAPRMachineState *spapr, void *fdt,
+                                   int offset, MemoryDeviceInfoList *dimms)
+{
+    int i, ret;
+    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
+    uint32_t hotplug_lmb_start = spapr->hotplug_memory.base / lmb_size;
+    uint32_t nr_lmbs = (spapr->hotplug_memory.base +
+                       memory_region_size(&spapr->hotplug_memory.mr)) /
+                       lmb_size;
+    uint32_t *int_buf, *cur_index, buf_len;
+
+    /*
+     * Allocate enough buffer size to fit in ibm,dynamic-memory
+     */
+    buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
+    cur_index = int_buf = g_malloc0(buf_len);
     int_buf[0] = cpu_to_be32(nr_lmbs);
     cur_index++;
     for (i = 0; i < nr_lmbs; i++) {
@@ -765,13 +841,71 @@  static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
 
         cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
     }
-    qapi_free_MemoryDeviceInfoList(dimms);
     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
+    g_free(int_buf);
+    if (ret < 0) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Adds ibm,dynamic-reconfiguration-memory node.
+ * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
+ * of this device tree node.
+ */
+static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
+{
+    MachineState *machine = MACHINE(spapr);
+    int ret, i, offset;
+    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
+    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
+    uint32_t *int_buf, *cur_index, buf_len;
+    int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
+    MemoryDeviceInfoList *dimms = NULL;
+
+    /*
+     * Don't create the node if there is no hotpluggable memory
+     */
+    if (machine->ram_size == machine->maxram_size) {
+        return 0;
+    }
+
+    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
+
+    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
+                    sizeof(prop_lmb_size));
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
+    dimms = qmp_pc_dimm_device_list();
+    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
+        ret = spapr_populate_drmem_v2(spapr, fdt, offset, dimms);
+    } else {
+        ret = spapr_populate_drmem_v1(spapr, fdt, offset, dimms);
+    }
+    qapi_free_MemoryDeviceInfoList(dimms);
+
     if (ret < 0) {
         goto out;
     }
 
     /* ibm,associativity-lookup-arrays */
+    buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t);
+    cur_index = int_buf = g_malloc0(buf_len);
+
     cur_index = int_buf;
     int_buf[0] = cpu_to_be32(nr_nodes);
     int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */
@@ -788,8 +922,9 @@  static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
     }
     ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
             (cur_index - int_buf) * sizeof(uint32_t));
-out:
     g_free(int_buf);
+
+out:
     return ret;
 }
 
@@ -2500,6 +2635,11 @@  static void spapr_machine_init(MachineState *machine)
         spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
     }
 
+    /* advertise support for ibm,dynamic-memory-v2 */
+    if (spapr->use_ibm_dynamic_memory_v2) {
+        spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
+    }
+
     /* init CPUs */
     spapr_init_cpus(spapr);
 
@@ -2907,12 +3047,27 @@  static void spapr_set_vsmt(Object *obj, Visitor *v, const char *name,
     visit_type_uint32(v, name, (uint32_t *)opaque, errp);
 }
 
+static bool spapr_get_drmem_v2(Object *obj, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+    return spapr->use_ibm_dynamic_memory_v2;
+}
+
+static void spapr_set_drmem_v2(Object *obj, bool value, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+    spapr->use_ibm_dynamic_memory_v2 = value;
+}
+
 static void spapr_instance_init(Object *obj)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
 
     spapr->htab_fd = -1;
     spapr->use_hotplug_event_source = true;
+    spapr->use_ibm_dynamic_memory_v2 = true;
     object_property_add_str(obj, "kvm-type",
                             spapr_get_kvm_type, spapr_set_kvm_type, NULL);
     object_property_set_description(obj, "kvm-type",
@@ -2927,6 +3082,15 @@  static void spapr_instance_init(Object *obj)
                                     " place of standard EPOW events when possible"
                                     " (required for memory hot-unplug support)",
                                     NULL);
+    object_property_add_bool(obj, "drmem-v2",
+                             spapr_get_drmem_v2,
+                             spapr_set_drmem_v2,
+                             NULL);
+    object_property_set_description(obj, "drmem-v2",
+                                    "Use ibm-dynamic-memory-v2 representation"
+                                    " in place of ibm-dynamic-memory when"
+                                    " possible",
+                                    NULL);
 
     ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
                             "Maximum permitted CPU compatibility mode",
@@ -4042,7 +4206,10 @@  DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);
 
 static void spapr_machine_2_11_instance_options(MachineState *machine)
 {
+    sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
+
     spapr_machine_2_12_instance_options(machine);
+    spapr->use_ibm_dynamic_memory_v2 = false;
 }
 
 static void spapr_machine_2_11_class_options(MachineClass *mc)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index d60b7c6d7a..5e044c44af 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -149,6 +149,7 @@  struct sPAPRMachineState {
     sPAPROptionVector *ov5;         /* QEMU-supported option vectors */
     sPAPROptionVector *ov5_cas;     /* negotiated (via CAS) option vectors */
     uint32_t max_compat_pvr;
+    bool use_ibm_dynamic_memory_v2;
 
     /* Migration state */
     int htab_save_index;
diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
index bf25e5d954..0f2d8d715d 100644
--- a/include/hw/ppc/spapr_ovec.h
+++ b/include/hw/ppc/spapr_ovec.h
@@ -51,6 +51,7 @@  typedef struct sPAPROptionVector sPAPROptionVector;
 #define OV5_FORM1_AFFINITY      OV_BIT(5, 0)
 #define OV5_HP_EVT              OV_BIT(6, 5)
 #define OV5_HPT_RESIZE          OV_BIT(6, 7)
+#define OV5_DRMEM_V2            OV_BIT(22, 0)
 #define OV5_XIVE_BOTH           OV_BIT(23, 0)
 #define OV5_XIVE_EXPLOIT        OV_BIT(23, 1) /* 1=exploitation 0=legacy */
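
For reference, a small self-contained sketch (not part of the patch; the
helper names and sample values are invented) of how a consumer could walk
the ibm,dynamic-memory-v2 property encoded above: a 32-bit entry count
followed by packed six-cell entries, all cells big-endian, with the 64-bit
base address split across two cells.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>          /* htonl()/ntohl() for big-endian cells */

#define DRCONF_V2_CELLS 6       /* cells per LMB set entry */

static void walk_drmem_v2(const uint32_t *prop)
{
    uint32_t nr_entries = ntohl(prop[0]);
    const uint32_t *cell = prop + 1;
    uint32_t i;

    for (i = 0; i < nr_entries; i++, cell += DRCONF_V2_CELLS) {
        uint32_t seq_lmbs  = ntohl(cell[0]);
        uint64_t base_addr = ((uint64_t)ntohl(cell[1]) << 32) | ntohl(cell[2]);
        uint32_t drc_index = ntohl(cell[3]);
        int32_t  aa_index  = (int32_t)ntohl(cell[4]);
        uint32_t flags     = ntohl(cell[5]);

        printf("%u LMBs @ 0x%016llx drc=0x%08x aa=%d flags=0x%x\n",
               seq_lmbs, (unsigned long long)base_addr, drc_index,
               (int)aa_index, flags);
    }
}

int main(void)
{
    /* One made-up entry: 4 LMBs at 1 GiB, DRC index 0x80000000, node 0,
     * flags bit 0x8 (assigned), as described in the documentation above. */
    uint32_t prop[1 + DRCONF_V2_CELLS] = {
        htonl(1),
        htonl(4), htonl(0), htonl(0x40000000), htonl(0x80000000),
        htonl(0), htonl(0x8),
    };

    walk_drmem_v2(prop);
    return 0;
}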