[RFC,v2,09/21] pc: Add dimm paravirt SRAT info

Message ID 1342002726-18258-10-git-send-email-vasilis.liaskovitis@profitbricks.com (mailing list archive)
State New, archived

Commit Message

Vasilis Liaskovitis July 11, 2012, 10:31 a.m. UTC
The numa_fw_cfg paravirt interface is extended to include SRAT information for
all hotpluggable DIMMs. There are three words for each hotpluggable memory
slot, denoting start address, size, and node proximity. The new information is
appended after the existing NUMA information, so that the existing fw_cfg
layout does not break. SeaBIOS uses this information to build hotplug memory
device objects at runtime. nb_numa_nodes now defaults to 1 (not 0), so that
SRAT information is always passed to SeaBIOS.
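
As an editor's illustration (not part of the patch), the sketch below shows how
a firmware-side consumer could walk the extended FW_CFG_NUMA blob. All entries
are assumed to be little-endian 64-bit words (see the review note below), and
le64_to_host() is a hypothetical byte-swap helper:

    #include <stdint.h>

    /* Identity on little-endian hosts; real firmware would byte-swap here. */
    static inline uint64_t le64_to_host(uint64_t v) { return v; }

    /* Locate the hotplug DIMM triplets in the extended FW_CFG_NUMA blob.
     * Layout: [nb_numa_nodes][cpu pxm x max_cpus][node mem x nb_numa_nodes]
     *         [nb_hp_dimms][start|size|pxm x nb_hp_dimms]
     */
    static const uint64_t *find_hp_dimm_triplets(const uint64_t *blob,
                                                 int max_cpus,
                                                 uint64_t *nb_hp_dimms)
    {
        uint64_t nb_numa_nodes = le64_to_host(blob[0]);

        *nb_hp_dimms = le64_to_host(blob[1 + max_cpus + nb_numa_nodes]);
        /* Each DIMM contributes three words: start, size, proximity. */
        return blob + 2 + max_cpus + nb_numa_nodes;
    }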

v1->v2:
DIMM SRAT info (#dimms) is appended at the end of the existing NUMA fw_cfg so
as not to break the existing layout.
Documentation of the new fw_cfg layout is included in docs/specs/fwcfg.txt.

Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
---
 docs/specs/fwcfg.txt |   28 ++++++++++++++++++++++++++
 hw/pc.c              |   53 ++++++++++++++++++++++++++++++++++++++++++++++++-
 vl.c                 |    2 +-
 3 files changed, 80 insertions(+), 3 deletions(-)
 create mode 100644 docs/specs/fwcfg.txt

Comments

Blue Swirl July 12, 2012, 7:48 p.m. UTC | #1
On Wed, Jul 11, 2012 at 10:31 AM, Vasilis Liaskovitis
<vasilis.liaskovitis@profitbricks.com> wrote:
> [...]
> diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt
> new file mode 100644
> index 0000000..e6fcd8f
> --- /dev/null
> +++ b/docs/specs/fwcfg.txt
> @@ -0,0 +1,28 @@
> +QEMU<->BIOS Paravirt Documentation
> +--------------------------------------
> +
> +This document describes paravirt data structures passed from QEMU to BIOS.
> +
> +fw_cfg SRAT paravirt info
> +--------------------
> +The SRAT info passed from QEMU to BIOS has the following layout:
> +
> +-----------------------------------------------------------------------------------------------
> +#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem
> +
> +-----------------------------------------------------------------------------------------------
> +#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm
> +
> +Entry 0 contains the number of numa nodes (nb_numa_nodes).
> +
> +Entries 1..max_cpus: The next max_cpus entries describe node proximity for each
> +one of the vCPUs in the system.
> +
> +Entries max_cpus+1..max_cpus+nb_numa_nodes: The next nb_numa_nodes entries
> +describe the memory size for each one of the NUMA nodes in the system.
> +
> +Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms).
> +
> +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains
> +the physical address offset, size (in bytes), and node proximity for the
> +respective dimm.

The size and endianness are not specified, you are using LE 64 bit
values for each item.

> diff --git a/hw/pc.c b/hw/pc.c
> [...]
> @@ -652,8 +657,15 @@ static void *bochs_bios_init(void)
>      for (i = 0; i < nb_numa_nodes; i++) {
>          numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
>      }
> +
> +    numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms);
> +
> +    hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
> +    if (nb_hp_dimms)
> +        setup_hp_dimms(hp_dimms_fw_cfg);

Braces.
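
QEMU's CODING_STYLE requires braces even around single-statement bodies, so the
fixed hunk would read (illustrative fix-up, not from the thread):

    if (nb_hp_dimms) {
        setup_hp_dimms(hp_dimms_fw_cfg);
    }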

> [...]
Vasilis Liaskovitis July 13, 2012, 5:40 p.m. UTC | #2
On Thu, Jul 12, 2012 at 07:48:04PM +0000, Blue Swirl wrote:
> On Wed, Jul 11, 2012 at 10:31 AM, Vasilis Liaskovitis
> <vasilis.liaskovitis@profitbricks.com> wrote:
> > [...]
> > +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains
> > +the physical address offset, size (in bytes), and node proximity for the
> > +respective dimm.
> 
> The size and endianness are not specified, you are using LE 64 bit
> values for each item.

thanks, I'll update.
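
A possible wording for the next revision (editor's illustrative suggestion, not
from the thread) would be a line in docs/specs/fwcfg.txt such as:

    All values in the layout above are 64-bit little-endian words.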

> [...]

Patch

diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt
new file mode 100644
index 0000000..e6fcd8f
--- /dev/null
+++ b/docs/specs/fwcfg.txt
@@ -0,0 +1,28 @@ 
+QEMU<->BIOS Paravirt Documentation
+--------------------------------------
+
+This document describes paravirt data structures passed from QEMU to BIOS.
+
+fw_cfg SRAT paravirt info
+--------------------
+The SRAT info passed from QEMU to BIOS has the following layout:
+
+-----------------------------------------------------------------------------------------------
+#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem
+
+-----------------------------------------------------------------------------------------------
+#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm
+
+Entry 0 contains the number of numa nodes (nb_numa_nodes).
+
+Entries 1..max_cpus: The next max_cpus entries describe node proximity for each
+one of the vCPUs in the system.
+
+Entries max_cpus+1..max_cpus+nb_numa_nodes: The next nb_numa_nodes entries
+describe the memory size for each one of the NUMA nodes in the system.
+
+Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms).
+
+The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains
+the physical address offset, size (in bytes), and node proximity for the
+respective dimm.
diff --git a/hw/pc.c b/hw/pc.c
index ef9901a..cf651d0 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -598,12 +598,15 @@  int e820_add_entry(uint64_t address, uint64_t length, uint32_t type)
     return index;
 }
 
+static void setup_hp_dimms(uint64_t *fw_cfg_slots);
+
 static void *bochs_bios_init(void)
 {
     void *fw_cfg;
     uint8_t *smbios_table;
     size_t smbios_len;
     uint64_t *numa_fw_cfg;
+    uint64_t *hp_dimms_fw_cfg;
     int i, j;
 
     register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
@@ -638,8 +641,10 @@  static void *bochs_bios_init(void)
     /* allocate memory for the NUMA channel: one (64bit) word for the number
      * of nodes, one word for each VCPU->node and one word for each node to
      * hold the amount of memory.
+     * Finally one word for the number of hotplug memory slots and three words
+     * for each hotplug memory slot (start address, size and node proximity).
      */
-    numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
+    numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);
     numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
     for (i = 0; i < max_cpus; i++) {
         for (j = 0; j < nb_numa_nodes; j++) {
@@ -652,8 +657,15 @@  static void *bochs_bios_init(void)
     for (i = 0; i < nb_numa_nodes; i++) {
         numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
     }
+
+    numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms);
+
+    hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
+    if (nb_hp_dimms)
+        setup_hp_dimms(hp_dimms_fw_cfg);
+
     fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
-                     (1 + max_cpus + nb_numa_nodes) * 8);
+                     (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);
 
     return fw_cfg;
 }
@@ -1223,3 +1235,40 @@  target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
 
     return ret;
 }
+
+static void setup_hp_dimms(uint64_t *fw_cfg_slots)
+{
+    int i = 0;
+    Error *err = NULL;
+    DeviceState *dev;
+    DimmState *slot;
+    const char *type;
+    BusChild *kid;
+    BusState *bus = sysbus_get_default();
+
+    QTAILQ_FOREACH(kid, &bus->children, sibling) {
+        dev = kid->child;
+        type = object_property_get_str(OBJECT(dev), "type", &err);
+        if (err) {
+            error_free(err);
+            fprintf(stderr, "error getting device type\n");
+            exit(1);
+        }
+
+        if (!strcmp(type, "dimm")) {
+            if (!dev->id) {
+                fprintf(stderr, "error getting dimm device id\n");
+                exit(1);
+            }
+            slot = DIMM(dev);
+            /* determine starting physical address for this memory slot */
+            assert(slot->start);
+            fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
+            fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
+            fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
+            i++;
+        }
+    }
+    assert(i == nb_hp_dimms);
+}
+
diff --git a/vl.c b/vl.c
index 0ff8818..37c9798 100644
--- a/vl.c
+++ b/vl.c
@@ -2335,7 +2335,7 @@  int main(int argc, char **argv, char **envp)
         node_cpumask[i] = 0;
     }
 
-    nb_numa_nodes = 0;
+    nb_numa_nodes = 1;
     nb_nics = 0;
 
     autostart= 1;
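
As a worked illustration of the resulting blob (editor's example with assumed
values): with max_cpus = 2, nb_numa_nodes = 1 and nb_hp_dimms = 2,
bochs_bios_init() allocates (2 + 2 + 1 + 3 * 2) * 8 = 88 bytes, i.e. 11
little-endian 64-bit words:

    word  0:    nb_numa_nodes = 1
    words 1-2:  cpu0_pxm, cpu1_pxm
    word  3:    node0_mem
    word  4:    nb_hp_dimms = 2
    words 5-7:  dimm0 start, size, pxm
    words 8-10: dimm1 start, size, pxm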