diff mbox series

hw/riscv/virt: Add hotplugging and virtio-md-pci support

Message ID 20240514110615.399065-1-bjorn@kernel.org (mailing list archive)
State New
Headers show
Series hw/riscv/virt: Add hotplugging and virtio-md-pci support | expand

Commit Message

Björn Töpel May 14, 2024, 11:06 a.m. UTC
From: Björn Töpel <bjorn@rivosinc.com>

Virtio-based memory devices allow for dynamic resizing of virtual
machine memory, and require proper hotplugging (add/remove) support
to work.

Enable virtio-md-pci with the corresponding missing hotplugging
callbacks for the RISC-V "virt" machine.

Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
---
This is basic support for MHP that works with DT. There is some minor
ACPI SRAT plumbing in there as well. Ideally we'd like proper ACPI MHP
support as well. I have a branch [1], where I've applied this patch,
plus ACPI GED/PC-DIMM MHP support on top of Sunil's QEMU branch
(contains some ACPI DSDT additions) [2], for the curious/brave ones.
However, the ACPI MHP support is not testable on upstream Linux
yet (ACPI AIA support, and ACPI NUMA SRAT series are ongoing).

I'll follow-up with proper ACPI GED/PC-DIMM MHP patches, once the
dependencies land (Linux kernel and QEMU).

I'll post the Linux MHP/virtio-mem v2 patches later this week!


Cheers,
Björn

[1] https://github.com/bjoto/qemu/commits/virtio-mem-pc-dimm-mhp-acpi/
[2] https://lore.kernel.org/linux-riscv/20240501121742.1215792-1-sunilvl@ventanamicro.com/
---
 hw/riscv/Kconfig           |  2 ++
 hw/riscv/virt-acpi-build.c |  7 +++++
 hw/riscv/virt.c            | 64 +++++++++++++++++++++++++++++++++++++-
 hw/virtio/virtio-mem.c     |  2 +-
 4 files changed, 73 insertions(+), 2 deletions(-)


base-commit: 9360070196789cc8b9404b2efaf319384e64b107

Comments

Daniel Henrique Barboza May 18, 2024, 5:23 p.m. UTC | #1
Hi Björn,

On 5/14/24 08:06, Björn Töpel wrote:
> From: Björn Töpel <bjorn@rivosinc.com>
> 
> Virtio-based memory devices allows for dynamic resizing of virtual
> machine memory, and requires proper hotplugging (add/remove) support
> to work.
> 
> Enable virtio-md-pci with the corresponding missing hotplugging
> callbacks for the RISC-V "virt" machine.
> 
> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
> ---
> This is basic support for MHP that works with DT. There some minor
> ACPI SRAT plumbing in there as well. Ideally we'd like proper ACPI MHP
> support as well. I have a branch [1], where I've applied this patch,
> plus ACPI GED/PC-DIMM MHP support on top of Sunil's QEMU branch
> (contains some ACPI DSDT additions) [2], for the curious/brave ones.
> However, the ACPI MHP support this is not testable on upstream Linux
> yet (ACPI AIA support, and ACPI NUMA SRAT series are ongoing).
> 
> I'll follow-up with proper ACPI GED/PC-DIMM MHP patches, once the
> dependencies land (Linux kernel and QEMU).
> 
> I'll post the Linux MHP/virtio-mem v2 patches later this week!
> 
> 
> Cheers,
> Björn
> 
> [1] https://github.com/bjoto/qemu/commits/virtio-mem-pc-dimm-mhp-acpi/
> [2] https://lore.kernel.org/linux-riscv/20240501121742.1215792-1-sunilvl@ventanamicro.com/
> ---
>   hw/riscv/Kconfig           |  2 ++
>   hw/riscv/virt-acpi-build.c |  7 +++++
>   hw/riscv/virt.c            | 64 +++++++++++++++++++++++++++++++++++++-
>   hw/virtio/virtio-mem.c     |  2 +-
>   4 files changed, 73 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index a2030e3a6ff0..08f82dbb681a 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -56,6 +56,8 @@ config RISCV_VIRT
>       select PLATFORM_BUS
>       select ACPI
>       select ACPI_PCI
> +    select VIRTIO_MEM_SUPPORTED
> +    select VIRTIO_PMEM_SUPPORTED
>   
>   config SHAKTI_C
>       bool
> diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
> index 0925528160f8..6dc3baa9ec86 100644
> --- a/hw/riscv/virt-acpi-build.c
> +++ b/hw/riscv/virt-acpi-build.c
> @@ -610,6 +610,13 @@ build_srat(GArray *table_data, BIOSLinker *linker, RISCVVirtState *vms)
>           }
>       }
>   
> +    if (ms->device_memory) {
> +        build_srat_memory(table_data, ms->device_memory->base,
> +                          memory_region_size(&ms->device_memory->mr),
> +                          ms->numa_state->num_nodes - 1,
> +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> +    }
> +
>       acpi_table_end(linker, &table);

When the time comes I believe we'll want this chunk in a separated ACPI patch.

>   }
>   
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index 4fdb66052587..16c2bdbfe6b6 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -53,6 +53,8 @@
>   #include "hw/pci-host/gpex.h"
>   #include "hw/display/ramfb.h"
>   #include "hw/acpi/aml-build.h"
> +#include "hw/mem/memory-device.h"
> +#include "hw/virtio/virtio-mem-pci.h"
>   #include "qapi/qapi-visit-common.h"
>   #include "hw/virtio/virtio-iommu.h"
>   
> @@ -1407,6 +1409,7 @@ static void virt_machine_init(MachineState *machine)
>       DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
>       int i, base_hartid, hart_count;
>       int socket_count = riscv_socket_count(machine);
> +    hwaddr device_memory_base, device_memory_size;
>   
>       /* Check socket count limit */
>       if (VIRT_SOCKETS_MAX < socket_count) {
> @@ -1553,6 +1556,25 @@ static void virt_machine_init(MachineState *machine)
>       memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
>                                   mask_rom);
>   
> +    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
> +                                  GiB);
> +    device_memory_size = machine->maxram_size - machine->ram_size;
> +
> +    if (riscv_is_32bit(&s->soc[0])) {
> +        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
> +
> +        if (memtop > UINT32_MAX) {
> +            error_report("Memory exceeds 32-bit limit by %lu bytes",
> +                         memtop - UINT32_MAX);
> +            exit(EXIT_FAILURE);
> +        }
> +    }
> +
> +    if (device_memory_size > 0) {
> +        machine_memory_devices_init(machine, device_memory_base,
> +                                    device_memory_size);
> +    }
> +

I think we need a design discussion before proceeding here. You're allocating all
available memory as a memory device area, but in theory we might also support
pc-dimm hotplugs (which would be the equivalent of adding physical RAM dimms to
the board.) in the future too. If you're not familiar with this feature you can
check out the docs in [1].

As an example, the 'virt' ARM board (hw/arm/virt.c) reserves a space for this
type of hotplug by checking how much 'ram_slots' we're allocating for it:

device_memory_size = ms->maxram_size - ms->ram_size + ms->ram_slots * GiB;

Other boards do the same with ms->ram_slots. We should consider doing it as well,
now, even if we're not up to the point of supporting pc-dimm hotplug, to avoid
having to change the memory layout later in the road and breaking existing
setups.

If we want to copy the ARM board, ram_slots is capped to ACPI_MAX_RAM_SLOTS (256).
Each RAM slot is considered to be a 1GiB dimm, i.e. we would reserve 256GiB for
them.

Note: I do not have the visibility of discussions on the memory management space,
and I might be missing details such as "we don't care about pc-dimm hotplug
anymore, it's legacy, we're going to support only virtio-md-pci from now on". We
had a situation like that with virtio-balloon and virtio-mem in the past, and I'm
not sure if this might fall in the same category.

I see that David Hildenbrand is also CCed in the patch so he'll let us know if
I'm out of line with what I'm asking.


[1] https://github.com/qemu/qemu/blob/master/docs/memory-hotplug.txt


Thanks,

Daniel




>       /*
>        * Init fw_cfg. Must be done before riscv_load_fdt, otherwise the
>        * device tree cannot be altered and we get FDT_ERR_NOSPACE.
> @@ -1712,12 +1734,21 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine,
>       MachineClass *mc = MACHINE_GET_CLASS(machine);
>   
>       if (device_is_dynamic_sysbus(mc, dev) ||
> -        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
> +        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) ||
> +        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
>           return HOTPLUG_HANDLER(machine);
>       }
>       return NULL;
>   }
>   
> +static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
> +                                            DeviceState *dev, Error **errp)
> +{
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
> +        virtio_md_pci_pre_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
> +    }
> +}
> +
>   static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
>                                           DeviceState *dev, Error **errp)
>   {
> @@ -1735,6 +1766,34 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
>       if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
>           create_fdt_virtio_iommu(s, pci_get_bdf(PCI_DEVICE(dev)));
>       }
> +
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
> +        virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
> +    }
> +}
> +
> +static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
> +                                                  DeviceState *dev,
> +                                                  Error **errp)
> +{
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
> +        virtio_md_pci_unplug_request(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev),
> +                                     errp);
> +    } else {
> +        error_setg(errp, "device unplug request for unsupported device type: %s",
> +                   object_get_typename(OBJECT(dev)));
> +    }
> +}
> +
> +static void virt_machine_device_unplug_cb(HotplugHandler *hotplug_dev,
> +                                          DeviceState *dev, Error **errp)
> +{
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
> +        virtio_md_pci_unplug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
> +    } else {
> +        error_setg(errp, "virt: device unplug for unsupported device"
> +                   " type: %s", object_get_typename(OBJECT(dev)));
> +    }
>   }
>   
>   static void virt_machine_class_init(ObjectClass *oc, void *data)
> @@ -1757,7 +1816,10 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
>       assert(!mc->get_hotplug_handler);
>       mc->get_hotplug_handler = virt_machine_get_hotplug_handler;
>   
> +    hc->pre_plug = virt_machine_device_pre_plug_cb;
>       hc->plug = virt_machine_device_plug_cb;
> +    hc->unplug_request = virt_machine_device_unplug_request_cb;
> +    hc->unplug = virt_machine_device_unplug_cb;
>   
>       machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE);
>   #ifdef CONFIG_TPM
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index ffd119ebacb7..26c976a3b9b8 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -161,7 +161,7 @@ static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
>    * necessary (as the section size can change). But it's more likely that the
>    * section size will rather get smaller and not bigger over time.
>    */
> -#if defined(TARGET_X86_64) || defined(TARGET_I386)
> +#if defined(TARGET_X86_64) || defined(TARGET_I386)  || defined(TARGET_RISCV)
>   #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
>   #elif defined(TARGET_ARM)
>   #define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
> 
> base-commit: 9360070196789cc8b9404b2efaf319384e64b107
David Hildenbrand May 18, 2024, 7:50 p.m. UTC | #2
Hi,


>> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
>> index 4fdb66052587..16c2bdbfe6b6 100644
>> --- a/hw/riscv/virt.c
>> +++ b/hw/riscv/virt.c
>> @@ -53,6 +53,8 @@
>>    #include "hw/pci-host/gpex.h"
>>    #include "hw/display/ramfb.h"
>>    #include "hw/acpi/aml-build.h"
>> +#include "hw/mem/memory-device.h"
>> +#include "hw/virtio/virtio-mem-pci.h"
>>    #include "qapi/qapi-visit-common.h"
>>    #include "hw/virtio/virtio-iommu.h"
>>    
>> @@ -1407,6 +1409,7 @@ static void virt_machine_init(MachineState *machine)
>>        DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
>>        int i, base_hartid, hart_count;
>>        int socket_count = riscv_socket_count(machine);
>> +    hwaddr device_memory_base, device_memory_size;
>>    
>>        /* Check socket count limit */
>>        if (VIRT_SOCKETS_MAX < socket_count) {
>> @@ -1553,6 +1556,25 @@ static void virt_machine_init(MachineState *machine)
>>        memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
>>                                    mask_rom);
>>    
>> +    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
>> +                                  GiB);
>> +    device_memory_size = machine->maxram_size - machine->ram_size;
>> +
>> +    if (riscv_is_32bit(&s->soc[0])) {
>> +        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
>> +
>> +        if (memtop > UINT32_MAX) {
>> +            error_report("Memory exceeds 32-bit limit by %lu bytes",
>> +                         memtop - UINT32_MAX);
>> +            exit(EXIT_FAILURE);
>> +        }
>> +    }
>> +
>> +    if (device_memory_size > 0) {
>> +        machine_memory_devices_init(machine, device_memory_base,
>> +                                    device_memory_size);
>> +    }
>> +
> 
> I think we need a design discussion before proceeding here. You're allocating all
> available memory as a memory device area, but in theory we might also support
> pc-dimm hotplugs (which would be the equivalent of adding physical RAM dimms to
> the board.) in the future too. If you're not familiar with this feature you can
> check it out the docs in [1].

Note that DIMMs are memory devices as well. You can plug into the memory 
device area both, ACPI-based memory devices (DIMM, NVDIMM) or 
virtio-based memory devices (virtio-mem, virtio-pmem).

> 
> As an example, the 'virt' ARM board (hw/arm/virt.c) reserves a space for this
> type of hotplug by checking how much 'ram_slots' we're allocating for it:
> 
> device_memory_size = ms->maxram_size - ms->ram_size + ms->ram_slots * GiB;
> 

Note that we increased the region size to be able to fit most requests 
even if alignment of memory devices is weird. See below.

In sane setups, this is usually not required (adding a single additional 
GB for some flexibility might be good enough).

> Other boards do the same with ms->ram_slots. We should consider doing it as well,
> now, even if we're not up to the point of supporting pc-dimm hotplug, to avoid
> having to change the memory layout later in the road and breaking existing
> setups.
> 
> If we want to copy the ARM board, ram_slots is capped to ACPI_MAX_RAM_SLOTS (256).
> Each RAM slot is considered to be a 1GiB dimm, i.e. we would reserve 256GiB for
> them.

This only reserves some *additional* space to fixup weird alignment of 
memory devices. *not* the actual space for these devices.

We don't consider each DIMM to be 1 GiB in size, but add an additional 1 
GiB in case we have to align DIMMs in physical address space.

I *think* this dates back to old x86 handling where we aligned the 
address of each DIMM to be at a 1 GiB boundary. So if you would have 
plugged two 128 MiB DIMMs, you'd have required more than 256 MiB of 
space in the area after aligning inside the memory device area.

> 
> Note: I do not have the visibility of discussions on the memory management space,
> and I might be missing details such as "we don't care about pc-dimm hotplug
> anymore, it's legacy, we're going to support only virtio-md-pci from now on". We
> had a situation like that with virtio-balloon and virtio-mem in the past, and I'm
> not sure if this might fall in the same category.

Not sure if I got your comment right, but virtio-mem was never supposed 
to be a virtio-balloon replacement (especially of the 
free-page-reporting and memory stats part).

> 
> I see that David Hildenbrand is also CCed in the patch so he'll let us know if
> I'm out of line with what I'm asking.

Supporting PC-DIMMs might be required at some point when dealing with 
OSes that don't support virtio-mem and friends.
Daniel Henrique Barboza May 19, 2024, 9:24 a.m. UTC | #3
On 5/18/24 16:50, David Hildenbrand wrote:
> 
> Hi,
> 
> 
>>> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
>>> index 4fdb66052587..16c2bdbfe6b6 100644
>>> --- a/hw/riscv/virt.c
>>> +++ b/hw/riscv/virt.c
>>> @@ -53,6 +53,8 @@
>>>    #include "hw/pci-host/gpex.h"
>>>    #include "hw/display/ramfb.h"
>>>    #include "hw/acpi/aml-build.h"
>>> +#include "hw/mem/memory-device.h"
>>> +#include "hw/virtio/virtio-mem-pci.h"
>>>    #include "qapi/qapi-visit-common.h"
>>>    #include "hw/virtio/virtio-iommu.h"
>>> @@ -1407,6 +1409,7 @@ static void virt_machine_init(MachineState *machine)
>>>        DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
>>>        int i, base_hartid, hart_count;
>>>        int socket_count = riscv_socket_count(machine);
>>> +    hwaddr device_memory_base, device_memory_size;
>>>        /* Check socket count limit */
>>>        if (VIRT_SOCKETS_MAX < socket_count) {
>>> @@ -1553,6 +1556,25 @@ static void virt_machine_init(MachineState *machine)
>>>        memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
>>>                                    mask_rom);
>>> +    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
>>> +                                  GiB);
>>> +    device_memory_size = machine->maxram_size - machine->ram_size;
>>> +
>>> +    if (riscv_is_32bit(&s->soc[0])) {
>>> +        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
>>> +
>>> +        if (memtop > UINT32_MAX) {
>>> +            error_report("Memory exceeds 32-bit limit by %lu bytes",
>>> +                         memtop - UINT32_MAX);
>>> +            exit(EXIT_FAILURE);
>>> +        }
>>> +    }
>>> +
>>> +    if (device_memory_size > 0) {
>>> +        machine_memory_devices_init(machine, device_memory_base,
>>> +                                    device_memory_size);
>>> +    }
>>> +
>>
>> I think we need a design discussion before proceeding here. You're allocating all
>> available memory as a memory device area, but in theory we might also support
>> pc-dimm hotplugs (which would be the equivalent of adding physical RAM dimms to
>> the board.) in the future too. If you're not familiar with this feature you can
>> check it out the docs in [1].
> 
> Note that DIMMs are memory devices as well. You can plug into the memory device area both, ACPI-based memory devices (DIMM, NVDIMM) or virtio-based memory devices (virtio-mem, virtio-pmem).
> 
>>
>> As an example, the 'virt' ARM board (hw/arm/virt.c) reserves a space for this
>> type of hotplug by checking how much 'ram_slots' we're allocating for it:
>>
>> device_memory_size = ms->maxram_size - ms->ram_size + ms->ram_slots * GiB;
>>
> 
> Note that we increased the region size to be able to fit most requests even if alignment of memory devices is weird. See below.
> 
> In sane setups, this is usually not required (adding a single additional GB for some flexiility might be good enough).
> 
>> Other boards do the same with ms->ram_slots. We should consider doing it as well,
>> now, even if we're not up to the point of supporting pc-dimm hotplug, to avoid
>> having to change the memory layout later in the road and breaking existing
>> setups.
>>
>> If we want to copy the ARM board, ram_slots is capped to ACPI_MAX_RAM_SLOTS (256).
>> Each RAM slot is considered to be a 1GiB dimm, i.e. we would reserve 256GiB for
>> them.
> 
> This only reserves some *additional* space to fixup weird alignment of memory devices. *not* the actual space for these devices.
> 
> We don't consider each DIMM to be 1 GiB in size, but add an additional 1 GiB in case we have to align DIMMs in physical address space.
> 
> I *think* this dates back to old x86 handling where we aligned the address of each DIMM to be at a 1 GiB boundary. So if you would have plugged two 128 MiB DIMMs, you'd have required more than 256 MiB of space in the area after aligning inside the memory device area.
> 

Thanks for the explanation. I missed the part where the ram_slots were being
used just to solve potential alignment issues and pc-dimms could occupy the same
space being allocated via machine_memory_devices_init().

This patch isn't far off then. If we take care to avoid plugging unaligned memory
we might not even need this spare area.

>>
>> Note: I do not have the visibility of discussions on the memory management space,
>> and I might be missing details such as "we don't care about pc-dimm hotplug
>> anymore, it's legacy, we're going to support only virtio-md-pci from now on". We
>> had a situation like that with virtio-balloon and virtio-mem in the past, and I'm
>> not sure if this might fall in the same category.
> 
> Not sure if I got your comment right, but virtio-mem was never supposed to be a virtio-balloon replacement (especially of the free-page-reporting and memory stats part).

I was trying to refer to a situation we faced 3+ years ago in the powerpc machines,
where we were trying to add virtio-mem support there given that virtio-mem is/was
seen (as far as I can remember anyways) as a more robust solution than
virtio-balloon + DIMM hotplug for guest memory management from the host point of
view.

I'm probably misrepresenting the whole situation though, it has been a while :)


Thanks,


Daniel


> 
>>
>> I see that David Hildenbrand is also CCed in the patch so he'll let us know if
>> I'm out of line with what I'm asking.
> 
> Supporting PC-DIMMs might be required at some point when dealing with OSes that don't support virtio-mem and friends.
>
Björn Töpel May 20, 2024, 6:33 p.m. UTC | #4
Daniel,

Thanks for taking a look!

Daniel Henrique Barboza <dbarboza@ventanamicro.com> writes:

> Hi Björj,
>
> On 5/14/24 08:06, Björn Töpel wrote:
>> From: Björn Töpel <bjorn@rivosinc.com>
>> 
>> Virtio-based memory devices allows for dynamic resizing of virtual
>> machine memory, and requires proper hotplugging (add/remove) support
>> to work.
>> 
>> Enable virtio-md-pci with the corresponding missing hotplugging
>> callbacks for the RISC-V "virt" machine.
>> 
>> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
>> ---
>> This is basic support for MHP that works with DT. There some minor
>> ACPI SRAT plumbing in there as well. Ideally we'd like proper ACPI MHP
>> support as well. I have a branch [1], where I've applied this patch,
>> plus ACPI GED/PC-DIMM MHP support on top of Sunil's QEMU branch
>> (contains some ACPI DSDT additions) [2], for the curious/brave ones.
>> However, the ACPI MHP support this is not testable on upstream Linux
>> yet (ACPI AIA support, and ACPI NUMA SRAT series are ongoing).
>> 
>> I'll follow-up with proper ACPI GED/PC-DIMM MHP patches, once the
>> dependencies land (Linux kernel and QEMU).
>> 
>> I'll post the Linux MHP/virtio-mem v2 patches later this week!
>> 
>> 
>> Cheers,
>> Björn
>> 
>> [1] https://github.com/bjoto/qemu/commits/virtio-mem-pc-dimm-mhp-acpi/
>> [2] https://lore.kernel.org/linux-riscv/20240501121742.1215792-1-sunilvl@ventanamicro.com/
>> ---
>>   hw/riscv/Kconfig           |  2 ++
>>   hw/riscv/virt-acpi-build.c |  7 +++++
>>   hw/riscv/virt.c            | 64 +++++++++++++++++++++++++++++++++++++-
>>   hw/virtio/virtio-mem.c     |  2 +-
>>   4 files changed, 73 insertions(+), 2 deletions(-)
>> 
>> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
>> index a2030e3a6ff0..08f82dbb681a 100644
>> --- a/hw/riscv/Kconfig
>> +++ b/hw/riscv/Kconfig
>> @@ -56,6 +56,8 @@ config RISCV_VIRT
>>       select PLATFORM_BUS
>>       select ACPI
>>       select ACPI_PCI
>> +    select VIRTIO_MEM_SUPPORTED
>> +    select VIRTIO_PMEM_SUPPORTED
>>   
>>   config SHAKTI_C
>>       bool
>> diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
>> index 0925528160f8..6dc3baa9ec86 100644
>> --- a/hw/riscv/virt-acpi-build.c
>> +++ b/hw/riscv/virt-acpi-build.c
>> @@ -610,6 +610,13 @@ build_srat(GArray *table_data, BIOSLinker *linker, RISCVVirtState *vms)
>>           }
>>       }
>>   
>> +    if (ms->device_memory) {
>> +        build_srat_memory(table_data, ms->device_memory->base,
>> +                          memory_region_size(&ms->device_memory->mr),
>> +                          ms->numa_state->num_nodes - 1,
>> +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
>> +    }
>> +
>>       acpi_table_end(linker, &table);
>
> When the time comes I believe we'll want this chunk in a separated ACPI patch.

Hmm, I first thought about adding this to the ACPI MHP series, but then
realized that virtio-mem relies on SRAT for ACPI boots (again -- RISC-V
Linux does not support that upstream yet...).

Do you mean that you'd prefer this chunk in a separate patch?


Björn
Björn Töpel May 20, 2024, 6:51 p.m. UTC | #5
Daniel/David,

Daniel Henrique Barboza <dbarboza@ventanamicro.com> writes:

> On 5/18/24 16:50, David Hildenbrand wrote:
>> 
>> Hi,
>> 
>> 
>>>> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
>>>> index 4fdb66052587..16c2bdbfe6b6 100644
>>>> --- a/hw/riscv/virt.c
>>>> +++ b/hw/riscv/virt.c
>>>> @@ -53,6 +53,8 @@
>>>>    #include "hw/pci-host/gpex.h"
>>>>    #include "hw/display/ramfb.h"
>>>>    #include "hw/acpi/aml-build.h"
>>>> +#include "hw/mem/memory-device.h"
>>>> +#include "hw/virtio/virtio-mem-pci.h"
>>>>    #include "qapi/qapi-visit-common.h"
>>>>    #include "hw/virtio/virtio-iommu.h"
>>>> @@ -1407,6 +1409,7 @@ static void virt_machine_init(MachineState *machine)
>>>>        DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
>>>>        int i, base_hartid, hart_count;
>>>>        int socket_count = riscv_socket_count(machine);
>>>> +    hwaddr device_memory_base, device_memory_size;
>>>>        /* Check socket count limit */
>>>>        if (VIRT_SOCKETS_MAX < socket_count) {
>>>> @@ -1553,6 +1556,25 @@ static void virt_machine_init(MachineState *machine)
>>>>        memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
>>>>                                    mask_rom);
>>>> +    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
>>>> +                                  GiB);
>>>> +    device_memory_size = machine->maxram_size - machine->ram_size;
>>>> +
>>>> +    if (riscv_is_32bit(&s->soc[0])) {
>>>> +        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
>>>> +
>>>> +        if (memtop > UINT32_MAX) {
>>>> +            error_report("Memory exceeds 32-bit limit by %lu bytes",
>>>> +                         memtop - UINT32_MAX);
>>>> +            exit(EXIT_FAILURE);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (device_memory_size > 0) {
>>>> +        machine_memory_devices_init(machine, device_memory_base,
>>>> +                                    device_memory_size);
>>>> +    }
>>>> +
>>>
>>> I think we need a design discussion before proceeding here. You're allocating all
>>> available memory as a memory device area, but in theory we might also support
>>> pc-dimm hotplugs (which would be the equivalent of adding physical RAM dimms to
>>> the board.) in the future too. If you're not familiar with this feature you can
>>> check it out the docs in [1].
>> 
>> Note that DIMMs are memory devices as well. You can plug into the memory device area both, ACPI-based memory devices (DIMM, NVDIMM) or virtio-based memory devices (virtio-mem, virtio-pmem).
>> 
>>>
>>> As an example, the 'virt' ARM board (hw/arm/virt.c) reserves a space for this
>>> type of hotplug by checking how much 'ram_slots' we're allocating for it:
>>>
>>> device_memory_size = ms->maxram_size - ms->ram_size + ms->ram_slots * GiB;
>>>
>> 
>> Note that we increased the region size to be able to fit most requests even if alignment of memory devices is weird. See below.
>> 
>> In sane setups, this is usually not required (adding a single additional GB for some flexiility might be good enough).
>> 
>>> Other boards do the same with ms->ram_slots. We should consider doing it as well,
>>> now, even if we're not up to the point of supporting pc-dimm hotplug, to avoid
>>> having to change the memory layout later in the road and breaking existing
>>> setups.
>>>
>>> If we want to copy the ARM board, ram_slots is capped to ACPI_MAX_RAM_SLOTS (256).
>>> Each RAM slot is considered to be a 1GiB dimm, i.e. we would reserve 256GiB for
>>> them.
>> 
>> This only reserves some *additional* space to fixup weird alignment of memory devices. *not* the actual space for these devices.
>> 
>> We don't consider each DIMM to be 1 GiB in size, but add an additional 1 GiB in case we have to align DIMMs in physical address space.
>> 
>> I *think* this dates back to old x86 handling where we aligned the address of each DIMM to be at a 1 GiB boundary. So if you would have plugged two 128 MiB DIMMs, you'd have required more than 256 MiB of space in the area after aligning inside the memory device area.
>> 
>
> Thanks for the explanation. I missed the part where the ram_slots were being
> used just to solve potential alignment issues and pc-dimms could occupy the same
> space being allocated via machine_memory_devices_init().
>
> This patch isn't far off then. If we take care to avoid plugging unaligned memory
> we might not even need this spare area.

I'm a bit lost here, so please bear with me. We don't require the 1 GiB
alignment on RV AFAIU. I'm having a hard time figuring out what's missing
in my patch.

[...]

>>> I see that David Hildenbrand is also CCed in the patch so he'll let us know if
>>> I'm out of line with what I'm asking.
>> 
>> Supporting PC-DIMMs might be required at some point when dealing with OSes that don't support virtio-mem and friends.

...and also for testing the PC-DIMM ACPI patching path. ;-)


Cheers,
Björn
Daniel Henrique Barboza May 20, 2024, 7:24 p.m. UTC | #6
On 5/20/24 15:33, Björn Töpel wrote:
> Daniel,
> 
> Thanks for taking a look!
> 
> Daniel Henrique Barboza <dbarboza@ventanamicro.com> writes:
> 
>> Hi Björj,
>>
>> On 5/14/24 08:06, Björn Töpel wrote:
>>> From: Björn Töpel <bjorn@rivosinc.com>
>>>
>>> Virtio-based memory devices allows for dynamic resizing of virtual
>>> machine memory, and requires proper hotplugging (add/remove) support
>>> to work.
>>>
>>> Enable virtio-md-pci with the corresponding missing hotplugging
>>> callbacks for the RISC-V "virt" machine.
>>>
>>> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
>>> ---
>>> This is basic support for MHP that works with DT. There some minor
>>> ACPI SRAT plumbing in there as well. Ideally we'd like proper ACPI MHP
>>> support as well. I have a branch [1], where I've applied this patch,
>>> plus ACPI GED/PC-DIMM MHP support on top of Sunil's QEMU branch
>>> (contains some ACPI DSDT additions) [2], for the curious/brave ones.
>>> However, the ACPI MHP support this is not testable on upstream Linux
>>> yet (ACPI AIA support, and ACPI NUMA SRAT series are ongoing).
>>>
>>> I'll follow-up with proper ACPI GED/PC-DIMM MHP patches, once the
>>> dependencies land (Linux kernel and QEMU).
>>>
>>> I'll post the Linux MHP/virtio-mem v2 patches later this week!
>>>
>>>
>>> Cheers,
>>> Björn
>>>
>>> [1] https://github.com/bjoto/qemu/commits/virtio-mem-pc-dimm-mhp-acpi/
>>> [2] https://lore.kernel.org/linux-riscv/20240501121742.1215792-1-sunilvl@ventanamicro.com/
>>> ---
>>>    hw/riscv/Kconfig           |  2 ++
>>>    hw/riscv/virt-acpi-build.c |  7 +++++
>>>    hw/riscv/virt.c            | 64 +++++++++++++++++++++++++++++++++++++-
>>>    hw/virtio/virtio-mem.c     |  2 +-
>>>    4 files changed, 73 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
>>> index a2030e3a6ff0..08f82dbb681a 100644
>>> --- a/hw/riscv/Kconfig
>>> +++ b/hw/riscv/Kconfig
>>> @@ -56,6 +56,8 @@ config RISCV_VIRT
>>>        select PLATFORM_BUS
>>>        select ACPI
>>>        select ACPI_PCI
>>> +    select VIRTIO_MEM_SUPPORTED
>>> +    select VIRTIO_PMEM_SUPPORTED
>>>    
>>>    config SHAKTI_C
>>>        bool
>>> diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
>>> index 0925528160f8..6dc3baa9ec86 100644
>>> --- a/hw/riscv/virt-acpi-build.c
>>> +++ b/hw/riscv/virt-acpi-build.c
>>> @@ -610,6 +610,13 @@ build_srat(GArray *table_data, BIOSLinker *linker, RISCVVirtState *vms)
>>>            }
>>>        }
>>>    
>>> +    if (ms->device_memory) {
>>> +        build_srat_memory(table_data, ms->device_memory->base,
>>> +                          memory_region_size(&ms->device_memory->mr),
>>> +                          ms->numa_state->num_nodes - 1,
>>> +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
>>> +    }
>>> +
>>>        acpi_table_end(linker, &table);
>>
>> When the time comes I believe we'll want this chunk in a separated ACPI patch.
> 
> Hmm, I first thought about adding this to the ACPI MHP series, but then
> realized that virtio-mem relies on SRAT for ACPI boots (again -- RISC-V
> Linux does not support that upstream yet...).
> 
> Do you mean that you'd prefer this chunk in a separate patch?

TBH I wouldn't mind keeping this ACPI chunk here but I reckon that the ACPI
subsystem review is usually done separately, with a different set of people
reviewing it and so on.

We might as well keep it here for now. If more ACPI changes ended up being done
(e.g. ACPI unit test changes) then doing a separated ACPI patch makes more sense.


Thanks,

Daniel


> 
> 
> Björn
Daniel Henrique Barboza May 20, 2024, 8:04 p.m. UTC | #7
On 5/20/24 15:51, Björn Töpel wrote:
> Daniel/David,
> 
> Daniel Henrique Barboza <dbarboza@ventanamicro.com> writes:
> 
>> On 5/18/24 16:50, David Hildenbrand wrote:
>>>
>>> Hi,
>>>
>>>
>>>>> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
>>>>> index 4fdb66052587..16c2bdbfe6b6 100644
>>>>> --- a/hw/riscv/virt.c
>>>>> +++ b/hw/riscv/virt.c
>>>>> @@ -53,6 +53,8 @@
>>>>>     #include "hw/pci-host/gpex.h"
>>>>>     #include "hw/display/ramfb.h"
>>>>>     #include "hw/acpi/aml-build.h"
>>>>> +#include "hw/mem/memory-device.h"
>>>>> +#include "hw/virtio/virtio-mem-pci.h"
>>>>>     #include "qapi/qapi-visit-common.h"
>>>>>     #include "hw/virtio/virtio-iommu.h"
>>>>> @@ -1407,6 +1409,7 @@ static void virt_machine_init(MachineState *machine)
>>>>>         DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
>>>>>         int i, base_hartid, hart_count;
>>>>>         int socket_count = riscv_socket_count(machine);
>>>>> +    hwaddr device_memory_base, device_memory_size;
>>>>>         /* Check socket count limit */
>>>>>         if (VIRT_SOCKETS_MAX < socket_count) {
>>>>> @@ -1553,6 +1556,25 @@ static void virt_machine_init(MachineState *machine)
>>>>>         memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
>>>>>                                     mask_rom);
>>>>> +    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
>>>>> +                                  GiB);
>>>>> +    device_memory_size = machine->maxram_size - machine->ram_size;
>>>>> +
>>>>> +    if (riscv_is_32bit(&s->soc[0])) {
>>>>> +        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
>>>>> +
>>>>> +        if (memtop > UINT32_MAX) {
>>>>> +            error_report("Memory exceeds 32-bit limit by %lu bytes",
>>>>> +                         memtop - UINT32_MAX);
>>>>> +            exit(EXIT_FAILURE);
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    if (device_memory_size > 0) {
>>>>> +        machine_memory_devices_init(machine, device_memory_base,
>>>>> +                                    device_memory_size);
>>>>> +    }
>>>>> +
>>>>
>>>> I think we need a design discussion before proceeding here. You're allocating all
>>>> available memory as a memory device area, but in theory we might also support
>>>> pc-dimm hotplugs (which would be the equivalent of adding physical RAM dimms to
>>>> the board.) in the future too. If you're not familiar with this feature you can
>>>> check it out the docs in [1].
>>>
>>> Note that DIMMs are memory devices as well. You can plug into the memory device area both, ACPI-based memory devices (DIMM, NVDIMM) or virtio-based memory devices (virtio-mem, virtio-pmem).
>>>
>>>>
>>>> As an example, the 'virt' ARM board (hw/arm/virt.c) reserves a space for this
>>>> type of hotplug by checking how much 'ram_slots' we're allocating for it:
>>>>
>>>> device_memory_size = ms->maxram_size - ms->ram_size + ms->ram_slots * GiB;
>>>>
>>>
>>> Note that we increased the region size to be able to fit most requests even if alignment of memory devices is weird. See below.
>>>
>>> In sane setups, this is usually not required (adding a single additional GB for some flexibility might be good enough).
>>>
>>>> Other boards do the same with ms->ram_slots. We should consider doing it as well,
>>>> now, even if we're not up to the point of supporting pc-dimm hotplug, to avoid
>>>> having to change the memory layout later in the road and breaking existing
>>>> setups.
>>>>
>>>> If we want to copy the ARM board, ram_slots is capped to ACPI_MAX_RAM_SLOTS (256).
>>>> Each RAM slot is considered to be a 1GiB dimm, i.e. we would reserve 256GiB for
>>>> them.
>>>
>>> This only reserves some *additional* space to fixup weird alignment of memory devices. *not* the actual space for these devices.
>>>
>>> We don't consider each DIMM to be 1 GiB in size, but add an additional 1 GiB in case we have to align DIMMs in physical address space.
>>>
>>> I *think* this dates back to old x86 handling where we aligned the address of each DIMM to be at a 1 GiB boundary. So if you would have plugged two 128 MiB DIMMs, you'd have required more than 256 MiB of space in the area after aligning inside the memory device area.
>>>
>>
>> Thanks for the explanation. I missed the part where the ram_slots were being
>> used just to solve potential alignment issues and pc-dimms could occupy the same
>> space being allocated via machine_memory_devices_init().
>>
>> This patch isn't far off then. If we take care to avoid plugging unaligned memory
>> we might not even need this spare area.
> 
> I'm a bit lost here, so please bear with me. We don't require the 1 GiB
> alignment on RV AFAIU. I'm having a hard time figuring out what missing
> in my patch.

Forget about the 1 GiB size. This is something that we won't need to deal with
because we don't align in 1 Gib.

Let's say for example that we want to support pc-dimm hotplug of 256 slots like the
'virt' ARM machine does. Let's also say that we will allow users to hotplug any
DIMM size they want, taking care of any alignment issues by ourselves.

In hw/riscv/boot.c I see that our alignment sizes are 4MB for 32 bits and 2MB for
64 bits. Setting 32 bits aside for a moment, let's say that our alignment is 2MB.

So, in a worst case scenario, a user could hotplug 256 slots, all of them unaligned,
and then we would need to align each one of them by adding 2MB. So, to account for
this alignment fixup, we would need 256 * 2MB RAM reserved for it.

What I said about "If we take care to avoid plugging unaligned memory we might not even
need this spare area" is a possible design where we would force every hotplugged DIMM
to always be memory aligned, avoiding the need for this spare RAM for alignment. This
would put a bit of extra strain on users/management apps to always deliver aligned
DIMMs.

In hindsight this is not needed. It's fairly easy to reserve ACPI_MAX_RAM_SLOTS * (2MB/4MB)
and let users hotplug whatever DIMM size they want.

Hope this explains the situation a bit. If we agree on allocating this spare RAM for
hotplugged mem alignment, we'll probably need a pre-patch to do a little handling of
ms->ram_slots, limiting it to ACPI_MAX_RAM_SLOTS (ram_slots can be changed via command
line). Then it's a matter of reserving ms->ram_slots * align_size when calculating
device_memory_size.


Thanks,

Daniel

> 
> [...]
> 
>>>> I see that David Hildenbrand is also CCed in the patch so he'll let us know if
>>>> I'm out of line with what I'm asking.
>>>
>>> Supporting PC-DIMMs might be required at some point when dealing with OSes that don't support virtio-mem and friends.
> 
> ...and also for testing the PC-DIMM ACPI patching path. ;-)
> 
> 
> Cheers,
> Björn
Björn Töpel May 21, 2024, 5:43 a.m. UTC | #8
Daniel Henrique Barboza <dbarboza@ventanamicro.com> writes:

> On 5/20/24 15:51, Björn Töpel wrote:
>> Daniel/David,
>> 
>> Daniel Henrique Barboza <dbarboza@ventanamicro.com> writes:
>> 
>>> On 5/18/24 16:50, David Hildenbrand wrote:
>>>>
>>>> Hi,
>>>>
>>>>
>>>>>> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
>>>>>> index 4fdb66052587..16c2bdbfe6b6 100644
>>>>>> --- a/hw/riscv/virt.c
>>>>>> +++ b/hw/riscv/virt.c
>>>>>> @@ -53,6 +53,8 @@
>>>>>>     #include "hw/pci-host/gpex.h"
>>>>>>     #include "hw/display/ramfb.h"
>>>>>>     #include "hw/acpi/aml-build.h"
>>>>>> +#include "hw/mem/memory-device.h"
>>>>>> +#include "hw/virtio/virtio-mem-pci.h"
>>>>>>     #include "qapi/qapi-visit-common.h"
>>>>>>     #include "hw/virtio/virtio-iommu.h"
>>>>>> @@ -1407,6 +1409,7 @@ static void virt_machine_init(MachineState *machine)
>>>>>>         DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
>>>>>>         int i, base_hartid, hart_count;
>>>>>>         int socket_count = riscv_socket_count(machine);
>>>>>> +    hwaddr device_memory_base, device_memory_size;
>>>>>>         /* Check socket count limit */
>>>>>>         if (VIRT_SOCKETS_MAX < socket_count) {
>>>>>> @@ -1553,6 +1556,25 @@ static void virt_machine_init(MachineState *machine)
>>>>>>         memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
>>>>>>                                     mask_rom);
>>>>>> +    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
>>>>>> +                                  GiB);
>>>>>> +    device_memory_size = machine->maxram_size - machine->ram_size;
>>>>>> +
>>>>>> +    if (riscv_is_32bit(&s->soc[0])) {
>>>>>> +        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
>>>>>> +
>>>>>> +        if (memtop > UINT32_MAX) {
>>>>>> +            error_report("Memory exceeds 32-bit limit by %lu bytes",
>>>>>> +                         memtop - UINT32_MAX);
>>>>>> +            exit(EXIT_FAILURE);
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    if (device_memory_size > 0) {
>>>>>> +        machine_memory_devices_init(machine, device_memory_base,
>>>>>> +                                    device_memory_size);
>>>>>> +    }
>>>>>> +
>>>>>
>>>>> I think we need a design discussion before proceeding here. You're allocating all
>>>>> available memory as a memory device area, but in theory we might also support
>>>>> pc-dimm hotplugs (which would be the equivalent of adding physical RAM dimms to
>>>>> the board.) in the future too. If you're not familiar with this feature you can
>>>>> check it out the docs in [1].
>>>>
>>>> Note that DIMMs are memory devices as well. You can plug into the memory device area both, ACPI-based memory devices (DIMM, NVDIMM) or virtio-based memory devices (virtio-mem, virtio-pmem).
>>>>
>>>>>
>>>>> As an example, the 'virt' ARM board (hw/arm/virt.c) reserves a space for this
>>>>> type of hotplug by checking how much 'ram_slots' we're allocating for it:
>>>>>
>>>>> device_memory_size = ms->maxram_size - ms->ram_size + ms->ram_slots * GiB;
>>>>>
>>>>
>>>> Note that we increased the region size to be able to fit most requests even if alignment of memory devices is weird. See below.
>>>>
>>>> In sane setups, this is usually not required (adding a single additional GB for some flexibility might be good enough).
>>>>
>>>>> Other boards do the same with ms->ram_slots. We should consider doing it as well,
>>>>> now, even if we're not up to the point of supporting pc-dimm hotplug, to avoid
>>>>> having to change the memory layout later in the road and breaking existing
>>>>> setups.
>>>>>
>>>>> If we want to copy the ARM board, ram_slots is capped to ACPI_MAX_RAM_SLOTS (256).
>>>>> Each RAM slot is considered to be a 1GiB dimm, i.e. we would reserve 256GiB for
>>>>> them.
>>>>
>>>> This only reserves some *additional* space to fixup weird alignment of memory devices. *not* the actual space for these devices.
>>>>
>>>> We don't consider each DIMM to be 1 GiB in size, but add an additional 1 GiB in case we have to align DIMMs in physical address space.
>>>>
>>>> I *think* this dates back to old x86 handling where we aligned the address of each DIMM to be at a 1 GiB boundary. So if you would have plugged two 128 MiB DIMMs, you'd have required more than 256 MiB of space in the area after aligning inside the memory device area.
>>>>
>>>
>>> Thanks for the explanation. I missed the part where the ram_slots were being
>>> used just to solve potential alignment issues and pc-dimms could occupy the same
>>> space being allocated via machine_memory_devices_init().
>>>
>>> This patch isn't far off then. If we take care to avoid plugging unaligned memory
>>> we might not even need this spare area.
>> 
>> I'm a bit lost here, so please bear with me. We don't require the 1 GiB
>> alignment on RV AFAIU. I'm having a hard time figuring out what missing
>> in my patch.
>
> Forget about the 1 GiB size. This is something that we won't need to deal with
> because we don't align in 1 Gib.
>
> Let's say for example that we want to support pc-dimm hotplug of 256 slots like the
> 'virt' ARM machine does. Let's also say that we will allow users to hotplug any
> DIMM size they want, taking care of any alignment issues by ourselves.
>
> In hw/riscv/boot.c I see that our alignment sizes are 4Mb for 32 bits and 2Mb for
> 64 bits. Forget 32 bits a bit and let's say that our alignment is 2Mb.
>
> So, in a worst case scenario, an user could hotplug 256 slots, all of them unaligned,
> and then we would need to align each one of them by adding 2Mb. So, to account for
> this alignment fixup, we would need 256 * 2Mb RAM reserved for it.
>
> What I said about "If we take care to avoid plugging unaligned memory we might not even
> need this spare area" is a possible design where we would force every hotplugged DIMM
> to always be memory aligned, avoiding the need for this spare RAM for alignment. This
> would put a bit of extra strain on users/management apps to always deliver aligned
> DIMMs.
>
> In hindsight this is not needed. It's fairly easy to reserve ACPI_MAX_RAM_SLOTS * (2Mb/4Mb)
> and let users hotplug whatever DIMM size they want.
>
> Hope this explains the situation a bit. If we agree on allocating this spare RAM for
> hotplugged mem alignment, we'll probalby need a pre-patch to do a little handling of
> ms->ram_slots, limiting it to ACPI_MAX_RAM_SLOTS (ram_slots can be changed via command
> line). Then it's a matter of reserving ms->ram_slots * align_size when calculating
> device_memory_size.

Thanks for the elaborate explanation! I'll take a stab at it in v2.


Björn
David Hildenbrand May 21, 2024, 8:15 a.m. UTC | #9
On 19.05.24 11:24, Daniel Henrique Barboza wrote:
> 
> On 5/18/24 16:50, David Hildenbrand wrote:
>>
>> Hi,
>>
>>
>>>> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
>>>> index 4fdb66052587..16c2bdbfe6b6 100644
>>>> --- a/hw/riscv/virt.c
>>>> +++ b/hw/riscv/virt.c
>>>> @@ -53,6 +53,8 @@
>>>>     #include "hw/pci-host/gpex.h"
>>>>     #include "hw/display/ramfb.h"
>>>>     #include "hw/acpi/aml-build.h"
>>>> +#include "hw/mem/memory-device.h"
>>>> +#include "hw/virtio/virtio-mem-pci.h"
>>>>     #include "qapi/qapi-visit-common.h"
>>>>     #include "hw/virtio/virtio-iommu.h"
>>>> @@ -1407,6 +1409,7 @@ static void virt_machine_init(MachineState *machine)
>>>>         DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
>>>>         int i, base_hartid, hart_count;
>>>>         int socket_count = riscv_socket_count(machine);
>>>> +    hwaddr device_memory_base, device_memory_size;
>>>>         /* Check socket count limit */
>>>>         if (VIRT_SOCKETS_MAX < socket_count) {
>>>> @@ -1553,6 +1556,25 @@ static void virt_machine_init(MachineState *machine)
>>>>         memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
>>>>                                     mask_rom);
>>>> +    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
>>>> +                                  GiB);
>>>> +    device_memory_size = machine->maxram_size - machine->ram_size;
>>>> +
>>>> +    if (riscv_is_32bit(&s->soc[0])) {
>>>> +        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
>>>> +
>>>> +        if (memtop > UINT32_MAX) {
>>>> +            error_report("Memory exceeds 32-bit limit by %lu bytes",
>>>> +                         memtop - UINT32_MAX);
>>>> +            exit(EXIT_FAILURE);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (device_memory_size > 0) {
>>>> +        machine_memory_devices_init(machine, device_memory_base,
>>>> +                                    device_memory_size);
>>>> +    }
>>>> +
>>>
>>> I think we need a design discussion before proceeding here. You're allocating all
>>> available memory as a memory device area, but in theory we might also support
>>> pc-dimm hotplugs (which would be the equivalent of adding physical RAM dimms to
>>> the board.) in the future too. If you're not familiar with this feature you can
>>> check it out the docs in [1].
>>
>> Note that DIMMs are memory devices as well. You can plug into the memory device area both, ACPI-based memory devices (DIMM, NVDIMM) or virtio-based memory devices (virtio-mem, virtio-pmem).
>>
>>>
>>> As an example, the 'virt' ARM board (hw/arm/virt.c) reserves a space for this
>>> type of hotplug by checking how much 'ram_slots' we're allocating for it:
>>>
>>> device_memory_size = ms->maxram_size - ms->ram_size + ms->ram_slots * GiB;
>>>
>>
>> Note that we increased the region size to be able to fit most requests even if alignment of memory devices is weird. See below.
>>
>> In sane setups, this is usually not required (adding a single additional GB for some flexibility might be good enough).
>>
>>> Other boards do the same with ms->ram_slots. We should consider doing it as well,
>>> now, even if we're not up to the point of supporting pc-dimm hotplug, to avoid
>>> having to change the memory layout later in the road and breaking existing
>>> setups.
>>>
>>> If we want to copy the ARM board, ram_slots is capped to ACPI_MAX_RAM_SLOTS (256).
>>> Each RAM slot is considered to be a 1GiB dimm, i.e. we would reserve 256GiB for
>>> them.
>>
>> This only reserves some *additional* space to fixup weird alignment of memory devices. *not* the actual space for these devices.
>>
>> We don't consider each DIMM to be 1 GiB in size, but add an additional 1 GiB in case we have to align DIMMs in physical address space.
>>
>> I *think* this dates back to old x86 handling where we aligned the address of each DIMM to be at a 1 GiB boundary. So if you would have plugged two 128 MiB DIMMs, you'd have required more than 256 MiB of space in the area after aligning inside the memory device area.
>>
> 
> Thanks for the explanation. I missed the part where the ram_slots were being
> used just to solve potential alignment issues and pc-dimms could occupy the same
> space being allocated via machine_memory_devices_init().
> 
> This patch isn't far off then. If we take care to avoid plugging unaligned memory
> we might not even need this spare area.

Right.

> 
>>>
>>> Note: I do not have the visibility of discussions on the memory management space,
>>> and I might be missing details such as "we don't care about pc-dimm hotplug
>>> anymore, it's legacy, we're going to support only virtio-md-pci from now on". We
>>> had a situation like that with virtio-balloon and virtio-mem in the past, and I'm
>>> not sure if this might fall in the same category.
>>
>> Not sure if I got your comment right, but virtio-mem was never supposed to be a virtio-balloon replacement (especially of the free-page-reporting and memory stats part).
> 
> I was trying to refer to a situation we faced 3+ years ago in the powerpc machines,
> where we were trying to add virtio-mem support there given that virtio-mem is/was
> been seen (as far as I can remember anyways) as a more robust solution than
> virtio-balloon + DIMM hotplug for guest memory management from the host point of
> view.

I only heard once that ppc support was being worked on, but don't know
any details of what the issue was. PPC+KVM is not really a priority
anymore, so my primary focus for the future is adding s390x support (I 
had prototypes but some things are tricky).

Supporting architectures that do their own weird paravirtualized stuff 
already or don't really support memory hotplug is a bit more tricky than 
the others. PPC with dlpar falls into that category :)
diff mbox series

Patch

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index a2030e3a6ff0..08f82dbb681a 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -56,6 +56,8 @@  config RISCV_VIRT
     select PLATFORM_BUS
     select ACPI
     select ACPI_PCI
+    select VIRTIO_MEM_SUPPORTED
+    select VIRTIO_PMEM_SUPPORTED
 
 config SHAKTI_C
     bool
diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
index 0925528160f8..6dc3baa9ec86 100644
--- a/hw/riscv/virt-acpi-build.c
+++ b/hw/riscv/virt-acpi-build.c
@@ -610,6 +610,13 @@  build_srat(GArray *table_data, BIOSLinker *linker, RISCVVirtState *vms)
         }
     }
 
+    if (ms->device_memory) {
+        build_srat_memory(table_data, ms->device_memory->base,
+                          memory_region_size(&ms->device_memory->mr),
+                          ms->numa_state->num_nodes - 1,
+                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
+    }
+
     acpi_table_end(linker, &table);
 }
 
diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
index 4fdb66052587..16c2bdbfe6b6 100644
--- a/hw/riscv/virt.c
+++ b/hw/riscv/virt.c
@@ -53,6 +53,8 @@ 
 #include "hw/pci-host/gpex.h"
 #include "hw/display/ramfb.h"
 #include "hw/acpi/aml-build.h"
+#include "hw/mem/memory-device.h"
+#include "hw/virtio/virtio-mem-pci.h"
 #include "qapi/qapi-visit-common.h"
 #include "hw/virtio/virtio-iommu.h"
 
@@ -1407,6 +1409,7 @@  static void virt_machine_init(MachineState *machine)
     DeviceState *mmio_irqchip, *virtio_irqchip, *pcie_irqchip;
     int i, base_hartid, hart_count;
     int socket_count = riscv_socket_count(machine);
+    hwaddr device_memory_base, device_memory_size;
 
     /* Check socket count limit */
     if (VIRT_SOCKETS_MAX < socket_count) {
@@ -1553,6 +1556,25 @@  static void virt_machine_init(MachineState *machine)
     memory_region_add_subregion(system_memory, memmap[VIRT_MROM].base,
                                 mask_rom);
 
+    device_memory_base = ROUND_UP(s->memmap[VIRT_DRAM].base + machine->ram_size,
+                                  GiB);
+    device_memory_size = machine->maxram_size - machine->ram_size;
+
+    if (riscv_is_32bit(&s->soc[0])) {
+        hwaddr memtop = device_memory_base + ROUND_UP(device_memory_size, GiB);
+
+        if (memtop > UINT32_MAX) {
+            error_report("Memory exceeds 32-bit limit by %lu bytes",
+                         memtop - UINT32_MAX);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (device_memory_size > 0) {
+        machine_memory_devices_init(machine, device_memory_base,
+                                    device_memory_size);
+    }
+
     /*
      * Init fw_cfg. Must be done before riscv_load_fdt, otherwise the
      * device tree cannot be altered and we get FDT_ERR_NOSPACE.
@@ -1712,12 +1734,21 @@  static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine,
     MachineClass *mc = MACHINE_GET_CLASS(machine);
 
     if (device_is_dynamic_sysbus(mc, dev) ||
-        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) ||
+        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
         return HOTPLUG_HANDLER(machine);
     }
     return NULL;
 }
 
+static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
+                                            DeviceState *dev, Error **errp)
+{
+    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
+        virtio_md_pci_pre_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
+    }
+}
+
 static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
                                         DeviceState *dev, Error **errp)
 {
@@ -1735,6 +1766,34 @@  static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
     if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
         create_fdt_virtio_iommu(s, pci_get_bdf(PCI_DEVICE(dev)));
     }
+
+    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
+        virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
+    }
+}
+
+static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
+                                                  DeviceState *dev,
+                                                  Error **errp)
+{
+    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
+        virtio_md_pci_unplug_request(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev),
+                                     errp);
+    } else {
+        error_setg(errp, "device unplug request for unsupported device type: %s",
+                   object_get_typename(OBJECT(dev)));
+    }
+}
+
+static void virt_machine_device_unplug_cb(HotplugHandler *hotplug_dev,
+                                          DeviceState *dev, Error **errp)
+{
+    if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
+        virtio_md_pci_unplug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
+    } else {
+        error_setg(errp, "virt: device unplug for unsupported device"
+                   " type: %s", object_get_typename(OBJECT(dev)));
+    }
 }
 
 static void virt_machine_class_init(ObjectClass *oc, void *data)
@@ -1757,7 +1816,10 @@  static void virt_machine_class_init(ObjectClass *oc, void *data)
     assert(!mc->get_hotplug_handler);
     mc->get_hotplug_handler = virt_machine_get_hotplug_handler;
 
+    hc->pre_plug = virt_machine_device_pre_plug_cb;
     hc->plug = virt_machine_device_plug_cb;
+    hc->unplug_request = virt_machine_device_unplug_request_cb;
+    hc->unplug = virt_machine_device_unplug_cb;
 
     machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE);
 #ifdef CONFIG_TPM
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index ffd119ebacb7..26c976a3b9b8 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -161,7 +161,7 @@  static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
  * necessary (as the section size can change). But it's more likely that the
  * section size will rather get smaller and not bigger over time.
  */
-#if defined(TARGET_X86_64) || defined(TARGET_I386)
+#if defined(TARGET_X86_64) || defined(TARGET_I386)  || defined(TARGET_RISCV)
 #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
 #elif defined(TARGET_ARM)
 #define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))