
[v2,06/11] nvdimm acpi: initialize the resource used by NVDIMM ACPI

Message ID 1452624610-46945-7-git-send-email-guangrong.xiao@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Xiao Guangrong Jan. 12, 2016, 6:50 p.m. UTC
A 32-bit IO port starting at 0x0a18 in the guest is reserved for NVDIMM
ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
the NVDIMM ACPI binary code.

OSPM uses this port to tell QEMU the final address of the DSM memory
and to notify QEMU to emulate the DSM method.

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
---
 hw/acpi/Makefile.objs   |  2 +-
 hw/acpi/nvdimm.c        | 35 +++++++++++++++++++++++++++++++++++
 hw/i386/acpi-build.c    | 10 +---------
 hw/i386/pc.c            |  8 +++++---
 hw/i386/pc_piix.c       |  5 +++++
 hw/i386/pc_q35.c        |  8 +++++++-
 include/hw/i386/pc.h    |  5 ++++-
 include/hw/mem/nvdimm.h | 28 +++++++++++++++++++++++++++-
 8 files changed, 85 insertions(+), 16 deletions(-)
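
The IO read/write handlers added by this patch are deliberately left as
stubs. As a rough illustration only (the flow and names below are
assumptions, not something this patch implements), a follow-up could
consume the 32-bit guest-physical address that the AML writes to port
0x0a18 along these lines:

static void
nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    AcpiNVDIMMState *state = opaque;
    hwaddr dsm_mem_addr = val;         /* address of the guest DSM buffer */
    uint8_t in[TARGET_PAGE_SIZE];

    /* fetch the request that the OSPM placed in the DSM memory ... */
    cpu_physical_memory_read(dsm_mem_addr, in, sizeof(in));

    /* ... dispatch the requested _DSM function here, then write the
     * result back to dsm_mem_addr with cpu_physical_memory_write(). */
    (void)state;
}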

Comments

Michael S. Tsirkin Feb. 4, 2016, 4:22 p.m. UTC | #1
On Wed, Jan 13, 2016 at 02:50:05AM +0800, Xiao Guangrong wrote:
> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> NVDIMM ACPI binary code
> 
> OSPM uses this port to tell QEMU the final address of the DSM memory
> and notify QEMU to emulate the DSM method
> 
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>

This will have to be rebased now that guest info is gone.

> ---
>  hw/acpi/Makefile.objs   |  2 +-
>  hw/acpi/nvdimm.c        | 35 +++++++++++++++++++++++++++++++++++
>  hw/i386/acpi-build.c    | 10 +---------
>  hw/i386/pc.c            |  8 +++++---
>  hw/i386/pc_piix.c       |  5 +++++
>  hw/i386/pc_q35.c        |  8 +++++++-
>  include/hw/i386/pc.h    |  5 ++++-
>  include/hw/mem/nvdimm.h | 28 +++++++++++++++++++++++++++-
>  8 files changed, 85 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
> index f3ade9a..faee86c 100644
> --- a/hw/acpi/Makefile.objs
> +++ b/hw/acpi/Makefile.objs
> @@ -2,7 +2,7 @@ common-obj-$(CONFIG_ACPI_X86) += core.o piix4.o pcihp.o
>  common-obj-$(CONFIG_ACPI_X86_ICH) += ich9.o tco.o
>  common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu_hotplug.o cpu_hotplug_acpi_table.o
>  common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o memory_hotplug_acpi_table.o
> -common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
> +obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
>  common-obj-$(CONFIG_ACPI) += acpi_interface.o
>  common-obj-$(CONFIG_ACPI) += bios-linker-loader.o
>  common-obj-$(CONFIG_ACPI) += aml-build.o
> diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
> index df1b176..256cedd 100644
> --- a/hw/acpi/nvdimm.c
> +++ b/hw/acpi/nvdimm.c
> @@ -28,6 +28,7 @@
>  
>  #include "hw/acpi/acpi.h"
>  #include "hw/acpi/aml-build.h"
> +#include "hw/nvram/fw_cfg.h"
>  #include "hw/mem/nvdimm.h"
>  
>  static int nvdimm_plugged_device_list(Object *obj, void *opaque)
> @@ -369,6 +370,40 @@ static void nvdimm_build_nfit(GSList *device_list, GArray *table_offsets,
>      g_array_free(structures, true);
>  }
>  
> +static uint64_t
> +nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    return 0;
> +}
> +
> +static void
> +nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
> +{
> +}
> +
> +static const MemoryRegionOps nvdimm_dsm_ops = {
> +    .read = nvdimm_dsm_read,
> +    .write = nvdimm_dsm_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 4,
> +    },
> +};
> +
> +void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
> +                            FWCfgState *fw_cfg, Object *owner)
> +{
> +    memory_region_init_io(&state->io_mr, owner, &nvdimm_dsm_ops, state,
> +                          "nvdimm-acpi-io", NVDIMM_ACPI_IO_LEN);
> +    memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, &state->io_mr);
> +
> +    state->dsm_mem = g_array_new(false, true /* clear */, 1);
> +    acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
> +    fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
> +                    state->dsm_mem->len);
> +}
> +
>  #define NVDIMM_COMMON_DSM      "NCAL"
>  
>  static void nvdimm_build_common_dsm(Aml *dev)
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 1ca044f..c68cfb8 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -39,7 +39,6 @@
>  #include "hw/loader.h"
>  #include "hw/isa/isa.h"
>  #include "hw/acpi/memory_hotplug.h"
> -#include "hw/mem/nvdimm.h"
>  #include "sysemu/tpm.h"
>  #include "hw/acpi/tpm.h"
>  #include "sysemu/tpm_backend.h"
> @@ -2602,13 +2601,6 @@ static bool acpi_has_iommu(void)
>      return intel_iommu && !ambiguous;
>  }
>  
> -static bool acpi_has_nvdimm(void)
> -{
> -    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
> -
> -    return pcms->nvdimm;
> -}
> -
>  static
>  void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>  {
> @@ -2692,7 +2684,7 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>          build_dmar_q35(tables_blob, tables->linker);
>      }
>  
> -    if (acpi_has_nvdimm()) {
> +    if (guest_info->has_nvdimm) {
>          nvdimm_build_acpi(table_offsets, tables_blob, tables->linker);
>      }
>  
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index c36b8cf..397de28 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1228,6 +1228,8 @@ PcGuestInfo *pc_guest_info_init(PCMachineState *pcms)
>          }
>      }
>  
> +    guest_info->has_nvdimm = pcms->acpi_nvdimm_state.is_enabled;
> +
>      guest_info_state->machine_done.notify = pc_guest_info_machine_done;
>      qemu_add_machine_init_done_notifier(&guest_info_state->machine_done);
>      return guest_info;
> @@ -1877,14 +1879,14 @@ static bool pc_machine_get_nvdimm(Object *obj, Error **errp)
>  {
>      PCMachineState *pcms = PC_MACHINE(obj);
>  
> -    return pcms->nvdimm;
> +    return pcms->acpi_nvdimm_state.is_enabled;
>  }
>  
>  static void pc_machine_set_nvdimm(Object *obj, bool value, Error **errp)
>  {
>      PCMachineState *pcms = PC_MACHINE(obj);
>  
> -    pcms->nvdimm = value;
> +    pcms->acpi_nvdimm_state.is_enabled = value;
>  }
>  
>  static void pc_machine_initfn(Object *obj)
> @@ -1923,7 +1925,7 @@ static void pc_machine_initfn(Object *obj)
>                                      &error_abort);
>  
>      /* nvdimm is disabled on default. */
> -    pcms->nvdimm = false;
> +    pcms->acpi_nvdimm_state.is_enabled = false;
>      object_property_add_bool(obj, PC_MACHINE_NVDIMM, pc_machine_get_nvdimm,
>                               pc_machine_set_nvdimm, &error_abort);
>  }
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 438cdae..2fee478 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -281,6 +281,11 @@ static void pc_init1(MachineState *machine,
>      if (pcmc->pci_enabled) {
>          pc_pci_device_init(pci_bus);
>      }
> +
> +    if (guest_info->has_nvdimm) {
> +        nvdimm_init_acpi_state(&pcms->acpi_nvdimm_state, system_io,
> +                               guest_info->fw_cfg, OBJECT(pcms));
> +    }
>  }
>  
>  /* Looking for a pc_compat_2_4() function? It doesn't exist.
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 412b3cd..c9334d5 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -60,6 +60,7 @@ static void pc_q35_init(MachineState *machine)
>      PCIDevice *lpc;
>      BusState *idebus[MAX_SATA_PORTS];
>      ISADevice *rtc_state;
> +    MemoryRegion *system_io = get_system_io();
>      MemoryRegion *pci_memory;
>      MemoryRegion *rom_memory;
>      MemoryRegion *ram_memory;
> @@ -176,7 +177,7 @@ static void pc_q35_init(MachineState *machine)
>      q35_host->mch.ram_memory = ram_memory;
>      q35_host->mch.pci_address_space = pci_memory;
>      q35_host->mch.system_memory = get_system_memory();
> -    q35_host->mch.address_space_io = get_system_io();
> +    q35_host->mch.address_space_io = system_io;
>      q35_host->mch.below_4g_mem_size = pcms->below_4g_mem_size;
>      q35_host->mch.above_4g_mem_size = pcms->above_4g_mem_size;
>      /* pci */
> @@ -267,6 +268,11 @@ static void pc_q35_init(MachineState *machine)
>      if (pcmc->pci_enabled) {
>          pc_pci_device_init(host_bus);
>      }
> +
> +    if (guest_info->has_nvdimm) {
> +        nvdimm_init_acpi_state(&pcms->acpi_nvdimm_state, system_io,
> +                               guest_info->fw_cfg, OBJECT(pcms));
> +    }
>  }
>  
>  /* Looking for a pc_compat_2_4() function? It doesn't exist.
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 8122229..362ddc4 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -17,6 +17,7 @@
>  #include "hw/boards.h"
>  #include "hw/compat.h"
>  #include "hw/mem/pc-dimm.h"
> +#include "hw/mem/nvdimm.h"
>  
>  #define HPET_INTCAP "hpet-intcap"
>  
> @@ -55,7 +56,8 @@ struct PCMachineState {
>      uint64_t max_ram_below_4g;
>      OnOffAuto vmport;
>      OnOffAuto smm;
> -    bool nvdimm;
> +
> +    AcpiNVDIMMState acpi_nvdimm_state;
>  
>      /* RAM information (sizes, addresses, configuration): */
>      ram_addr_t below_4g_mem_size, above_4g_mem_size;
> @@ -161,6 +163,7 @@ struct PcGuestInfo {
>      bool has_acpi_build;
>      bool has_reserved_memory;
>      bool rsdp_in_ram;
> +    bool has_nvdimm;
>  };
>  
>  /* parallel.c */
> diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
> index 49183c1..634c60b 100644
> --- a/include/hw/mem/nvdimm.h
> +++ b/include/hw/mem/nvdimm.h
> @@ -25,8 +25,34 @@
>  
>  #include "hw/mem/pc-dimm.h"
>  
> -#define TYPE_NVDIMM      "nvdimm"
> +#define TYPE_NVDIMM             "nvdimm"
>  
> +#define NVDIMM_DSM_MEM_FILE     "etc/acpi/nvdimm-mem"
> +
> +/*
> + * 32 bits IO port starting from 0x0a18 in guest is reserved for
> + * NVDIMM ACPI emulation.
> + */
> +#define NVDIMM_ACPI_IO_BASE     0x0a18
> +#define NVDIMM_ACPI_IO_LEN      4
> +
> +/*
> + * AcpiNVDIMMState:
> + * @is_enabled: detect if NVDIMM support is enabled.
> + *
> + * @dsm_mem: the data of the fw_cfg file NVDIMM_DSM_MEM_FILE.
> + * @io_mr: the IO region used by OSPM to transfer control to QEMU.
> + */
> +struct AcpiNVDIMMState {
> +    bool is_enabled;
> +
> +    GArray *dsm_mem;
> +    MemoryRegion io_mr;
> +};
> +typedef struct AcpiNVDIMMState AcpiNVDIMMState;
> +
> +void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
> +                            FWCfgState *fw_cfg, Object *owner);
>  void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
>                         GArray *linker);
>  #endif
> -- 
> 1.8.3.1
Igor Mammedov Feb. 8, 2016, 11:03 a.m. UTC | #2
On Wed, 13 Jan 2016 02:50:05 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> NVDIMM ACPI binary code
> 
> OSPM uses this port to tell QEMU the final address of the DSM memory
> and notify QEMU to emulate the DSM method
Would you need to pass control to QEMU if each NVDIMM had its whole
label area MemoryRegion mapped right after its storage MemoryRegion?

> 
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> ---
>  hw/acpi/Makefile.objs   |  2 +-
>  hw/acpi/nvdimm.c        | 35 +++++++++++++++++++++++++++++++++++
>  hw/i386/acpi-build.c    | 10 +---------
>  hw/i386/pc.c            |  8 +++++---
>  hw/i386/pc_piix.c       |  5 +++++
>  hw/i386/pc_q35.c        |  8 +++++++-
>  include/hw/i386/pc.h    |  5 ++++-
>  include/hw/mem/nvdimm.h | 28 +++++++++++++++++++++++++++-
>  8 files changed, 85 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
> index f3ade9a..faee86c 100644
> --- a/hw/acpi/Makefile.objs
> +++ b/hw/acpi/Makefile.objs
> @@ -2,7 +2,7 @@ common-obj-$(CONFIG_ACPI_X86) += core.o piix4.o pcihp.o
>  common-obj-$(CONFIG_ACPI_X86_ICH) += ich9.o tco.o
>  common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu_hotplug.o cpu_hotplug_acpi_table.o
>  common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o memory_hotplug_acpi_table.o
> -common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
> +obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
>  common-obj-$(CONFIG_ACPI) += acpi_interface.o
>  common-obj-$(CONFIG_ACPI) += bios-linker-loader.o
>  common-obj-$(CONFIG_ACPI) += aml-build.o
> diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
> index df1b176..256cedd 100644
> --- a/hw/acpi/nvdimm.c
> +++ b/hw/acpi/nvdimm.c
> @@ -28,6 +28,7 @@
>  
>  #include "hw/acpi/acpi.h"
>  #include "hw/acpi/aml-build.h"
> +#include "hw/nvram/fw_cfg.h"
>  #include "hw/mem/nvdimm.h"
>  
>  static int nvdimm_plugged_device_list(Object *obj, void *opaque)
> @@ -369,6 +370,40 @@ static void nvdimm_build_nfit(GSList *device_list, GArray *table_offsets,
>      g_array_free(structures, true);
>  }
>  
> +static uint64_t
> +nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    return 0;
> +}
> +
> +static void
> +nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
> +{
> +}
> +
> +static const MemoryRegionOps nvdimm_dsm_ops = {
> +    .read = nvdimm_dsm_read,
> +    .write = nvdimm_dsm_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 4,
> +    },
> +};
> +
> +void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
> +                            FWCfgState *fw_cfg, Object *owner)
> +{
> +    memory_region_init_io(&state->io_mr, owner, &nvdimm_dsm_ops, state,
> +                          "nvdimm-acpi-io", NVDIMM_ACPI_IO_LEN);
> +    memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, &state->io_mr);
> +
> +    state->dsm_mem = g_array_new(false, true /* clear */, 1);
> +    acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
> +    fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
> +                    state->dsm_mem->len);
> +}
> +
>  #define NVDIMM_COMMON_DSM      "NCAL"
>  
>  static void nvdimm_build_common_dsm(Aml *dev)
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 1ca044f..c68cfb8 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -39,7 +39,6 @@
>  #include "hw/loader.h"
>  #include "hw/isa/isa.h"
>  #include "hw/acpi/memory_hotplug.h"
> -#include "hw/mem/nvdimm.h"
>  #include "sysemu/tpm.h"
>  #include "hw/acpi/tpm.h"
>  #include "sysemu/tpm_backend.h"
> @@ -2602,13 +2601,6 @@ static bool acpi_has_iommu(void)
>      return intel_iommu && !ambiguous;
>  }
>  
> -static bool acpi_has_nvdimm(void)
> -{
> -    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
> -
> -    return pcms->nvdimm;
> -}
> -
>  static
>  void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>  {
> @@ -2692,7 +2684,7 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>          build_dmar_q35(tables_blob, tables->linker);
>      }
>  
> -    if (acpi_has_nvdimm()) {
> +    if (guest_info->has_nvdimm) {
>          nvdimm_build_acpi(table_offsets, tables_blob, tables->linker);
>      }
>  
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index c36b8cf..397de28 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1228,6 +1228,8 @@ PcGuestInfo *pc_guest_info_init(PCMachineState *pcms)
>          }
>      }
>  
> +    guest_info->has_nvdimm = pcms->acpi_nvdimm_state.is_enabled;
> +
>      guest_info_state->machine_done.notify = pc_guest_info_machine_done;
>      qemu_add_machine_init_done_notifier(&guest_info_state->machine_done);
>      return guest_info;
> @@ -1877,14 +1879,14 @@ static bool pc_machine_get_nvdimm(Object *obj, Error **errp)
>  {
>      PCMachineState *pcms = PC_MACHINE(obj);
>  
> -    return pcms->nvdimm;
> +    return pcms->acpi_nvdimm_state.is_enabled;
>  }
>  
>  static void pc_machine_set_nvdimm(Object *obj, bool value, Error **errp)
>  {
>      PCMachineState *pcms = PC_MACHINE(obj);
>  
> -    pcms->nvdimm = value;
> +    pcms->acpi_nvdimm_state.is_enabled = value;
>  }
>  
>  static void pc_machine_initfn(Object *obj)
> @@ -1923,7 +1925,7 @@ static void pc_machine_initfn(Object *obj)
>                                      &error_abort);
>  
>      /* nvdimm is disabled on default. */
> -    pcms->nvdimm = false;
> +    pcms->acpi_nvdimm_state.is_enabled = false;
>      object_property_add_bool(obj, PC_MACHINE_NVDIMM, pc_machine_get_nvdimm,
>                               pc_machine_set_nvdimm, &error_abort);
>  }
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 438cdae..2fee478 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -281,6 +281,11 @@ static void pc_init1(MachineState *machine,
>      if (pcmc->pci_enabled) {
>          pc_pci_device_init(pci_bus);
>      }
> +
> +    if (guest_info->has_nvdimm) {
> +        nvdimm_init_acpi_state(&pcms->acpi_nvdimm_state, system_io,
> +                               guest_info->fw_cfg, OBJECT(pcms));
> +    }
>  }
>  
>  /* Looking for a pc_compat_2_4() function? It doesn't exist.
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 412b3cd..c9334d5 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -60,6 +60,7 @@ static void pc_q35_init(MachineState *machine)
>      PCIDevice *lpc;
>      BusState *idebus[MAX_SATA_PORTS];
>      ISADevice *rtc_state;
> +    MemoryRegion *system_io = get_system_io();
>      MemoryRegion *pci_memory;
>      MemoryRegion *rom_memory;
>      MemoryRegion *ram_memory;
> @@ -176,7 +177,7 @@ static void pc_q35_init(MachineState *machine)
>      q35_host->mch.ram_memory = ram_memory;
>      q35_host->mch.pci_address_space = pci_memory;
>      q35_host->mch.system_memory = get_system_memory();
> -    q35_host->mch.address_space_io = get_system_io();
> +    q35_host->mch.address_space_io = system_io;
>      q35_host->mch.below_4g_mem_size = pcms->below_4g_mem_size;
>      q35_host->mch.above_4g_mem_size = pcms->above_4g_mem_size;
>      /* pci */
> @@ -267,6 +268,11 @@ static void pc_q35_init(MachineState *machine)
>      if (pcmc->pci_enabled) {
>          pc_pci_device_init(host_bus);
>      }
> +
> +    if (guest_info->has_nvdimm) {
> +        nvdimm_init_acpi_state(&pcms->acpi_nvdimm_state, system_io,
> +                               guest_info->fw_cfg, OBJECT(pcms));
> +    }
>  }
>  
>  /* Looking for a pc_compat_2_4() function? It doesn't exist.
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 8122229..362ddc4 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -17,6 +17,7 @@
>  #include "hw/boards.h"
>  #include "hw/compat.h"
>  #include "hw/mem/pc-dimm.h"
> +#include "hw/mem/nvdimm.h"
>  
>  #define HPET_INTCAP "hpet-intcap"
>  
> @@ -55,7 +56,8 @@ struct PCMachineState {
>      uint64_t max_ram_below_4g;
>      OnOffAuto vmport;
>      OnOffAuto smm;
> -    bool nvdimm;
> +
> +    AcpiNVDIMMState acpi_nvdimm_state;
>  
>      /* RAM information (sizes, addresses, configuration): */
>      ram_addr_t below_4g_mem_size, above_4g_mem_size;
> @@ -161,6 +163,7 @@ struct PcGuestInfo {
>      bool has_acpi_build;
>      bool has_reserved_memory;
>      bool rsdp_in_ram;
> +    bool has_nvdimm;
>  };
>  
>  /* parallel.c */
> diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
> index 49183c1..634c60b 100644
> --- a/include/hw/mem/nvdimm.h
> +++ b/include/hw/mem/nvdimm.h
> @@ -25,8 +25,34 @@
>  
>  #include "hw/mem/pc-dimm.h"
>  
> -#define TYPE_NVDIMM      "nvdimm"
> +#define TYPE_NVDIMM             "nvdimm"
>  
> +#define NVDIMM_DSM_MEM_FILE     "etc/acpi/nvdimm-mem"
> +
> +/*
> + * 32 bits IO port starting from 0x0a18 in guest is reserved for
> + * NVDIMM ACPI emulation.
> + */
> +#define NVDIMM_ACPI_IO_BASE     0x0a18
> +#define NVDIMM_ACPI_IO_LEN      4
> +
> +/*
> + * AcpiNVDIMMState:
> + * @is_enabled: detect if NVDIMM support is enabled.
> + *
> + * @dsm_mem: the data of the fw_cfg file NVDIMM_DSM_MEM_FILE.
> + * @io_mr: the IO region used by OSPM to transfer control to QEMU.
> + */
> +struct AcpiNVDIMMState {
> +    bool is_enabled;
> +
> +    GArray *dsm_mem;
> +    MemoryRegion io_mr;
> +};
> +typedef struct AcpiNVDIMMState AcpiNVDIMMState;
> +
> +void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
> +                            FWCfgState *fw_cfg, Object *owner);
>  void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
>                         GArray *linker);
>  #endif

Xiao Guangrong Feb. 14, 2016, 5:57 a.m. UTC | #3
On 02/08/2016 07:03 PM, Igor Mammedov wrote:
> On Wed, 13 Jan 2016 02:50:05 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>
>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
>> NVDIMM ACPI binary code
>>
>> OSPM uses this port to tell QEMU the final address of the DSM memory
>> and notify QEMU to emulate the DSM method
> Would you need to pass control to QEMU if each NVDIMM had its whole
> label area MemoryRegion mapped right after its storage MemoryRegion?
>

No, the label data is not mapped into the guest's address space; it can
only be accessed indirectly via the DSM method.
Igor Mammedov Feb. 15, 2016, 9:11 a.m. UTC | #4
On Sun, 14 Feb 2016 13:57:27 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> On 02/08/2016 07:03 PM, Igor Mammedov wrote:
> > On Wed, 13 Jan 2016 02:50:05 +0800
> > Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >  
> >> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> >> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> >> NVDIMM ACPI binary code
> >>
> >> OSPM uses this port to tell QEMU the final address of the DSM memory
> >> and notify QEMU to emulate the DSM method  
> > Would you need to pass control to QEMU if each NVDIMM had its whole
> > label area MemoryRegion mapped right after its storage MemoryRegion?
> >  
> 
> No, label data is not mapped into guest's address space and it only
> can be accessed by DSM method indirectly.
Yep, per spec the label data should be accessed via _DSM, but the question
wasn't about that.
Why would one map only a 4Kb window and serialize the label data through
it if the whole area could be mapped? That way the _DSM method would be
much less complicated, and there would be no need to add/support a protocol
for its serialization.
 

Michael S. Tsirkin Feb. 15, 2016, 9:18 a.m. UTC | #5
On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
> On Sun, 14 Feb 2016 13:57:27 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> 
> > On 02/08/2016 07:03 PM, Igor Mammedov wrote:
> > > On Wed, 13 Jan 2016 02:50:05 +0800
> > > Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> > >  
> > >> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> > >> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> > >> NVDIMM ACPI binary code
> > >>
> > >> OSPM uses this port to tell QEMU the final address of the DSM memory
> > >> and notify QEMU to emulate the DSM method  
> > > Would you need to pass control to QEMU if each NVDIMM had its whole
> > > label area MemoryRegion mapped right after its storage MemoryRegion?
> > >  
> > 
> > No, label data is not mapped into guest's address space and it only
> > can be accessed by DSM method indirectly.
> Yep, per spec label data should be accessed via _DSM but question
> wasn't about it,
> Why would one map only 4Kb window and serialize label data
> via it if it could be mapped as whole, that way _DMS method will be
> much less complicated and there won't be need to add/support a protocol
> for its serialization.
>  

Is it ever accessed on the data path? If not, I prefer the current approach:
limit the window used; the serialization protocol seems rather simple.
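
For orientation, one plausible layout of the 4K page behind
NVDIMM_DSM_MEM_FILE is sketched below. This is only an assumption for
illustration (the struct name and fields are hypothetical; the real layout
is defined later in the series, not in this patch): the OSPM fills in a
request, writes the page's guest address to the IO port, and QEMU writes
the result back into the same page.

#include <stdint.h>

/* Hypothetical request layout filling the 4 KiB DSM page. */
struct NvdimmDsmIn {
    uint32_t handle;        /* which NVDIMM (or root device) the _DSM targets */
    uint32_t revision;      /* _DSM revision id */
    uint32_t function;      /* _DSM function index */
    uint8_t  arg3[4084];    /* function-specific input, rest of the 4K page */
} /* QEMU_PACKED */;
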
Xiao Guangrong Feb. 15, 2016, 10:13 a.m. UTC | #6
On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
> On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
>> On Sun, 14 Feb 2016 13:57:27 +0800
>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>
>>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:
>>>> On Wed, 13 Jan 2016 02:50:05 +0800
>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>
>>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
>>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
>>>>> NVDIMM ACPI binary code
>>>>>
>>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
>>>>> and notify QEMU to emulate the DSM method
>>>> Would you need to pass control to QEMU if each NVDIMM had its whole
>>>> label area MemoryRegion mapped right after its storage MemoryRegion?
>>>>
>>>
>>> No, label data is not mapped into guest's address space and it only
>>> can be accessed by DSM method indirectly.
>> Yep, per spec label data should be accessed via _DSM but question
>> wasn't about it,

Ah, sorry, I missed your question.

>> Why would one map only 4Kb window and serialize label data
>> via it if it could be mapped as whole, that way _DMS method will be
>> much less complicated and there won't be need to add/support a protocol
>> for its serialization.
>>
>
> Is it ever accessed on data path? If not I prefer the current approach:

The label data is only accessed via two DSM commands - Get Namespace Label
Data and Set Namespace Label Data; nothing else needs to be emulated.

> limit the window used, the serialization protocol seems rather simple.
>

Yes.

The label data is at least 128k, which is quite big for the BIOS, as it
allocates memory in the 0 ~ 4G range, which is a tight region. It also
needs the guest OS to support a larger max-xfer (the maximum size that can
be transferred at one time); the size in the current Linux NVDIMM driver
is 4k.

However, using a larger DSM buffer can help us simplify NVDIMM hotplug for
the case where so many nvdimm devices are present in the system that their
FIT info cannot fit into one page. Each PMEM-only device needs 0xb8 bytes,
and we can append 256 memory devices at most, so 12 pages are needed to
contain this info. The prototype we implemented uses a self-defined
protocol to read pieces of the _FIT and concatenate them before returning
to the guest; please refer to:
https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
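
For reference, the arithmetic behind the sizes quoted above (4 KiB pages
assumed; the 128k label size and 4k max-xfer figures are the ones from
this mail):

/*
 * Label access through the 4K window:
 *   128 KiB label area / 4 KiB max-xfer = 32 _DSM round trips per full pass
 *
 * _FIT sizing for hotplug:
 *   0xb8 bytes = 184 bytes per PMEM-only NVDIMM
 *   184 bytes * 256 devices = 47104 bytes
 *   47104 / 4096 = 11.5  ->  rounds up to 12 pages
 */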

As 12 pages is not a small region for the BIOS, and the _FIT size may be
extended in future development (e.g., if PBLK is introduced), I am not sure
if we need this. Of course, another approach to simplify it is to limit the
number of NVDIMM devices to make sure their _FIT is < 4k.




Michael S. Tsirkin Feb. 15, 2016, 10:30 a.m. UTC | #7
On Mon, Feb 15, 2016 at 06:13:38PM +0800, Xiao Guangrong wrote:
> 
> 
> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
> >On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
> >>On Sun, 14 Feb 2016 13:57:27 +0800
> >>Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>
> >>>On 02/08/2016 07:03 PM, Igor Mammedov wrote:
> >>>>On Wed, 13 Jan 2016 02:50:05 +0800
> >>>>Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>
> >>>>>32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> >>>>>ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> >>>>>NVDIMM ACPI binary code
> >>>>>
> >>>>>OSPM uses this port to tell QEMU the final address of the DSM memory
> >>>>>and notify QEMU to emulate the DSM method
> >>>>Would you need to pass control to QEMU if each NVDIMM had its whole
> >>>>label area MemoryRegion mapped right after its storage MemoryRegion?
> >>>>
> >>>
> >>>No, label data is not mapped into guest's address space and it only
> >>>can be accessed by DSM method indirectly.
> >>Yep, per spec label data should be accessed via _DSM but question
> >>wasn't about it,
> 
> Ah, sorry, i missed your question.
> 
> >>Why would one map only 4Kb window and serialize label data
> >>via it if it could be mapped as whole, that way _DMS method will be
> >>much less complicated and there won't be need to add/support a protocol
> >>for its serialization.
> >>
> >
> >Is it ever accessed on data path? If not I prefer the current approach:
> 
> The label data is only accessed via two DSM commands - Get Namespace Label
> Data and Set Namespace Label Data, no other place need to be emulated.
> 
> >limit the window used, the serialization protocol seems rather simple.
> >
> 
> Yes.
> 
> Label data is at least 128k which is big enough for BIOS as it allocates
> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
> lager max-xfer (the max size that can be transferred one time), the size
> in current Linux NVDIMM driver is 4k.
> 
> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
> the case that too many nvdimm devices present in the system and their FIT
> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
> and we can append 256 memory devices at most, so 12 pages are needed to
> contain this info. The prototype we implemented is using ourself-defined
> protocol to read piece of _FIT and concatenate them before return to Guest,
> please refer to:
> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
> 
> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
> course, another approach to simplify it is that we limit the number of NVDIMM
> device to make sure their _FIT < 4k.
> 
> 
> 

A bigger buffer would only be reasonable in the 64-bit window (>4G);
the <4G area is too busy for that.
Would that be a problem?
Igor Mammedov Feb. 15, 2016, 10:47 a.m. UTC | #8
On Mon, 15 Feb 2016 18:13:38 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
> > On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:  
> >> On Sun, 14 Feb 2016 13:57:27 +0800
> >> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>  
> >>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:  
> >>>> On Wed, 13 Jan 2016 02:50:05 +0800
> >>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>  
> >>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> >>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> >>>>> NVDIMM ACPI binary code
> >>>>>
> >>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
> >>>>> and notify QEMU to emulate the DSM method  
> >>>> Would you need to pass control to QEMU if each NVDIMM had its whole
> >>>> label area MemoryRegion mapped right after its storage MemoryRegion?
> >>>>  
> >>>
> >>> No, label data is not mapped into guest's address space and it only
> >>> can be accessed by DSM method indirectly.  
> >> Yep, per spec label data should be accessed via _DSM but question
> >> wasn't about it,  
> 
> Ah, sorry, i missed your question.
> 
> >> Why would one map only 4Kb window and serialize label data
> >> via it if it could be mapped as whole, that way _DMS method will be
> >> much less complicated and there won't be need to add/support a protocol
> >> for its serialization.
> >>  
> >
> > Is it ever accessed on data path? If not I prefer the current approach:  
> 
> The label data is only accessed via two DSM commands - Get Namespace Label
> Data and Set Namespace Label Data, no other place need to be emulated.
> 
> > limit the window used, the serialization protocol seems rather simple.
> >  
> 
> Yes.
> 
> Label data is at least 128k which is big enough for BIOS as it allocates
> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
> lager max-xfer (the max size that can be transferred one time), the size
> in current Linux NVDIMM driver is 4k.
> 
> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
> the case that too many nvdimm devices present in the system and their FIT
> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
> and we can append 256 memory devices at most, so 12 pages are needed to
> contain this info. The prototype we implemented is using ourself-defined
> protocol to read piece of _FIT and concatenate them before return to Guest,
> please refer to:
> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
> 
> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
> course, another approach to simplify it is that we limit the number of NVDIMM
> device to make sure their _FIT < 4k.
My suggestion is not to have only one label area for every NVDIMM, but
rather to map each label area right after each NVDIMM's data memory.
That way _DSM can be made non-serialized and the guest could handle
label data in parallel.

As for the _FIT, we can use the same approach as memory hotplug
(an IO port window), or Michael's idea of adding a vendor-specific
PCI config region to the current PM device to avoid using IO ports.
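
To make the first suggestion concrete, a minimal sketch, assuming the label
area is exposed as plain guest RAM mapped right behind the data region (the
helper name and parameters are illustrative, not part of this series):

static void nvdimm_map_with_label(MemoryRegion *sysmem, Object *owner,
                                  MemoryRegion *data_mr, hwaddr base,
                                  uint64_t data_size, uint64_t label_size)
{
    MemoryRegion *label_mr = g_new0(MemoryRegion, 1);

    /* back the label area with RAM so the guest can read/write it
     * directly, with no trap into QEMU */
    memory_region_init_ram(label_mr, owner, "nvdimm-label", label_size,
                           &error_abort);

    /* data first, label area mapped immediately after it */
    memory_region_add_subregion(sysmem, base, data_mr);
    memory_region_add_subregion(sysmem, base + data_size, label_mr);
}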


Xiao Guangrong Feb. 15, 2016, 11:22 a.m. UTC | #9
On 02/15/2016 06:47 PM, Igor Mammedov wrote:
> On Mon, 15 Feb 2016 18:13:38 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>
>> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
>>> On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
>>>> On Sun, 14 Feb 2016 13:57:27 +0800
>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>
>>>>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:
>>>>>> On Wed, 13 Jan 2016 02:50:05 +0800
>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>
>>>>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
>>>>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
>>>>>>> NVDIMM ACPI binary code
>>>>>>>
>>>>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
>>>>>>> and notify QEMU to emulate the DSM method
>>>>>> Would you need to pass control to QEMU if each NVDIMM had its whole
>>>>>> label area MemoryRegion mapped right after its storage MemoryRegion?
>>>>>>
>>>>>
>>>>> No, label data is not mapped into guest's address space and it only
>>>>> can be accessed by DSM method indirectly.
>>>> Yep, per spec label data should be accessed via _DSM but question
>>>> wasn't about it,
>>
>> Ah, sorry, i missed your question.
>>
>>>> Why would one map only 4Kb window and serialize label data
>>>> via it if it could be mapped as whole, that way _DMS method will be
>>>> much less complicated and there won't be need to add/support a protocol
>>>> for its serialization.
>>>>
>>>
>>> Is it ever accessed on data path? If not I prefer the current approach:
>>
>> The label data is only accessed via two DSM commands - Get Namespace Label
>> Data and Set Namespace Label Data, no other place need to be emulated.
>>
>>> limit the window used, the serialization protocol seems rather simple.
>>>
>>
>> Yes.
>>
>> Label data is at least 128k which is big enough for BIOS as it allocates
>> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
>> lager max-xfer (the max size that can be transferred one time), the size
>> in current Linux NVDIMM driver is 4k.
>>
>> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
>> the case that too many nvdimm devices present in the system and their FIT
>> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
>> and we can append 256 memory devices at most, so 12 pages are needed to
>> contain this info. The prototype we implemented is using ourself-defined
>> protocol to read piece of _FIT and concatenate them before return to Guest,
>> please refer to:
>> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
>>
>> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
>> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
>> course, another approach to simplify it is that we limit the number of NVDIMM
>> device to make sure their _FIT < 4k.
> My suggestion is not to have only one label area for every NVDIMM but
> rather to map each label area right after each NVDIMM's data memory.
> That way _DMS can be made non-serialized and guest could handle
> label data in parallel.
>

Sounds great to me. I like this idea. :D

> As for a _FIT we can use the same approach as mem hotplug
> (IO port window) or Michael's idea to add vendor specific
> PCI_config region to a current PM device to avoid using
> IO ports.

Thanks for your reminder, I will look into it.

Michael S. Tsirkin Feb. 15, 2016, 11:36 a.m. UTC | #10
On Mon, Feb 15, 2016 at 06:13:38PM +0800, Xiao Guangrong wrote:
> 
> 
> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
> >On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
> >>On Sun, 14 Feb 2016 13:57:27 +0800
> >>Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>
> >>>On 02/08/2016 07:03 PM, Igor Mammedov wrote:
> >>>>On Wed, 13 Jan 2016 02:50:05 +0800
> >>>>Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>
> >>>>>32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> >>>>>ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> >>>>>NVDIMM ACPI binary code
> >>>>>
> >>>>>OSPM uses this port to tell QEMU the final address of the DSM memory
> >>>>>and notify QEMU to emulate the DSM method
> >>>>Would you need to pass control to QEMU if each NVDIMM had its whole
> >>>>label area MemoryRegion mapped right after its storage MemoryRegion?
> >>>>
> >>>
> >>>No, label data is not mapped into guest's address space and it only
> >>>can be accessed by DSM method indirectly.
> >>Yep, per spec label data should be accessed via _DSM but question
> >>wasn't about it,
> 
> Ah, sorry, i missed your question.
> 
> >>Why would one map only 4Kb window and serialize label data
> >>via it if it could be mapped as whole, that way _DMS method will be
> >>much less complicated and there won't be need to add/support a protocol
> >>for its serialization.
> >>
> >
> >Is it ever accessed on data path? If not I prefer the current approach:
> 
> The label data is only accessed via two DSM commands - Get Namespace Label
> Data and Set Namespace Label Data, no other place need to be emulated.
> 
> >limit the window used, the serialization protocol seems rather simple.
> >
> 
> Yes.
> 
> Label data is at least 128k which is big enough for BIOS as it allocates
> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
> lager max-xfer (the max size that can be transferred one time), the size
> in current Linux NVDIMM driver is 4k.
> 
> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
> the case that too many nvdimm devices present in the system and their FIT
> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
> and we can append 256 memory devices at most, so 12 pages are needed to
> contain this info.

BTW 12 pages isn't too scary even for BIOS.

> The prototype we implemented is using ourself-defined
> protocol to read piece of _FIT and concatenate them before return to Guest,
> please refer to:
> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
> 
> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
> course, another approach to simplify it is that we limit the number of NVDIMM
> device to make sure their _FIT < 4k.
> 
> 
> 
Michael S. Tsirkin Feb. 15, 2016, 11:45 a.m. UTC | #11
On Mon, Feb 15, 2016 at 11:47:42AM +0100, Igor Mammedov wrote:
> On Mon, 15 Feb 2016 18:13:38 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> 
> > On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
> > > On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:  
> > >> On Sun, 14 Feb 2016 13:57:27 +0800
> > >> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> > >>  
> > >>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:  
> > >>>> On Wed, 13 Jan 2016 02:50:05 +0800
> > >>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> > >>>>  
> > >>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> > >>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> > >>>>> NVDIMM ACPI binary code
> > >>>>>
> > >>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
> > >>>>> and notify QEMU to emulate the DSM method  
> > >>>> Would you need to pass control to QEMU if each NVDIMM had its whole
> > >>>> label area MemoryRegion mapped right after its storage MemoryRegion?
> > >>>>  
> > >>>
> > >>> No, label data is not mapped into guest's address space and it only
> > >>> can be accessed by DSM method indirectly.  
> > >> Yep, per spec label data should be accessed via _DSM but question
> > >> wasn't about it,  
> > 
> > Ah, sorry, i missed your question.
> > 
> > >> Why would one map only 4Kb window and serialize label data
> > >> via it if it could be mapped as whole, that way _DMS method will be
> > >> much less complicated and there won't be need to add/support a protocol
> > >> for its serialization.
> > >>  
> > >
> > > Is it ever accessed on data path? If not I prefer the current approach:  
> > 
> > The label data is only accessed via two DSM commands - Get Namespace Label
> > Data and Set Namespace Label Data, no other place need to be emulated.
> > 
> > > limit the window used, the serialization protocol seems rather simple.
> > >  
> > 
> > Yes.
> > 
> > Label data is at least 128k which is big enough for BIOS as it allocates
> > memory at 0 ~ 4G which is tight region. It also needs guest OS to support
> > lager max-xfer (the max size that can be transferred one time), the size
> > in current Linux NVDIMM driver is 4k.
> > 
> > However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
> > the case that too many nvdimm devices present in the system and their FIT
> > info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
> > and we can append 256 memory devices at most, so 12 pages are needed to
> > contain this info. The prototype we implemented is using ourself-defined
> > protocol to read piece of _FIT and concatenate them before return to Guest,
> > please refer to:
> > https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
> > 
> > As 12 pages are not small region for BIOS and the _FIT size may be extended in the
> > future development (eg, if PBLK is introduced) i am not sure if we need this. Of
> > course, another approach to simplify it is that we limit the number of NVDIMM
> > device to make sure their _FIT < 4k.
> My suggestion is not to have only one label area for every NVDIMM but
> rather to map each label area right after each NVDIMM's data memory.
> That way _DMS can be made non-serialized and guest could handle
> label data in parallel.

I think that alignment considerations would mean we are burning up
1G of phys address space for this. For PAE we only have 64G
of this address space, so this would be a problem.
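
Rough numbers behind that concern (the 1G figure is the one quoted above;
treating it as a per-NVDIMM alignment cost is an assumption):

/*
 * If every mapped label area costs ~1 GiB of guest-physical address space
 * due to alignment, then N NVDIMMs burn N GiB of GPA.  A PAE guest sees
 * only 64 GiB of physical address space in total (64 GiB / 1 GiB = 64
 * such regions at most), so even a few tens of NVDIMMs would crowd out
 * RAM, PCI holes and the NVDIMM data regions themselves.
 */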


> As for a _FIT we can use the same approach as mem hotplug
> (IO port window) or Michael's idea to add vendor specific
> PCI_config region to a current PM device to avoid using
> IO ports.
> 
> 
> > 
> > 
> > 
> > 
Igor Mammedov Feb. 15, 2016, 1:32 p.m. UTC | #12
On Mon, 15 Feb 2016 13:45:59 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Feb 15, 2016 at 11:47:42AM +0100, Igor Mammedov wrote:
> > On Mon, 15 Feb 2016 18:13:38 +0800
> > Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >   
> > > On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:  
> > > > On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:    
> > > >> On Sun, 14 Feb 2016 13:57:27 +0800
> > > >> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> > > >>    
> > > >>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:    
> > > >>>> On Wed, 13 Jan 2016 02:50:05 +0800
> > > >>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> > > >>>>    
> > > >>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> > > >>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> > > >>>>> NVDIMM ACPI binary code
> > > >>>>>
> > > >>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
> > > >>>>> and notify QEMU to emulate the DSM method    
> > > >>>> Would you need to pass control to QEMU if each NVDIMM had its whole
> > > >>>> label area MemoryRegion mapped right after its storage MemoryRegion?
> > > >>>>    
> > > >>>
> > > >>> No, label data is not mapped into guest's address space and it only
> > > >>> can be accessed by DSM method indirectly.    
> > > >> Yep, per spec label data should be accessed via _DSM but question
> > > >> wasn't about it,    
> > > 
> > > Ah, sorry, i missed your question.
> > >   
> > > >> Why would one map only 4Kb window and serialize label data
> > > >> via it if it could be mapped as whole, that way _DMS method will be
> > > >> much less complicated and there won't be need to add/support a protocol
> > > >> for its serialization.
> > > >>    
> > > >
> > > > Is it ever accessed on data path? If not I prefer the current approach:    
> > > 
> > > The label data is only accessed via two DSM commands - Get Namespace Label
> > > Data and Set Namespace Label Data, no other place need to be emulated.
> > >   
> > > > limit the window used, the serialization protocol seems rather simple.
> > > >    
> > > 
> > > Yes.
> > > 
> > > Label data is at least 128k which is big enough for BIOS as it allocates
> > > memory at 0 ~ 4G which is tight region. It also needs guest OS to support
> > > lager max-xfer (the max size that can be transferred one time), the size
> > > in current Linux NVDIMM driver is 4k.
> > > 
> > > However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
> > > the case that too many nvdimm devices present in the system and their FIT
> > > info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
> > > and we can append 256 memory devices at most, so 12 pages are needed to
> > > contain this info. The prototype we implemented is using ourself-defined
> > > protocol to read piece of _FIT and concatenate them before return to Guest,
> > > please refer to:
> > > https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
> > > 
> > > As 12 pages are not small region for BIOS and the _FIT size may be extended in the
> > > future development (eg, if PBLK is introduced) i am not sure if we need this. Of
> > > course, another approach to simplify it is that we limit the number of NVDIMM
> > > device to make sure their _FIT < 4k.  
> > My suggestion is not to have only one label area for every NVDIMM but
> > rather to map each label area right after each NVDIMM's data memory.
> > That way _DMS can be made non-serialized and guest could handle
> > label data in parallel.  
> 
> I think that alignment considerations would mean we are burning up
> 1G of phys address space for this. For PAE we only have 64G
> of this address space, so this would be a problem.
It's true that it will burn away address space; however, that
just means that PAE guests would not be able to handle as many
NVDIMMs as 64-bit guests. The same applies to DIMMs as well, with
alignment enforced. If one needs more DIMMs, he/she can switch
to a 64-bit guest to use them.

It's a trade-off of inefficient GPA consumption vs efficient NVDIMM access.
Also, with a fully mapped label area for each NVDIMM, we don't have to
introduce and maintain any guest-visible serialization protocol
(a protocol for serializing _DSM via a 4K window), which becomes ABI.

If I were to choose, I'd pick more efficient NVDIMM access over
a lot of densely packed, inefficient NVDIMMs, and use a 64-bit
kernel to utilize it rather than a legacy PAE one.

> > As for a _FIT we can use the same approach as mem hotplug
> > (IO port window) or Michael's idea to add vendor specific
> > PCI_config region to a current PM device to avoid using
> > IO ports.
> > 
> >   
> > > 
> > > 
> > > 
> > > 
Xiao Guangrong Feb. 15, 2016, 3:53 p.m. UTC | #13
On 02/15/2016 09:32 PM, Igor Mammedov wrote:
> On Mon, 15 Feb 2016 13:45:59 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>
>> On Mon, Feb 15, 2016 at 11:47:42AM +0100, Igor Mammedov wrote:
>>> On Mon, 15 Feb 2016 18:13:38 +0800
>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>
>>>> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
>>>>> On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
>>>>>> On Sun, 14 Feb 2016 13:57:27 +0800
>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>
>>>>>>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:
>>>>>>>> On Wed, 13 Jan 2016 02:50:05 +0800
>>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>>>
>>>>>>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
>>>>>>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
>>>>>>>>> NVDIMM ACPI binary code
>>>>>>>>>
>>>>>>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
>>>>>>>>> and notify QEMU to emulate the DSM method
>>>>>>>> Would you need to pass control to QEMU if each NVDIMM had its whole
>>>>>>>> label area MemoryRegion mapped right after its storage MemoryRegion?
>>>>>>>>
>>>>>>>
>>>>>>> No, label data is not mapped into guest's address space and it only
>>>>>>> can be accessed by DSM method indirectly.
>>>>>> Yep, per spec label data should be accessed via _DSM but question
>>>>>> wasn't about it,
>>>>
>>>> Ah, sorry, i missed your question.
>>>>
>>>>>> Why would one map only 4Kb window and serialize label data
>>>>>> via it if it could be mapped as whole, that way _DMS method will be
>>>>>> much less complicated and there won't be need to add/support a protocol
>>>>>> for its serialization.
>>>>>>
>>>>>
>>>>> Is it ever accessed on data path? If not I prefer the current approach:
>>>>
>>>> The label data is only accessed via two DSM commands - Get Namespace Label
>>>> Data and Set Namespace Label Data, no other place need to be emulated.
>>>>
>>>>> limit the window used, the serialization protocol seems rather simple.
>>>>>
>>>>
>>>> Yes.
>>>>
>>>> Label data is at least 128k which is big enough for BIOS as it allocates
>>>> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
>>>> lager max-xfer (the max size that can be transferred one time), the size
>>>> in current Linux NVDIMM driver is 4k.
>>>>
>>>> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
>>>> the case that too many nvdimm devices present in the system and their FIT
>>>> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
>>>> and we can append 256 memory devices at most, so 12 pages are needed to
>>>> contain this info. The prototype we implemented is using ourself-defined
>>>> protocol to read piece of _FIT and concatenate them before return to Guest,
>>>> please refer to:
>>>> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
>>>>
>>>> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
>>>> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
>>>> course, another approach to simplify it is that we limit the number of NVDIMM
>>>> device to make sure their _FIT < 4k.
>>> My suggestion is not to have only one label area for every NVDIMM but
>>> rather to map each label area right after each NVDIMM's data memory.
>>> That way _DMS can be made non-serialized and guest could handle
>>> label data in parallel.
>>
>> I think that alignment considerations would mean we are burning up
>> 1G of phys address space for this. For PAE we only have 64G
>> of this address space, so this would be a problem.
> That's true that it will burning away address space, however that
> just means that PAE guests would not be able to handle as many
> NVDIMMs as 64bit guests. The same applies to DIMMs as well, with
> alignment enforced. If one needs more DIMMs he/she can switch
> to 64bit guest to use them.
>
> It's trade of inefficient GPA consumption vs efficient NVDIMMs access.
> Also with fully mapped label area for each NVDIMM we don't have to
> introduce and maintain any guest visible serialization protocol
> (protocol for serializing _DSM via 4K window) which becomes ABI.

It's true for label access, but not for the long term, as we will need to
support other _DSM commands such as vendor-specific commands and the PBLK
DSM command; NVDIMM MCE related commands will also be introduced in the
future, so we will come back here at that time. :(

Igor Mammedov Feb. 15, 2016, 5:24 p.m. UTC | #14
On Mon, 15 Feb 2016 23:53:13 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> On 02/15/2016 09:32 PM, Igor Mammedov wrote:
> > On Mon, 15 Feb 2016 13:45:59 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >  
> >> On Mon, Feb 15, 2016 at 11:47:42AM +0100, Igor Mammedov wrote:  
> >>> On Mon, 15 Feb 2016 18:13:38 +0800
> >>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>  
> >>>> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:  
> >>>>> On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:  
> >>>>>> On Sun, 14 Feb 2016 13:57:27 +0800
> >>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>>>  
> >>>>>>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:  
> >>>>>>>> On Wed, 13 Jan 2016 02:50:05 +0800
> >>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>>>>>  
> >>>>>>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> >>>>>>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> >>>>>>>>> NVDIMM ACPI binary code
> >>>>>>>>>
> >>>>>>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
> >>>>>>>>> and notify QEMU to emulate the DSM method  
> >>>>>>>> Would you need to pass control to QEMU if each NVDIMM had its whole
> >>>>>>>> label area MemoryRegion mapped right after its storage MemoryRegion?
> >>>>>>>>  
> >>>>>>>
> >>>>>>> No, label data is not mapped into guest's address space and it only
> >>>>>>> can be accessed by DSM method indirectly.  
> >>>>>> Yep, per spec label data should be accessed via _DSM but question
> >>>>>> wasn't about it,  
> >>>>
> >>>> Ah, sorry, i missed your question.
> >>>>  
> >>>>>> Why would one map only 4Kb window and serialize label data
> >>>>>> via it if it could be mapped as whole, that way _DMS method will be
> >>>>>> much less complicated and there won't be need to add/support a protocol
> >>>>>> for its serialization.
> >>>>>>  
> >>>>>
> >>>>> Is it ever accessed on data path? If not I prefer the current approach:  
> >>>>
> >>>> The label data is only accessed via two DSM commands - Get Namespace Label
> >>>> Data and Set Namespace Label Data, no other place need to be emulated.
> >>>>  
> >>>>> limit the window used, the serialization protocol seems rather simple.
> >>>>>  
> >>>>
> >>>> Yes.
> >>>>
> >>>> Label data is at least 128k which is big enough for BIOS as it allocates
> >>>> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
> >>>> lager max-xfer (the max size that can be transferred one time), the size
> >>>> in current Linux NVDIMM driver is 4k.
> >>>>
> >>>> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
> >>>> the case that too many nvdimm devices present in the system and their FIT
> >>>> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
> >>>> and we can append 256 memory devices at most, so 12 pages are needed to
> >>>> contain this info. The prototype we implemented is using ourself-defined
> >>>> protocol to read piece of _FIT and concatenate them before return to Guest,
> >>>> please refer to:
> >>>> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
> >>>>
> >>>> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
> >>>> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
> >>>> course, another approach to simplify it is that we limit the number of NVDIMM
> >>>> device to make sure their _FIT < 4k.  
> >>> My suggestion is not to have only one label area for every NVDIMM but
> >>> rather to map each label area right after each NVDIMM's data memory.
> >>> That way _DMS can be made non-serialized and guest could handle
> >>> label data in parallel.  
> >>
> >> I think that alignment considerations would mean we are burning up
> >> 1G of phys address space for this. For PAE we only have 64G
> >> of this address space, so this would be a problem.  
> > That's true that it will burning away address space, however that
> > just means that PAE guests would not be able to handle as many
> > NVDIMMs as 64bit guests. The same applies to DIMMs as well, with
> > alignment enforced. If one needs more DIMMs he/she can switch
> > to 64bit guest to use them.
> >
> > It's trade of inefficient GPA consumption vs efficient NVDIMMs access.
> > Also with fully mapped label area for each NVDIMM we don't have to
> > introduce and maintain any guest visible serialization protocol
> > (protocol for serializing _DSM via 4K window) which becomes ABI.  
> 
> It's true for label access but it is not for the long term as we will
> need to support other _DSM commands such as vendor specific command,
> PBLK dsm command, also NVDIMM MCE related commands will be introduced
> in the future, so we will come back here at that time. :(
I believe that for block mode, NVDIMM would also need a per-NVDIMM mapping
for performance reasons (parallel access).
As for the rest, could those commands go via the MMIO that we usually
use for the control path?

Xiao Guangrong Feb. 15, 2016, 6:35 p.m. UTC | #15
On 02/16/2016 01:24 AM, Igor Mammedov wrote:
> On Mon, 15 Feb 2016 23:53:13 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>
>> On 02/15/2016 09:32 PM, Igor Mammedov wrote:
>>> On Mon, 15 Feb 2016 13:45:59 +0200
>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>
>>>> On Mon, Feb 15, 2016 at 11:47:42AM +0100, Igor Mammedov wrote:
>>>>> On Mon, 15 Feb 2016 18:13:38 +0800
>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>
>>>>>> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
>>>>>>> On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
>>>>>>>> On Sun, 14 Feb 2016 13:57:27 +0800
>>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>>>
>>>>>>>>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:
>>>>>>>>>> On Wed, 13 Jan 2016 02:50:05 +0800
>>>>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>>>>>
>>>>>>>>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
>>>>>>>>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
>>>>>>>>>>> NVDIMM ACPI binary code
>>>>>>>>>>>
>>>>>>>>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
>>>>>>>>>>> and notify QEMU to emulate the DSM method
>>>>>>>>>> Would you need to pass control to QEMU if each NVDIMM had its whole
>>>>>>>>>> label area MemoryRegion mapped right after its storage MemoryRegion?
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> No, label data is not mapped into guest's address space and it only
>>>>>>>>> can be accessed by DSM method indirectly.
>>>>>>>> Yep, per spec label data should be accessed via _DSM but question
>>>>>>>> wasn't about it,
>>>>>>
>>>>>> Ah, sorry, i missed your question.
>>>>>>
>>>>>>>> Why would one map only 4Kb window and serialize label data
>>>>>>>> via it if it could be mapped as whole, that way _DMS method will be
>>>>>>>> much less complicated and there won't be need to add/support a protocol
>>>>>>>> for its serialization.
>>>>>>>>
>>>>>>>
>>>>>>> Is it ever accessed on data path? If not I prefer the current approach:
>>>>>>
>>>>>> The label data is only accessed via two DSM commands - Get Namespace Label
>>>>>> Data and Set Namespace Label Data, no other place need to be emulated.
>>>>>>
>>>>>>> limit the window used, the serialization protocol seems rather simple.
>>>>>>>
>>>>>>
>>>>>> Yes.
>>>>>>
>>>>>> Label data is at least 128k which is big enough for BIOS as it allocates
>>>>>> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
>>>>>> lager max-xfer (the max size that can be transferred one time), the size
>>>>>> in current Linux NVDIMM driver is 4k.
>>>>>>
>>>>>> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
>>>>>> the case that too many nvdimm devices present in the system and their FIT
>>>>>> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
>>>>>> and we can append 256 memory devices at most, so 12 pages are needed to
>>>>>> contain this info. The prototype we implemented is using ourself-defined
>>>>>> protocol to read piece of _FIT and concatenate them before return to Guest,
>>>>>> please refer to:
>>>>>> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
>>>>>>
>>>>>> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
>>>>>> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
>>>>>> course, another approach to simplify it is that we limit the number of NVDIMM
>>>>>> device to make sure their _FIT < 4k.
>>>>> My suggestion is not to have only one label area for every NVDIMM but
>>>>> rather to map each label area right after each NVDIMM's data memory.
>>>>> That way _DMS can be made non-serialized and guest could handle
>>>>> label data in parallel.
>>>>
>>>> I think that alignment considerations would mean we are burning up
>>>> 1G of phys address space for this. For PAE we only have 64G
>>>> of this address space, so this would be a problem.
>>> That's true that it will burning away address space, however that
>>> just means that PAE guests would not be able to handle as many
>>> NVDIMMs as 64bit guests. The same applies to DIMMs as well, with
>>> alignment enforced. If one needs more DIMMs he/she can switch
>>> to 64bit guest to use them.
>>>
>>> It's trade of inefficient GPA consumption vs efficient NVDIMMs access.
>>> Also with fully mapped label area for each NVDIMM we don't have to
>>> introduce and maintain any guest visible serialization protocol
>>> (protocol for serializing _DSM via 4K window) which becomes ABI.
>>
>> It's true for label access but it is not for the long term as we will
>> need to support other _DSM commands such as vendor specific command,
>> PBLK dsm command, also NVDIMM MCE related commands will be introduced
>> in the future, so we will come back here at that time. :(
> I believe for block mode NVDIMM would also need per NVDIMM mapping
> for performance reasons (parallel access).
> As for the rest could that commands go via MMIO that we usually
> use for control path?

So both input data and output data would go through a single MMIO region, and we would need to
introduce a protocol to pass these data - isn't that complex?

And is there any MMIO we can reuse (even more complex?) or should we allocate this
MMIO page? That brings back the old question - where to allocate it?
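
To make the concern concrete, here is a purely hypothetical sketch of the kind
of guest-visible layout a single shared window would force on us (none of these
names exist in the patch or in QEMU):

    #include <stdint.h>

    /* Hypothetical only: a single 4K DSM window shared for input and output
     * needs an agreed header like this, and that layout then becomes ABI. */
    struct dsm_window {
        uint32_t handle;        /* NFIT handle of the targeted NVDIMM */
        uint32_t function;      /* _DSM function number */
        uint32_t in_length;     /* bytes of input payload written by OSPM */
        uint32_t out_length;    /* bytes of output payload written by QEMU */
        uint8_t  payload[4096 - 16];  /* shared input/output area */
    };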
Igor Mammedov Feb. 16, 2016, 11 a.m. UTC | #16
On Tue, 16 Feb 2016 02:35:41 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> On 02/16/2016 01:24 AM, Igor Mammedov wrote:
> > On Mon, 15 Feb 2016 23:53:13 +0800
> > Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >  
> >> On 02/15/2016 09:32 PM, Igor Mammedov wrote:  
> >>> On Mon, 15 Feb 2016 13:45:59 +0200
> >>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >>>  
> >>>> On Mon, Feb 15, 2016 at 11:47:42AM +0100, Igor Mammedov wrote:  
> >>>>> On Mon, 15 Feb 2016 18:13:38 +0800
> >>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>>  
> >>>>>> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:  
> >>>>>>> On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:  
> >>>>>>>> On Sun, 14 Feb 2016 13:57:27 +0800
> >>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>>>>>  
> >>>>>>>>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:  
> >>>>>>>>>> On Wed, 13 Jan 2016 02:50:05 +0800
> >>>>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >>>>>>>>>>  
> >>>>>>>>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
> >>>>>>>>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
> >>>>>>>>>>> NVDIMM ACPI binary code
> >>>>>>>>>>>
> >>>>>>>>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
> >>>>>>>>>>> and notify QEMU to emulate the DSM method  
> >>>>>>>>>> Would you need to pass control to QEMU if each NVDIMM had its whole
> >>>>>>>>>> label area MemoryRegion mapped right after its storage MemoryRegion?
> >>>>>>>>>>  
> >>>>>>>>>
> >>>>>>>>> No, label data is not mapped into guest's address space and it only
> >>>>>>>>> can be accessed by DSM method indirectly.  
> >>>>>>>> Yep, per spec label data should be accessed via _DSM but question
> >>>>>>>> wasn't about it,  
> >>>>>>
> >>>>>> Ah, sorry, i missed your question.
> >>>>>>  
> >>>>>>>> Why would one map only 4Kb window and serialize label data
> >>>>>>>> via it if it could be mapped as whole, that way _DMS method will be
> >>>>>>>> much less complicated and there won't be need to add/support a protocol
> >>>>>>>> for its serialization.
> >>>>>>>>  
> >>>>>>>
> >>>>>>> Is it ever accessed on data path? If not I prefer the current approach:  
> >>>>>>
> >>>>>> The label data is only accessed via two DSM commands - Get Namespace Label
> >>>>>> Data and Set Namespace Label Data, no other place need to be emulated.
> >>>>>>  
> >>>>>>> limit the window used, the serialization protocol seems rather simple.
> >>>>>>>  
> >>>>>>
> >>>>>> Yes.
> >>>>>>
> >>>>>> Label data is at least 128k which is big enough for BIOS as it allocates
> >>>>>> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
> >>>>>> lager max-xfer (the max size that can be transferred one time), the size
> >>>>>> in current Linux NVDIMM driver is 4k.
> >>>>>>
> >>>>>> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
> >>>>>> the case that too many nvdimm devices present in the system and their FIT
> >>>>>> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
> >>>>>> and we can append 256 memory devices at most, so 12 pages are needed to
> >>>>>> contain this info. The prototype we implemented is using ourself-defined
> >>>>>> protocol to read piece of _FIT and concatenate them before return to Guest,
> >>>>>> please refer to:
> >>>>>> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
> >>>>>>
> >>>>>> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
> >>>>>> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
> >>>>>> course, another approach to simplify it is that we limit the number of NVDIMM
> >>>>>> device to make sure their _FIT < 4k.  
> >>>>> My suggestion is not to have only one label area for every NVDIMM but
> >>>>> rather to map each label area right after each NVDIMM's data memory.
> >>>>> That way _DMS can be made non-serialized and guest could handle
> >>>>> label data in parallel.  
> >>>>
> >>>> I think that alignment considerations would mean we are burning up
> >>>> 1G of phys address space for this. For PAE we only have 64G
> >>>> of this address space, so this would be a problem.  
> >>> That's true that it will burning away address space, however that
> >>> just means that PAE guests would not be able to handle as many
> >>> NVDIMMs as 64bit guests. The same applies to DIMMs as well, with
> >>> alignment enforced. If one needs more DIMMs he/she can switch
> >>> to 64bit guest to use them.
> >>>
> >>> It's trade of inefficient GPA consumption vs efficient NVDIMMs access.
> >>> Also with fully mapped label area for each NVDIMM we don't have to
> >>> introduce and maintain any guest visible serialization protocol
> >>> (protocol for serializing _DSM via 4K window) which becomes ABI.  
> >>
> >> It's true for label access but it is not for the long term as we will
> >> need to support other _DSM commands such as vendor specific command,
> >> PBLK dsm command, also NVDIMM MCE related commands will be introduced
> >> in the future, so we will come back here at that time. :(  
> > I believe for block mode NVDIMM would also need per NVDIMM mapping
> > for performance reasons (parallel access).
> > As for the rest could that commands go via MMIO that we usually
> > use for control path?  
> 
> So both input data and output data go through single MMIO, we need to
> introduce a protocol to pass these data, that is complex?
> 
> And is any MMIO we can reuse (more complexer?) or we should allocate this
> MMIO page ?the old question - where to allocated???
Maybe you could reuse/extend the memhotplug IO interface,
or alternatively, as Michael suggested, add a vendor specific PCI_Config.
I'd suggest the PM device for that (hw/acpi/[piix4.c|ich9.c]),
which I like even better since you won't need to care about which ports
to allocate at all.
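
Roughly, on the QEMU side that would only change where the region is plugged
in; a sketch reusing the memory-region API the patch already uses (the function
name, 'pm_io' and 'offset' below are illustrative, not existing symbols):

    /* Sketch only: hang the NVDIMM control window off the PM device's IO
     * space instead of reserving a fixed port range ourselves. */
    static void nvdimm_attach_to_pm(AcpiNVDIMMState *state, MemoryRegion *pm_io,
                                    hwaddr offset, Object *owner)
    {
        memory_region_init_io(&state->io_mr, owner, &nvdimm_dsm_ops, state,
                              "nvdimm-acpi-io", NVDIMM_ACPI_IO_LEN);
        memory_region_add_subregion(pm_io, offset, &state->io_mr);
    }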
Xiao Guangrong Feb. 17, 2016, 2:04 a.m. UTC | #17
On 02/16/2016 07:00 PM, Igor Mammedov wrote:
> On Tue, 16 Feb 2016 02:35:41 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>
>> On 02/16/2016 01:24 AM, Igor Mammedov wrote:
>>> On Mon, 15 Feb 2016 23:53:13 +0800
>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>
>>>> On 02/15/2016 09:32 PM, Igor Mammedov wrote:
>>>>> On Mon, 15 Feb 2016 13:45:59 +0200
>>>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>>>
>>>>>> On Mon, Feb 15, 2016 at 11:47:42AM +0100, Igor Mammedov wrote:
>>>>>>> On Mon, 15 Feb 2016 18:13:38 +0800
>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>>
>>>>>>>> On 02/15/2016 05:18 PM, Michael S. Tsirkin wrote:
>>>>>>>>> On Mon, Feb 15, 2016 at 10:11:05AM +0100, Igor Mammedov wrote:
>>>>>>>>>> On Sun, 14 Feb 2016 13:57:27 +0800
>>>>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>>>>>
>>>>>>>>>>> On 02/08/2016 07:03 PM, Igor Mammedov wrote:
>>>>>>>>>>>> On Wed, 13 Jan 2016 02:50:05 +0800
>>>>>>>>>>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>> 32 bits IO port starting from 0x0a18 in guest is reserved for NVDIMM
>>>>>>>>>>>>> ACPI emulation. The table, NVDIMM_DSM_MEM_FILE, will be patched into
>>>>>>>>>>>>> NVDIMM ACPI binary code
>>>>>>>>>>>>>
>>>>>>>>>>>>> OSPM uses this port to tell QEMU the final address of the DSM memory
>>>>>>>>>>>>> and notify QEMU to emulate the DSM method
>>>>>>>>>>>> Would you need to pass control to QEMU if each NVDIMM had its whole
>>>>>>>>>>>> label area MemoryRegion mapped right after its storage MemoryRegion?
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> No, label data is not mapped into guest's address space and it only
>>>>>>>>>>> can be accessed by DSM method indirectly.
>>>>>>>>>> Yep, per spec label data should be accessed via _DSM but question
>>>>>>>>>> wasn't about it,
>>>>>>>>
>>>>>>>> Ah, sorry, i missed your question.
>>>>>>>>
>>>>>>>>>> Why would one map only 4Kb window and serialize label data
>>>>>>>>>> via it if it could be mapped as whole, that way _DMS method will be
>>>>>>>>>> much less complicated and there won't be need to add/support a protocol
>>>>>>>>>> for its serialization.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Is it ever accessed on data path? If not I prefer the current approach:
>>>>>>>>
>>>>>>>> The label data is only accessed via two DSM commands - Get Namespace Label
>>>>>>>> Data and Set Namespace Label Data, no other place need to be emulated.
>>>>>>>>
>>>>>>>>> limit the window used, the serialization protocol seems rather simple.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Yes.
>>>>>>>>
>>>>>>>> Label data is at least 128k which is big enough for BIOS as it allocates
>>>>>>>> memory at 0 ~ 4G which is tight region. It also needs guest OS to support
>>>>>>>> lager max-xfer (the max size that can be transferred one time), the size
>>>>>>>> in current Linux NVDIMM driver is 4k.
>>>>>>>>
>>>>>>>> However, using lager DSM buffer can help us to simplify NVDIMM hotplug for
>>>>>>>> the case that too many nvdimm devices present in the system and their FIT
>>>>>>>> info can not be filled into one page. Each PMEM-only device needs 0xb8 bytes
>>>>>>>> and we can append 256 memory devices at most, so 12 pages are needed to
>>>>>>>> contain this info. The prototype we implemented is using ourself-defined
>>>>>>>> protocol to read piece of _FIT and concatenate them before return to Guest,
>>>>>>>> please refer to:
>>>>>>>> https://github.com/xiaogr/qemu/commit/c46ce01c8433ac0870670304360b3c4aa414143a
>>>>>>>>
>>>>>>>> As 12 pages are not small region for BIOS and the _FIT size may be extended in the
>>>>>>>> future development (eg, if PBLK is introduced) i am not sure if we need this. Of
>>>>>>>> course, another approach to simplify it is that we limit the number of NVDIMM
>>>>>>>> device to make sure their _FIT < 4k.
>>>>>>> My suggestion is not to have only one label area for every NVDIMM but
>>>>>>> rather to map each label area right after each NVDIMM's data memory.
>>>>>>> That way _DMS can be made non-serialized and guest could handle
>>>>>>> label data in parallel.
>>>>>>
>>>>>> I think that alignment considerations would mean we are burning up
>>>>>> 1G of phys address space for this. For PAE we only have 64G
>>>>>> of this address space, so this would be a problem.
>>>>> That's true that it will burning away address space, however that
>>>>> just means that PAE guests would not be able to handle as many
>>>>> NVDIMMs as 64bit guests. The same applies to DIMMs as well, with
>>>>> alignment enforced. If one needs more DIMMs he/she can switch
>>>>> to 64bit guest to use them.
>>>>>
>>>>> It's trade of inefficient GPA consumption vs efficient NVDIMMs access.
>>>>> Also with fully mapped label area for each NVDIMM we don't have to
>>>>> introduce and maintain any guest visible serialization protocol
>>>>> (protocol for serializing _DSM via 4K window) which becomes ABI.
>>>>
>>>> It's true for label access but it is not for the long term as we will
>>>> need to support other _DSM commands such as vendor specific command,
>>>> PBLK dsm command, also NVDIMM MCE related commands will be introduced
>>>> in the future, so we will come back here at that time. :(
>>> I believe for block mode NVDIMM would also need per NVDIMM mapping
>>> for performance reasons (parallel access).
>>> As for the rest could that commands go via MMIO that we usually
>>> use for control path?
>>
>> So both input data and output data go through single MMIO, we need to
>> introduce a protocol to pass these data, that is complex?
>>
>> And is any MMIO we can reuse (more complexer?) or we should allocate this
>> MMIO page ?the old question - where to allocated???
> Maybe you could reuse/extend memhotplug IO interface,
> or alternatively as Michael suggested add a vendor specific PCI_Config,
> I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
> which I like even better since you won't need to care about which ports
> to allocate at all.

Well, if Michael does not object, I will do it in the next version. :)
Michael S. Tsirkin Feb. 17, 2016, 5:26 p.m. UTC | #18
On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:
> >>>As for the rest could that commands go via MMIO that we usually
> >>>use for control path?
> >>
> >>So both input data and output data go through single MMIO, we need to
> >>introduce a protocol to pass these data, that is complex?
> >>
> >>And is any MMIO we can reuse (more complexer?) or we should allocate this
> >>MMIO page ?the old question - where to allocated???
> >Maybe you could reuse/extend memhotplug IO interface,
> >or alternatively as Michael suggested add a vendor specific PCI_Config,
> >I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
> >which I like even better since you won't need to care about which ports
> >to allocate at all.
> 
> Well, if Michael does not object, i will do it in the next version. :)

Sorry, the thread's so long by now that I'm no longer sure what "it" refers to.
Xiao Guangrong Feb. 18, 2016, 4:03 a.m. UTC | #19
On 02/18/2016 01:26 AM, Michael S. Tsirkin wrote:
> On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:
>>>>> As for the rest could that commands go via MMIO that we usually
>>>>> use for control path?
>>>>
>>>> So both input data and output data go through single MMIO, we need to
>>>> introduce a protocol to pass these data, that is complex?
>>>>
>>>> And is any MMIO we can reuse (more complexer?) or we should allocate this
>>>> MMIO page ?the old question - where to allocated???
>>> Maybe you could reuse/extend memhotplug IO interface,
>>> or alternatively as Michael suggested add a vendor specific PCI_Config,
>>> I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
>>> which I like even better since you won't need to care about which ports
>>> to allocate at all.
>>
>> Well, if Michael does not object, i will do it in the next version. :)
>
> Sorry, the thread's so long by now that I'm no longer sure what does "it" refer to.

Never mind, I saw you were busy on other threads.

"It" means Igor's suggestion to "map each label area right after each
NVDIMM's data memory", so we do not emulate it in QEMU and it is good for label
access performance - these are the points I like. However, it also brings
complexity/limitations for later DSM command support, since both DSM input &
output would need to go through a single MMIO region.

Your idea?
Igor Mammedov Feb. 18, 2016, 10:05 a.m. UTC | #20
On Thu, 18 Feb 2016 12:03:36 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> On 02/18/2016 01:26 AM, Michael S. Tsirkin wrote:
> > On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:  
> >>>>> As for the rest could that commands go via MMIO that we usually
> >>>>> use for control path?  
> >>>>
> >>>> So both input data and output data go through single MMIO, we need to
> >>>> introduce a protocol to pass these data, that is complex?
> >>>>
> >>>> And is any MMIO we can reuse (more complexer?) or we should allocate this
> >>>> MMIO page ?the old question - where to allocated???  
> >>> Maybe you could reuse/extend memhotplug IO interface,
> >>> or alternatively as Michael suggested add a vendor specific PCI_Config,
> >>> I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
> >>> which I like even better since you won't need to care about which ports
> >>> to allocate at all.  
> >>
> >> Well, if Michael does not object, i will do it in the next version. :)  
> >
> > Sorry, the thread's so long by now that I'm no longer sure what does "it" refer to.  
> 
> Never mind i saw you were busy on other loops.
> 
> "It" means the suggestion of Igor that "map each label area right after each
> NVDIMM's data memory"
Michael pointed out that putting the label right after each NVDIMM
might burn up to 256GB of address space due to DIMM alignment for 256 NVDIMMs.
However, if the address for each label is picked with pc_dimm_get_free_addr()
and the label's MemoryRegion alignment is the default 2MB, then all labels
would be allocated close to each other within a single 1GB range.

That would burn only 1GB for 500 labels, which is more than the possible 256 NVDIMMs.
Assuming labels are mapped before the storage MemoryRegion is mapped, the layout with a 1GB hugepage backend and the CLI
  -device nvdimm,size=1G -device nvdimm,size=1G -device nvdimm,size=1G
would look like:

0  2M  4M       1G    2G    3G    4G
L1 | L2 | L3 ... | NV1 | NV2 | NV3 |
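
For reference, the address-space arithmetic behind the two numbers above (a
standalone sketch, assuming the 1GB DIMM alignment and the default 2MB label
alignment discussed here):

    #include <stdio.h>

    /* Sketch: label after each 1GB-aligned NVDIMM vs. labels packed
     * back to back with the default 2MB MemoryRegion alignment. */
    int main(void)
    {
        const unsigned long long MiB = 1ULL << 20, GiB = 1ULL << 30;
        unsigned long long per_dimm = 256ULL * GiB;      /* 256 GB burned  */
        unsigned long long packed   = 500ULL * 2 * MiB;  /* ~1 GB for 500  */

        printf("per-DIMM 1GB alignment: %llu GB\n", per_dimm / GiB);
        printf("packed 2MB-aligned    : %llu MB\n", packed / MiB);
        return 0;
    }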

> so we do not emulate it in QEMU and is good for the performance
> of label these are the points i like. However it also brings complexity/limitation for
> later DSM commands supports since both dsm input & output need to go through single MMIO.
> 
> Your idea?
> 

Michael S. Tsirkin Feb. 18, 2016, 10:20 a.m. UTC | #21
On Thu, Feb 18, 2016 at 12:03:36PM +0800, Xiao Guangrong wrote:
> 
> 
> On 02/18/2016 01:26 AM, Michael S. Tsirkin wrote:
> >On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:
> >>>>>As for the rest could that commands go via MMIO that we usually
> >>>>>use for control path?
> >>>>
> >>>>So both input data and output data go through single MMIO, we need to
> >>>>introduce a protocol to pass these data, that is complex?
> >>>>
> >>>>And is any MMIO we can reuse (more complexer?) or we should allocate this
> >>>>MMIO page ?the old question - where to allocated???
> >>>Maybe you could reuse/extend memhotplug IO interface,
> >>>or alternatively as Michael suggested add a vendor specific PCI_Config,
> >>>I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
> >>>which I like even better since you won't need to care about which ports
> >>>to allocate at all.
> >>
> >>Well, if Michael does not object, i will do it in the next version. :)
> >
> >Sorry, the thread's so long by now that I'm no longer sure what does "it" refer to.
> 
> Never mind i saw you were busy on other loops.
> 
> "It" means the suggestion of Igor that "map each label area right after each
> NVDIMM's data memory" so we do not emulate it in QEMU and is good for the performance
> of label these are the points i like. However it also brings complexity/limitation for
> later DSM commands supports since both dsm input & output need to go through single MMIO.
> 
> Your idea?

I think mapping right after data is problematic since it might
use 1G of address space if alignment is used (and alignment is
good for performance, so we typically do want people to use it).
Given you will add more DSM commands anyway,
I don't see a reason not to pass label data this way too.

I don't care much how exactly the commands are passed.
Igor, do you have a preference, or do you not care as long as
it's not MMIO beyond the DIMM?
Michael S. Tsirkin Feb. 19, 2016, 8:08 a.m. UTC | #22
On Thu, Feb 18, 2016 at 11:05:23AM +0100, Igor Mammedov wrote:
> On Thu, 18 Feb 2016 12:03:36 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> 
> > On 02/18/2016 01:26 AM, Michael S. Tsirkin wrote:
> > > On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:  
> > >>>>> As for the rest could that commands go via MMIO that we usually
> > >>>>> use for control path?  
> > >>>>
> > >>>> So both input data and output data go through single MMIO, we need to
> > >>>> introduce a protocol to pass these data, that is complex?
> > >>>>
> > >>>> And is any MMIO we can reuse (more complexer?) or we should allocate this
> > >>>> MMIO page ?the old question - where to allocated???  
> > >>> Maybe you could reuse/extend memhotplug IO interface,
> > >>> or alternatively as Michael suggested add a vendor specific PCI_Config,
> > >>> I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
> > >>> which I like even better since you won't need to care about which ports
> > >>> to allocate at all.  
> > >>
> > >> Well, if Michael does not object, i will do it in the next version. :)  
> > >
> > > Sorry, the thread's so long by now that I'm no longer sure what does "it" refer to.  
> > 
> > Never mind i saw you were busy on other loops.
> > 
> > "It" means the suggestion of Igor that "map each label area right after each
> > NVDIMM's data memory"
> Michael pointed out that putting label right after each NVDIMM
> might burn up to 256GB of address space due to DIMM's alignment for 256 NVDIMMs.
> However if address for each label is picked with pc_dimm_get_free_addr()
> and label's MemoryRegion alignment is default 2MB then all labels
> would be allocated close to each other within a single 1GB range.
> 
> That would burn only 1GB for 500 labels which is more than possible 256 NVDIMMs.

I thought about it. Once we support hotplug, this means that one will
have to pre-declare how much is needed so QEMU can mark the correct
memory reserved, and that would be nasty. Maybe we always pre-reserve 1 GByte.
Okay, but next time we need something, do we steal another gigabyte?
It seems too much; I'll think it over on the weekend.

Really, most other devices manage to get by with 4K chunks just fine. I
don't see why we are so special and need to steal gigabytes of
physically contiguous address ranges.

> Assuming labels are mapped before storage MemoryRegion is mapped, layout with 1Gb hugepage backend CLI
>   -device nvdimm,size=1G -device nvdimm,size=1G -device nvdimm,size=1G
> would look like:
> 
> 0  2M  4M       1G    2G    3G    4G
> L1 | L2 | L3 ... | NV1 | NV2 | NV3 |
> 
> > so we do not emulate it in QEMU and is good for the performance
> > of label these are the points i like. However it also brings complexity/limitation for
> > later DSM commands supports since both dsm input & output need to go through single MMIO.
> > 
> > Your idea?
> > 
Dan Williams Feb. 19, 2016, 8:43 a.m. UTC | #23
On Fri, Feb 19, 2016 at 12:08 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Thu, Feb 18, 2016 at 11:05:23AM +0100, Igor Mammedov wrote:
>> On Thu, 18 Feb 2016 12:03:36 +0800
>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>
>> > On 02/18/2016 01:26 AM, Michael S. Tsirkin wrote:
>> > > On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:
>> > >>>>> As for the rest could that commands go via MMIO that we usually
>> > >>>>> use for control path?
>> > >>>>
>> > >>>> So both input data and output data go through single MMIO, we need to
>> > >>>> introduce a protocol to pass these data, that is complex?
>> > >>>>
>> > >>>> And is any MMIO we can reuse (more complexer?) or we should allocate this
>> > >>>> MMIO page ?the old question - where to allocated???
>> > >>> Maybe you could reuse/extend memhotplug IO interface,
>> > >>> or alternatively as Michael suggested add a vendor specific PCI_Config,
>> > >>> I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
>> > >>> which I like even better since you won't need to care about which ports
>> > >>> to allocate at all.
>> > >>
>> > >> Well, if Michael does not object, i will do it in the next version. :)
>> > >
>> > > Sorry, the thread's so long by now that I'm no longer sure what does "it" refer to.
>> >
>> > Never mind i saw you were busy on other loops.
>> >
>> > "It" means the suggestion of Igor that "map each label area right after each
>> > NVDIMM's data memory"
>> Michael pointed out that putting label right after each NVDIMM
>> might burn up to 256GB of address space due to DIMM's alignment for 256 NVDIMMs.
>> However if address for each label is picked with pc_dimm_get_free_addr()
>> and label's MemoryRegion alignment is default 2MB then all labels
>> would be allocated close to each other within a single 1GB range.
>>
>> That would burn only 1GB for 500 labels which is more than possible 256 NVDIMMs.
>
> I thought about it, once we support hotplug, this means that one will
> have to pre-declare how much is needed so QEMU can mark the correct
> memory reserved, that would be nasty. Maybe we always pre-reserve 1Gbyte.
> Okay but next time we need something, do we steal another Gigabyte?
> It seems too much, I'll think it over on the weekend.
>
> Really, most other devices manage to get by with 4K chunks just fine, I
> don't see why do we are so special and need to steal gigabytes of
> physically contigious phy ranges.

What's the driving use case for labels in the guest?  For example,
NVDIMM-N devices are supported by the kernel without labels.

I certainly would not want to sacrifice 1GB alignment for a label area.
Xiao Guangrong Feb. 22, 2016, 10:30 a.m. UTC | #24
On 02/19/2016 04:43 PM, Dan Williams wrote:
> On Fri, Feb 19, 2016 at 12:08 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Thu, Feb 18, 2016 at 11:05:23AM +0100, Igor Mammedov wrote:
>>> On Thu, 18 Feb 2016 12:03:36 +0800
>>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>>
>>>> On 02/18/2016 01:26 AM, Michael S. Tsirkin wrote:
>>>>> On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:
>>>>>>>>> As for the rest could that commands go via MMIO that we usually
>>>>>>>>> use for control path?
>>>>>>>>
>>>>>>>> So both input data and output data go through single MMIO, we need to
>>>>>>>> introduce a protocol to pass these data, that is complex?
>>>>>>>>
>>>>>>>> And is any MMIO we can reuse (more complexer?) or we should allocate this
>>>>>>>> MMIO page ?the old question - where to allocated???
>>>>>>> Maybe you could reuse/extend memhotplug IO interface,
>>>>>>> or alternatively as Michael suggested add a vendor specific PCI_Config,
>>>>>>> I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
>>>>>>> which I like even better since you won't need to care about which ports
>>>>>>> to allocate at all.
>>>>>>
>>>>>> Well, if Michael does not object, i will do it in the next version. :)
>>>>>
>>>>> Sorry, the thread's so long by now that I'm no longer sure what does "it" refer to.
>>>>
>>>> Never mind i saw you were busy on other loops.
>>>>
>>>> "It" means the suggestion of Igor that "map each label area right after each
>>>> NVDIMM's data memory"
>>> Michael pointed out that putting label right after each NVDIMM
>>> might burn up to 256GB of address space due to DIMM's alignment for 256 NVDIMMs.
>>> However if address for each label is picked with pc_dimm_get_free_addr()
>>> and label's MemoryRegion alignment is default 2MB then all labels
>>> would be allocated close to each other within a single 1GB range.
>>>
>>> That would burn only 1GB for 500 labels which is more than possible 256 NVDIMMs.
>>
>> I thought about it, once we support hotplug, this means that one will
>> have to pre-declare how much is needed so QEMU can mark the correct
>> memory reserved, that would be nasty. Maybe we always pre-reserve 1Gbyte.
>> Okay but next time we need something, do we steal another Gigabyte?
>> It seems too much, I'll think it over on the weekend.
>>
>> Really, most other devices manage to get by with 4K chunks just fine, I
>> don't see why do we are so special and need to steal gigabytes of
>> physically contigious phy ranges.
>
> What's the driving use case for labels in the guest?  For example,
> NVDIMM-N devices are supported by the kernel without labels.

Yes, I see the Linux driver supports label-less vNVDIMM, which is exactly what
the current QEMU is doing. However, label-less is a Linux specific implementation
(as it completely bypasses namespaces); other OS vendors (e.g. Microsoft) will
use label storage to address their own requirements, or they do not follow the
namespace spec at all. Another reason is that the label is essential for PBLK
support.

BTW, the label support can be configured dynamically and it will be disabled
by default.

>
> I certainly would not want to sacrifice 1GB alignment for a label area.
>

Yup, me too.
Xiao Guangrong Feb. 22, 2016, 10:34 a.m. UTC | #25
On 02/19/2016 04:08 PM, Michael S. Tsirkin wrote:
> On Thu, Feb 18, 2016 at 11:05:23AM +0100, Igor Mammedov wrote:
>> On Thu, 18 Feb 2016 12:03:36 +0800
>> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>>
>>> On 02/18/2016 01:26 AM, Michael S. Tsirkin wrote:
>>>> On Wed, Feb 17, 2016 at 10:04:18AM +0800, Xiao Guangrong wrote:
>>>>>>>> As for the rest could that commands go via MMIO that we usually
>>>>>>>> use for control path?
>>>>>>>
>>>>>>> So both input data and output data go through single MMIO, we need to
>>>>>>> introduce a protocol to pass these data, that is complex?
>>>>>>>
>>>>>>> And is any MMIO we can reuse (more complexer?) or we should allocate this
>>>>>>> MMIO page ?the old question - where to allocated???
>>>>>> Maybe you could reuse/extend memhotplug IO interface,
>>>>>> or alternatively as Michael suggested add a vendor specific PCI_Config,
>>>>>> I'd suggest PM device for that (hw/acpi/[piix4.c|ihc9.c])
>>>>>> which I like even better since you won't need to care about which ports
>>>>>> to allocate at all.
>>>>>
>>>>> Well, if Michael does not object, i will do it in the next version. :)
>>>>
>>>> Sorry, the thread's so long by now that I'm no longer sure what does "it" refer to.
>>>
>>> Never mind i saw you were busy on other loops.
>>>
>>> "It" means the suggestion of Igor that "map each label area right after each
>>> NVDIMM's data memory"
>> Michael pointed out that putting label right after each NVDIMM
>> might burn up to 256GB of address space due to DIMM's alignment for 256 NVDIMMs.
>> However if address for each label is picked with pc_dimm_get_free_addr()
>> and label's MemoryRegion alignment is default 2MB then all labels
>> would be allocated close to each other within a single 1GB range.
>>
>> That would burn only 1GB for 500 labels which is more than possible 256 NVDIMMs.
>
> I thought about it, once we support hotplug, this means that one will
> have to pre-declare how much is needed so QEMU can mark the correct
> memory reserved, that would be nasty. Maybe we always pre-reserve 1Gbyte.
> Okay but next time we need something, do we steal another Gigabyte?
> It seems too much, I'll think it over on the weekend.
>

It sounds like this approach (reserve-and-allocate) is very similar to the
old proposal that dynamically allocates memory from the end of hotplug-mem. :)
diff mbox

Patch

diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
index f3ade9a..faee86c 100644
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -2,7 +2,7 @@  common-obj-$(CONFIG_ACPI_X86) += core.o piix4.o pcihp.o
 common-obj-$(CONFIG_ACPI_X86_ICH) += ich9.o tco.o
 common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu_hotplug.o cpu_hotplug_acpi_table.o
 common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o memory_hotplug_acpi_table.o
-common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
+obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
 common-obj-$(CONFIG_ACPI) += acpi_interface.o
 common-obj-$(CONFIG_ACPI) += bios-linker-loader.o
 common-obj-$(CONFIG_ACPI) += aml-build.o
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index df1b176..256cedd 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -28,6 +28,7 @@ 
 
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/aml-build.h"
+#include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
 
 static int nvdimm_plugged_device_list(Object *obj, void *opaque)
@@ -369,6 +370,40 @@  static void nvdimm_build_nfit(GSList *device_list, GArray *table_offsets,
     g_array_free(structures, true);
 }
 
+static uint64_t
+nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
+{
+    return 0;
+}
+
+static void
+nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+}
+
+static const MemoryRegionOps nvdimm_dsm_ops = {
+    .read = nvdimm_dsm_read,
+    .write = nvdimm_dsm_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
+                            FWCfgState *fw_cfg, Object *owner)
+{
+    memory_region_init_io(&state->io_mr, owner, &nvdimm_dsm_ops, state,
+                          "nvdimm-acpi-io", NVDIMM_ACPI_IO_LEN);
+    memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, &state->io_mr);
+
+    state->dsm_mem = g_array_new(false, true /* clear */, 1);
+    acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
+    fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
+                    state->dsm_mem->len);
+}
+
 #define NVDIMM_COMMON_DSM      "NCAL"
 
 static void nvdimm_build_common_dsm(Aml *dev)
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 1ca044f..c68cfb8 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -39,7 +39,6 @@ 
 #include "hw/loader.h"
 #include "hw/isa/isa.h"
 #include "hw/acpi/memory_hotplug.h"
-#include "hw/mem/nvdimm.h"
 #include "sysemu/tpm.h"
 #include "hw/acpi/tpm.h"
 #include "sysemu/tpm_backend.h"
@@ -2602,13 +2601,6 @@  static bool acpi_has_iommu(void)
     return intel_iommu && !ambiguous;
 }
 
-static bool acpi_has_nvdimm(void)
-{
-    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
-
-    return pcms->nvdimm;
-}
-
 static
 void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
 {
@@ -2692,7 +2684,7 @@  void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
         build_dmar_q35(tables_blob, tables->linker);
     }
 
-    if (acpi_has_nvdimm()) {
+    if (guest_info->has_nvdimm) {
         nvdimm_build_acpi(table_offsets, tables_blob, tables->linker);
     }
 
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index c36b8cf..397de28 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1228,6 +1228,8 @@  PcGuestInfo *pc_guest_info_init(PCMachineState *pcms)
         }
     }
 
+    guest_info->has_nvdimm = pcms->acpi_nvdimm_state.is_enabled;
+
     guest_info_state->machine_done.notify = pc_guest_info_machine_done;
     qemu_add_machine_init_done_notifier(&guest_info_state->machine_done);
     return guest_info;
@@ -1877,14 +1879,14 @@  static bool pc_machine_get_nvdimm(Object *obj, Error **errp)
 {
     PCMachineState *pcms = PC_MACHINE(obj);
 
-    return pcms->nvdimm;
+    return pcms->acpi_nvdimm_state.is_enabled;
 }
 
 static void pc_machine_set_nvdimm(Object *obj, bool value, Error **errp)
 {
     PCMachineState *pcms = PC_MACHINE(obj);
 
-    pcms->nvdimm = value;
+    pcms->acpi_nvdimm_state.is_enabled = value;
 }
 
 static void pc_machine_initfn(Object *obj)
@@ -1923,7 +1925,7 @@  static void pc_machine_initfn(Object *obj)
                                     &error_abort);
 
     /* nvdimm is disabled on default. */
-    pcms->nvdimm = false;
+    pcms->acpi_nvdimm_state.is_enabled = false;
     object_property_add_bool(obj, PC_MACHINE_NVDIMM, pc_machine_get_nvdimm,
                              pc_machine_set_nvdimm, &error_abort);
 }
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 438cdae..2fee478 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -281,6 +281,11 @@  static void pc_init1(MachineState *machine,
     if (pcmc->pci_enabled) {
         pc_pci_device_init(pci_bus);
     }
+
+    if (guest_info->has_nvdimm) {
+        nvdimm_init_acpi_state(&pcms->acpi_nvdimm_state, system_io,
+                               guest_info->fw_cfg, OBJECT(pcms));
+    }
 }
 
 /* Looking for a pc_compat_2_4() function? It doesn't exist.
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 412b3cd..c9334d5 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -60,6 +60,7 @@  static void pc_q35_init(MachineState *machine)
     PCIDevice *lpc;
     BusState *idebus[MAX_SATA_PORTS];
     ISADevice *rtc_state;
+    MemoryRegion *system_io = get_system_io();
     MemoryRegion *pci_memory;
     MemoryRegion *rom_memory;
     MemoryRegion *ram_memory;
@@ -176,7 +177,7 @@  static void pc_q35_init(MachineState *machine)
     q35_host->mch.ram_memory = ram_memory;
     q35_host->mch.pci_address_space = pci_memory;
     q35_host->mch.system_memory = get_system_memory();
-    q35_host->mch.address_space_io = get_system_io();
+    q35_host->mch.address_space_io = system_io;
     q35_host->mch.below_4g_mem_size = pcms->below_4g_mem_size;
     q35_host->mch.above_4g_mem_size = pcms->above_4g_mem_size;
     /* pci */
@@ -267,6 +268,11 @@  static void pc_q35_init(MachineState *machine)
     if (pcmc->pci_enabled) {
         pc_pci_device_init(host_bus);
     }
+
+    if (guest_info->has_nvdimm) {
+        nvdimm_init_acpi_state(&pcms->acpi_nvdimm_state, system_io,
+                               guest_info->fw_cfg, OBJECT(pcms));
+    }
 }
 
 /* Looking for a pc_compat_2_4() function? It doesn't exist.
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 8122229..362ddc4 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -17,6 +17,7 @@ 
 #include "hw/boards.h"
 #include "hw/compat.h"
 #include "hw/mem/pc-dimm.h"
+#include "hw/mem/nvdimm.h"
 
 #define HPET_INTCAP "hpet-intcap"
 
@@ -55,7 +56,8 @@  struct PCMachineState {
     uint64_t max_ram_below_4g;
     OnOffAuto vmport;
     OnOffAuto smm;
-    bool nvdimm;
+
+    AcpiNVDIMMState acpi_nvdimm_state;
 
     /* RAM information (sizes, addresses, configuration): */
     ram_addr_t below_4g_mem_size, above_4g_mem_size;
@@ -161,6 +163,7 @@  struct PcGuestInfo {
     bool has_acpi_build;
     bool has_reserved_memory;
     bool rsdp_in_ram;
+    bool has_nvdimm;
 };
 
 /* parallel.c */
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index 49183c1..634c60b 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -25,8 +25,34 @@ 
 
 #include "hw/mem/pc-dimm.h"
 
-#define TYPE_NVDIMM      "nvdimm"
+#define TYPE_NVDIMM             "nvdimm"
 
+#define NVDIMM_DSM_MEM_FILE     "etc/acpi/nvdimm-mem"
+
+/*
+ * 32 bits IO port starting from 0x0a18 in guest is reserved for
+ * NVDIMM ACPI emulation.
+ */
+#define NVDIMM_ACPI_IO_BASE     0x0a18
+#define NVDIMM_ACPI_IO_LEN      4
+
+/*
+ * AcpiNVDIMMState:
+ * @is_enabled: detect if NVDIMM support is enabled.
+ *
+ * @dsm_mem: the data of the fw_cfg file NVDIMM_DSM_MEM_FILE.
+ * @io_mr: the IO region used by OSPM to transfer control to QEMU.
+ */
+struct AcpiNVDIMMState {
+    bool is_enabled;
+
+    GArray *dsm_mem;
+    MemoryRegion io_mr;
+};
+typedef struct AcpiNVDIMMState AcpiNVDIMMState;
+
+void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
+                            FWCfgState *fw_cfg, Object *owner);
 void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
                        GArray *linker);
 #endif