
[RFC,1/6] i386/pc: Account IOVA reserved ranges above 4G boundary

Message ID 20210622154905.30858-2-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series i386/pc: Fix creation of >= 1Tb guests on AMD systems with IOMMU

Commit Message

Joao Martins June 22, 2021, 3:49 p.m. UTC
It is assumed that the whole GPA space is available to be
DMA addressable, within a given address space limit. Since
Linux v5.4 that is no longer true, and VFIO validates whether
the selected IOVA is indeed valid, i.e. not reserved by the IOMMU
on behalf of some specific devices or platform-defined ranges.

AMD systems with an IOMMU are examples of such platforms and
particularly may export only these ranges as allowed:

	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
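
These are the ranges the host kernel reports back as usable IOVAs. As an
aside, a minimal sketch of how userspace can query them from a VFIO
container follows; ioctl and structure names are taken from linux/vfio.h
(v5.4+), error handling is omitted and local variable names are
illustrative only:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* container: an open /dev/vfio/vfio fd with an IOMMU already set. */
static void dump_usable_iovas(int container)
{
    struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
    struct vfio_iommu_type1_info *info;
    struct vfio_info_cap_header *hdr;

    /* First call only discovers the required buffer size (argsz). */
    ioctl(container, VFIO_IOMMU_GET_INFO, &probe);
    info = calloc(1, probe.argsz);
    info->argsz = probe.argsz;
    ioctl(container, VFIO_IOMMU_GET_INFO, info);

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        free(info);
        return;
    }

    /* Walk the capability chain looking for the IOVA range capability. */
    hdr = (struct vfio_info_cap_header *)((char *)info + info->cap_offset);
    for (;;) {
        if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
            struct vfio_iommu_type1_info_cap_iova_range *cap = (void *)hdr;
            unsigned int i;

            for (i = 0; i < cap->nr_iovas; i++) {
                printf("%016llx - %016llx\n",
                       (unsigned long long)cap->iova_ranges[i].start,
                       (unsigned long long)cap->iova_ranges[i].end);
            }
            break;
        }
        if (!hdr->next) {
            break;
        }
        hdr = (struct vfio_info_cap_header *)((char *)info + hdr->next);
    }
    free(info);
}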

We already account for the 4G hole, but if the guest is big
enough we will fail to allocate more than 1010G of RAM, given
the ~12G hole at the 1Tb boundary reserved for HyperTransport.

When creating the region above 4G, take into account which
IOVAs are allowed, by defining the known allowed ranges
and searching for the next free IOVA range. When an
invalid IOVA is found, mark it as reserved and proceed to the
next allowed IOVA region.

After accounting for the 1Tb hole on AMD hosts, mtree should
look like:

0000000100000000-000000fcffffffff (prio 0, i/o):
	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
0000010000000000-000001037fffffff (prio 0, i/o):
	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff
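
The example above corresponds to a guest with 1 TiB of RAM, 2 GiB of which
sits below 4G. Working the aliases out:

	pc.ram total                      1024 GiB (1 TiB)
	ram-below-4g    GPA [0,     2G)      2 GiB
	ram-above-4g    GPA [4G,  1012G)  1008 GiB
	reserved hole   GPA [1012G, 1T)     12 GiB (HyperTransport)
	ram-above-1t    GPA [1T,  1038G)    14 GiB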

Co-developed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 hw/i386/pc.c         | 103 +++++++++++++++++++++++++++++++++++++++----
 include/hw/i386/pc.h |  57 ++++++++++++++++++++++++
 target/i386/cpu.h    |   3 ++
 3 files changed, 154 insertions(+), 9 deletions(-)

Comments

Igor Mammedov June 23, 2021, 7:11 a.m. UTC | #1
On Tue, 22 Jun 2021 16:49:00 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> It is assumed that the whole GPA space is available to be
> DMA addressable, within a given address space limit. Since
> v5.4 based that is not true, and VFIO will validate whether
> the selected IOVA is indeed valid i.e. not reserved by IOMMU
> on behalf of some specific devices or platform-defined.
> 
> AMD systems with an IOMMU are examples of such platforms and
> particularly may export only these ranges as allowed:
> 
> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> 
> We already know of accounting for the 4G hole, albeit if the
> guest is big enough we will fail to allocate a >1010G given
> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> 
> When creating the region above 4G, take into account what
> IOVAs are allowed by defining the known allowed ranges
> and search for the next free IOVA ranges. When finding a
> invalid IOVA we mark them as reserved and proceed to the
> next allowed IOVA region.
> 
> After accounting for the 1Tb hole on AMD hosts, mtree should
> look like:
> 
> 0000000100000000-000000fcffffffff (prio 0, i/o):
> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> 0000010000000000-000001037fffffff (prio 0, i/o):
> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff

why not push the whole ram-above-4g region above the 1Tb mark
when RAM is sufficiently large (regardless of the host used),
instead of creating yet another hole and all the complexity it brings along?
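
(For the same 1 TiB guest from the commit message, that would mean
something roughly like:

	0000010000000000-000001ff7fffffff (prio 0, i/o):
		alias ram-above-4g @pc.ram 0000000080000000-000000ffffffffff

i.e. a single contiguous region starting at the 1Tb mark.)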


> Co-developed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  hw/i386/pc.c         | 103 +++++++++++++++++++++++++++++++++++++++----
>  include/hw/i386/pc.h |  57 ++++++++++++++++++++++++
>  target/i386/cpu.h    |   3 ++
>  3 files changed, 154 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index c6d8d0d84d91..52a5473ba846 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -91,6 +91,7 @@
>  #include "qapi/qmp/qerror.h"
>  #include "e820_memory_layout.h"
>  #include "fw_cfg.h"
> +#include "target/i386/cpu.h"
>  #include "trace.h"
>  #include CONFIG_DEVICES
>  
> @@ -860,6 +861,93 @@ void xen_load_linux(PCMachineState *pcms)
>      x86ms->fw_cfg = fw_cfg;
>  }
>  
> +struct GPARange usable_iova_ranges[] = {
> +    { .start = 4 * GiB, .end = UINT64_MAX, .name = "ram-above-4g" },
> +
> +/*
> + * AMD systems with an IOMMU have an additional hole close to the
> + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
> + * on kernel version, VFIO may or may not let you DMA map those ranges.
> + * Starting v5.4 we validate it, and can't create guests on AMD machines
> + * with certain memory sizes. The range is:
> + *
> + * FD_0000_0000h - FF_FFFF_FFFFh
> + *
> + * The ranges represent the following:
> + *
> + * Base Address   Top Address  Use
> + *
> + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space
> + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl
> + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK
> + * FD_F910_0000h FD_F91F_FFFFh System Management
> + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables
> + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation
> + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space
> + * FD_FE00_0000h FD_FFFF_FFFFh Configuration
> + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages
> + * FE_2000_0000h FF_FFFF_FFFFh Reserved
> + *
> + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology",
> + * Table 3: Special Address Controls (GPA) for more information.
> + */
> +#define DEFAULT_NR_USABLE_IOVAS 1
> +#define AMD_MAX_PHYSADDR_BELOW_1TB  0xfcffffffff
> +    { .start = 1 * TiB, .end = UINT64_MAX, .name = "ram-above-1t" },
> +};
> +
> + uint32_t nb_iova_ranges = DEFAULT_NR_USABLE_IOVAS;
> +
> +static void init_usable_iova_ranges(void)
> +{
> +    uint32_t eax, vendor[3];
> +
> +    host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]);
> +    if (IS_AMD_VENDOR(vendor)) {
> +        usable_iova_ranges[0].end = AMD_MAX_PHYSADDR_BELOW_1TB;
> +        nb_iova_ranges++;
> +    }
> +}
> +
> +static void add_memory_region(MemoryRegion *system_memory, MemoryRegion *ram,
> +                                hwaddr base, hwaddr size, hwaddr offset)
> +{
> +    hwaddr start, region_size, resv_start, resv_end;
> +    struct GPARange *range;
> +    MemoryRegion *region;
> +    uint32_t index;
> +
> +    for_each_usable_range(index, base, size, range, start, region_size) {
> +        region = g_malloc(sizeof(*region));
> +        memory_region_init_alias(region, NULL, range->name, ram,
> +                                 offset, region_size);
> +        memory_region_add_subregion(system_memory, start, region);
> +        e820_add_entry(start, region_size, E820_RAM);
> +
> +        assert(size >= region_size);
> +        if (size == region_size) {
> +            return;
> +        }
> +
> +        /*
> +         * There's memory left to create a region for, so there should be
> +         * another valid IOVA range left.  Creating the reserved region
> +         * would also be pointless.
> +         */
> +        if (index + 1 == nb_iova_ranges) {
> +            return;
> +        }
> +
> +        resv_start = start + region_size;
> +        resv_end = usable_iova_ranges[index + 1].start;
> +
> +        /* Create a reserved region in the IOVA hole. */
> +        e820_add_entry(resv_start, resv_end - resv_start, E820_RESERVED);
> +
> +        offset += region_size;
> +    }
> +}
> +
>  void pc_memory_init(PCMachineState *pcms,
>                      MemoryRegion *system_memory,
>                      MemoryRegion *rom_memory,
> @@ -867,7 +955,7 @@ void pc_memory_init(PCMachineState *pcms,
>  {
>      int linux_boot, i;
>      MemoryRegion *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g;
>      FWCfgState *fw_cfg;
>      MachineState *machine = MACHINE(pcms);
>      MachineClass *mc = MACHINE_GET_CLASS(machine);
> @@ -877,6 +965,8 @@ void pc_memory_init(PCMachineState *pcms,
>      assert(machine->ram_size == x86ms->below_4g_mem_size +
>                                  x86ms->above_4g_mem_size);
>  
> +    init_usable_iova_ranges();
> +
>      linux_boot = (machine->kernel_filename != NULL);
>  
>      /*
> @@ -888,16 +978,11 @@ void pc_memory_init(PCMachineState *pcms,
>      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
>                               0, x86ms->below_4g_mem_size);
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> +
>      e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
>      if (x86ms->above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> -                                 machine->ram,
> -                                 x86ms->below_4g_mem_size,
> -                                 x86ms->above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
> -        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
> +        add_memory_region(system_memory, machine->ram, 4 * GiB,
> +                          x86ms->above_4g_mem_size, x86ms->below_4g_mem_size);
>      }
>  
>      if (!pcmc->has_reserved_memory &&
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 1522a3359a93..73b8e2900c72 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -151,6 +151,63 @@ void pc_guest_info_init(PCMachineState *pcms);
>  #define PCI_HOST_BELOW_4G_MEM_SIZE     "below-4g-mem-size"
>  #define PCI_HOST_ABOVE_4G_MEM_SIZE     "above-4g-mem-size"
>  
> +struct GPARange {
> +    uint64_t start;
> +    uint64_t end;
> +#define range_len(r) ((r).end - (r).start + 1)
> +
> +    const char *name;
> +};
> +
> +extern uint32_t nb_iova_ranges;
> +extern struct GPARange usable_iova_ranges[];
> +
> +static inline void next_iova_range_index(uint32_t i, hwaddr base, hwaddr size,
> +                                         hwaddr *start, hwaddr *region_size,
> +                                         uint32_t *index, struct GPARange **r)
> +{
> +    struct GPARange *range;
> +
> +    while (i < nb_iova_ranges) {
> +        range = &usable_iova_ranges[i];
> +        if (range->end >= base) {
> +            break;
> +        }
> +        i++;
> +    }
> +
> +    *index = i;
> +    /* index is out of bounds or no region left to process */
> +    if (i >= nb_iova_ranges || !size) {
> +        return;
> +    }
> +
> +    *start = MAX(range->start, base);
> +    *region_size = MIN(range->end - *start + 1, size);
> +    *r = range;
> +}
> +
> +/*
> + * for_each_usable_range() - iterates over usable IOVA regions
> + *
> + * @i:      range index within usable_iova_ranges
> + * @base:   requested address we want to use
> + * @size:   total size of the region with @base
> + * @r:      range indexed by @i within usable_iova_ranges
> + * @s:      calculated usable iova address base
> + * @i_size: calculated usable iova region size starting @s
> + *
> + * Use this iterator to walk through usable GPA ranges. Platforms
> + * such as AMD with IOMMU capable hardware restrict certain address
> + * ranges for Hyper Transport. This iterator helper lets user avoid
> + * using those said reserved ranges.
> + */
> +#define for_each_usable_range(i, base, size, r, s, i_size) \
> +    for (s = 0, i_size = 0, r = NULL, \
> +         next_iova_range_index(0, base, size, &s, &i_size, &i, &r); \
> +         i < nb_iova_ranges && size != 0; \
> +         size -= i_size, \
> +         next_iova_range_index(i + 1, base, size, &s, &i_size, &i, &r))
>  
>  void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
>                              MemoryRegion *pci_address_space);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 1e11071d817b..f8f15a4523c6 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -869,6 +869,9 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
>  #define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
>                           (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
>                           (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> +#define IS_AMD_VENDOR(vendor) ((vendor[0]) == CPUID_VENDOR_AMD_1 && \
> +                         (vendor[1]) == CPUID_VENDOR_AMD_2 && \
> +                         (vendor[2]) == CPUID_VENDOR_AMD_3)
>  
>  #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
>  #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */
Igor Mammedov June 23, 2021, 9:03 a.m. UTC | #2
On Tue, 22 Jun 2021 16:49:00 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> It is assumed that the whole GPA space is available to be
> DMA addressable, within a given address space limit. Since
> v5.4 based that is not true, and VFIO will validate whether
> the selected IOVA is indeed valid i.e. not reserved by IOMMU
> on behalf of some specific devices or platform-defined.
> 
> AMD systems with an IOMMU are examples of such platforms and
> particularly may export only these ranges as allowed:
> 
> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> 
> We already know of accounting for the 4G hole, albeit if the
> guest is big enough we will fail to allocate a >1010G given
> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> 
> When creating the region above 4G, take into account what
> IOVAs are allowed by defining the known allowed ranges
> and search for the next free IOVA ranges. When finding a
> invalid IOVA we mark them as reserved and proceed to the
> next allowed IOVA region.
> 
> After accounting for the 1Tb hole on AMD hosts, mtree should
> look like:
> 
> 0000000100000000-000000fcffffffff (prio 0, i/o):
> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> 0000010000000000-000001037fffffff (prio 0, i/o):
> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff

You are talking here about GPA, which is a guest specific thing,
and then somehow it becomes tied to the host. For bystanders it's
not clear from the above commit message how the two are related.
I'd add here an explicit explanation of how the AMD host relates to GPAs
and clarify where you are talking about the guest side vs. the host side.

also what about usecases:
 * start QEMU with Intel cpu model on AMD host with intel's iommu
 * start QEMU with AMD cpu model and AMD's iommu on Intel host
 * start QEMU in TCG mode on AMD host (mostly from a qtest point of view)

> Co-developed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  hw/i386/pc.c         | 103 +++++++++++++++++++++++++++++++++++++++----
>  include/hw/i386/pc.h |  57 ++++++++++++++++++++++++
>  target/i386/cpu.h    |   3 ++
>  3 files changed, 154 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index c6d8d0d84d91..52a5473ba846 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -91,6 +91,7 @@
>  #include "qapi/qmp/qerror.h"
>  #include "e820_memory_layout.h"
>  #include "fw_cfg.h"
> +#include "target/i386/cpu.h"
>  #include "trace.h"
>  #include CONFIG_DEVICES
>  
> @@ -860,6 +861,93 @@ void xen_load_linux(PCMachineState *pcms)
>      x86ms->fw_cfg = fw_cfg;
>  }
>  
> +struct GPARange usable_iova_ranges[] = {
> +    { .start = 4 * GiB, .end = UINT64_MAX, .name = "ram-above-4g" },
> +
> +/*
> + * AMD systems with an IOMMU have an additional hole close to the
> + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
> + * on kernel version, VFIO may or may not let you DMA map those ranges.
> + * Starting v5.4 we validate it, and can't create guests on AMD machines
> + * with certain memory sizes. The range is:
> + *
> + * FD_0000_0000h - FF_FFFF_FFFFh
> + *
> + * The ranges represent the following:
> + *
> + * Base Address   Top Address  Use
> + *
> + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space
> + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl
> + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK
> + * FD_F910_0000h FD_F91F_FFFFh System Management
> + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables
> + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation
> + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space
> + * FD_FE00_0000h FD_FFFF_FFFFh Configuration
> + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages
> + * FE_2000_0000h FF_FFFF_FFFFh Reserved
> + *
> + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology",
> + * Table 3: Special Address Controls (GPA) for more information.
> + */
> +#define DEFAULT_NR_USABLE_IOVAS 1
> +#define AMD_MAX_PHYSADDR_BELOW_1TB  0xfcffffffff
> +    { .start = 1 * TiB, .end = UINT64_MAX, .name = "ram-above-1t" },
> +};
> +
> + uint32_t nb_iova_ranges = DEFAULT_NR_USABLE_IOVAS;
> +
> +static void init_usable_iova_ranges(void)
> +{
> +    uint32_t eax, vendor[3];
> +
> +    host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]);
> +    if (IS_AMD_VENDOR(vendor)) {
> +        usable_iova_ranges[0].end = AMD_MAX_PHYSADDR_BELOW_1TB;
> +        nb_iova_ranges++;
> +    }
> +}
> +
> +static void add_memory_region(MemoryRegion *system_memory, MemoryRegion *ram,
> +                                hwaddr base, hwaddr size, hwaddr offset)
> +{
> +    hwaddr start, region_size, resv_start, resv_end;
> +    struct GPARange *range;
> +    MemoryRegion *region;
> +    uint32_t index;
> +
> +    for_each_usable_range(index, base, size, range, start, region_size) {
> +        region = g_malloc(sizeof(*region));
> +        memory_region_init_alias(region, NULL, range->name, ram,
> +                                 offset, region_size);
> +        memory_region_add_subregion(system_memory, start, region);
> +        e820_add_entry(start, region_size, E820_RAM);
> +
> +        assert(size >= region_size);
> +        if (size == region_size) {
> +            return;
> +        }
> +
> +        /*
> +         * There's memory left to create a region for, so there should be
> +         * another valid IOVA range left.  Creating the reserved region
> +         * would also be pointless.
> +         */
> +        if (index + 1 == nb_iova_ranges) {
> +            return;
> +        }
> +
> +        resv_start = start + region_size;
> +        resv_end = usable_iova_ranges[index + 1].start;
> +
> +        /* Create a reserved region in the IOVA hole. */
> +        e820_add_entry(resv_start, resv_end - resv_start, E820_RESERVED);
> +
> +        offset += region_size;
> +    }
> +}
> +
>  void pc_memory_init(PCMachineState *pcms,
>                      MemoryRegion *system_memory,
>                      MemoryRegion *rom_memory,
> @@ -867,7 +955,7 @@ void pc_memory_init(PCMachineState *pcms,
>  {
>      int linux_boot, i;
>      MemoryRegion *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g;
>      FWCfgState *fw_cfg;
>      MachineState *machine = MACHINE(pcms);
>      MachineClass *mc = MACHINE_GET_CLASS(machine);
> @@ -877,6 +965,8 @@ void pc_memory_init(PCMachineState *pcms,
>      assert(machine->ram_size == x86ms->below_4g_mem_size +
>                                  x86ms->above_4g_mem_size);
>  
> +    init_usable_iova_ranges();
> +
>      linux_boot = (machine->kernel_filename != NULL);
>  
>      /*
> @@ -888,16 +978,11 @@ void pc_memory_init(PCMachineState *pcms,
>      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
>                               0, x86ms->below_4g_mem_size);
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> +
>      e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
>      if (x86ms->above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> -                                 machine->ram,
> -                                 x86ms->below_4g_mem_size,
> -                                 x86ms->above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
> -        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
> +        add_memory_region(system_memory, machine->ram, 4 * GiB,
> +                          x86ms->above_4g_mem_size, x86ms->below_4g_mem_size);
>      }
>  
>      if (!pcmc->has_reserved_memory &&
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 1522a3359a93..73b8e2900c72 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -151,6 +151,63 @@ void pc_guest_info_init(PCMachineState *pcms);
>  #define PCI_HOST_BELOW_4G_MEM_SIZE     "below-4g-mem-size"
>  #define PCI_HOST_ABOVE_4G_MEM_SIZE     "above-4g-mem-size"
>  
> +struct GPARange {
> +    uint64_t start;
> +    uint64_t end;
> +#define range_len(r) ((r).end - (r).start + 1)
> +
> +    const char *name;
> +};
> +
> +extern uint32_t nb_iova_ranges;
> +extern struct GPARange usable_iova_ranges[];
> +
> +static inline void next_iova_range_index(uint32_t i, hwaddr base, hwaddr size,
> +                                         hwaddr *start, hwaddr *region_size,
> +                                         uint32_t *index, struct GPARange **r)
> +{
> +    struct GPARange *range;
> +
> +    while (i < nb_iova_ranges) {
> +        range = &usable_iova_ranges[i];
> +        if (range->end >= base) {
> +            break;
> +        }
> +        i++;
> +    }
> +
> +    *index = i;
> +    /* index is out of bounds or no region left to process */
> +    if (i >= nb_iova_ranges || !size) {
> +        return;
> +    }
> +
> +    *start = MAX(range->start, base);
> +    *region_size = MIN(range->end - *start + 1, size);
> +    *r = range;
> +}
> +
> +/*
> + * for_each_usable_range() - iterates over usable IOVA regions
> + *
> + * @i:      range index within usable_iova_ranges
> + * @base:   requested address we want to use
> + * @size:   total size of the region with @base
> + * @r:      range indexed by @i within usable_iova_ranges
> + * @s:      calculated usable iova address base
> + * @i_size: calculated usable iova region size starting @s
> + *
> + * Use this iterator to walk through usable GPA ranges. Platforms
> + * such as AMD with IOMMU capable hardware restrict certain address
> + * ranges for Hyper Transport. This iterator helper lets user avoid
> + * using those said reserved ranges.
> + */
> +#define for_each_usable_range(i, base, size, r, s, i_size) \
> +    for (s = 0, i_size = 0, r = NULL, \
> +         next_iova_range_index(0, base, size, &s, &i_size, &i, &r); \
> +         i < nb_iova_ranges && size != 0; \
> +         size -= i_size, \
> +         next_iova_range_index(i + 1, base, size, &s, &i_size, &i, &r))
>  
>  void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
>                              MemoryRegion *pci_address_space);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 1e11071d817b..f8f15a4523c6 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -869,6 +869,9 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
>  #define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
>                           (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
>                           (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> +#define IS_AMD_VENDOR(vendor) ((vendor[0]) == CPUID_VENDOR_AMD_1 && \
> +                         (vendor[1]) == CPUID_VENDOR_AMD_2 && \
> +                         (vendor[2]) == CPUID_VENDOR_AMD_3)
>  
>  #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
>  #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */
Joao Martins June 23, 2021, 9:37 a.m. UTC | #3
On 6/23/21 8:11 AM, Igor Mammedov wrote:
> On Tue, 22 Jun 2021 16:49:00 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> It is assumed that the whole GPA space is available to be
>> DMA addressable, within a given address space limit. Since
>> v5.4 based that is not true, and VFIO will validate whether
>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
>> on behalf of some specific devices or platform-defined.
>>
>> AMD systems with an IOMMU are examples of such platforms and
>> particularly may export only these ranges as allowed:
>>
>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
>>
>> We already know of accounting for the 4G hole, albeit if the
>> guest is big enough we will fail to allocate a >1010G given
>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
>>
>> When creating the region above 4G, take into account what
>> IOVAs are allowed by defining the known allowed ranges
>> and search for the next free IOVA ranges. When finding a
>> invalid IOVA we mark them as reserved and proceed to the
>> next allowed IOVA region.
>>
>> After accounting for the 1Tb hole on AMD hosts, mtree should
>> look like:
>>
>> 0000000100000000-000000fcffffffff (prio 0, i/o):
>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
>> 0000010000000000-000001037fffffff (prio 0, i/o):
>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff
> 
> why not push whole ram-above-4g above 1Tb mark
> when RAM is sufficiently large (regardless of used host),
> instead of creating yet another hole and all complexity it brings along?
> 

There's the problem with CMOS, which describes memory above 4G; that is part of
the reason I cap it to 1TB minus the reserved range, i.e. for AMD, CMOS would
only describe up to 1T.
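
(For reference, a rough sketch of how the above-4G size reaches firmware via
CMOS; the register numbers and 64 KiB granularity are assumed from QEMU's
pc_cmos_init(), and the variable names are illustrative:

    /*
     * Three bytes in 64 KiB units can describe at most 2^24 * 64 KiB = 1 TiB
     * of memory above 4G, so anything past the 1Tb boundary cannot be
     * represented here and has to come from E820 instead.
     */
    uint64_t val = above_4g_mem_size >> 16;   /* size in 64 KiB units */
    rtc_set_memory(rtc, 0x5b, val);           /* bits  7:0  */
    rtc_set_memory(rtc, 0x5c, val >> 8);      /* bits 15:8  */
    rtc_set_memory(rtc, 0x5d, val >> 16);     /* bits 23:16 */
)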

But if we decide not to care about CMOS, then it's an option, I suppose.

We would waste 1Tb of address space because of 12G, and btw the logic here
is not so different from the 4G hole; in fact this could probably share
code with it.
Joao Martins June 23, 2021, 9:51 a.m. UTC | #4
On 6/23/21 10:03 AM, Igor Mammedov wrote:
> On Tue, 22 Jun 2021 16:49:00 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> It is assumed that the whole GPA space is available to be
>> DMA addressable, within a given address space limit. Since
>> v5.4 based that is not true, and VFIO will validate whether
>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
>> on behalf of some specific devices or platform-defined.
>>
>> AMD systems with an IOMMU are examples of such platforms and
>> particularly may export only these ranges as allowed:
>>
>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
>>
>> We already know of accounting for the 4G hole, albeit if the
>> guest is big enough we will fail to allocate a >1010G given
>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
>>
>> When creating the region above 4G, take into account what
>> IOVAs are allowed by defining the known allowed ranges
>> and search for the next free IOVA ranges. When finding a
>> invalid IOVA we mark them as reserved and proceed to the
>> next allowed IOVA region.
>>
>> After accounting for the 1Tb hole on AMD hosts, mtree should
>> look like:
>>
>> 0000000100000000-000000fcffffffff (prio 0, i/o):
>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
>> 0000010000000000-000001037fffffff (prio 0, i/o):
>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff
> 
> You are talking here about GPA which is guest specific thing
> and then somehow it becomes tied to host. For bystanders it's
> not clear from above commit message how both are related.
> I'd add here an explicit explanation how AMD host is related GPAs
> and clarify where you are talking about guest/host side.
> 
OK, makes sense.

Perhaps using IOVA makes it easier to understand. I said GPA because
there's a 1:1 mapping between GPA and IOVA (if you're not using vIOMMU).

> also what about usecases:
>  * start QEMU with Intel cpu model on AMD host with intel's iommu

In principle it would be less likely to occur. But you would still need
to mark the same range as reserved. The limitation is on DMA occurring
on IOVAs (host or guest) that coincide with that range, so you would
want to inform the guest that at least those should be avoided.

>  * start QEMU with AMD cpu model and AMD's iommu on Intel host

Here you would probably only mark the range, solely for honoring how hardware
is usually represented. But really, on Intel, nothing stops you from exposing the
aforementioned range as RAM.

>  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
> 
This one is tricky. Because you can hotplug a VFIO device later on,
I opted for always marking the reserved range. If you don't use VFIO you're good, but
otherwise you would still need the reserved range. But I am not sure how qtest is used
today for testing huge guests.
Igor Mammedov June 23, 2021, 11:39 a.m. UTC | #5
On Wed, 23 Jun 2021 10:37:38 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 6/23/21 8:11 AM, Igor Mammedov wrote:
> > On Tue, 22 Jun 2021 16:49:00 +0100
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> It is assumed that the whole GPA space is available to be
> >> DMA addressable, within a given address space limit. Since
> >> v5.4 based that is not true, and VFIO will validate whether
> >> the selected IOVA is indeed valid i.e. not reserved by IOMMU
> >> on behalf of some specific devices or platform-defined.
> >>
> >> AMD systems with an IOMMU are examples of such platforms and
> >> particularly may export only these ranges as allowed:
> >>
> >> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> >> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> >> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> >>
> >> We already know of accounting for the 4G hole, albeit if the
> >> guest is big enough we will fail to allocate a >1010G given
> >> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> >>
> >> When creating the region above 4G, take into account what
> >> IOVAs are allowed by defining the known allowed ranges
> >> and search for the next free IOVA ranges. When finding a
> >> invalid IOVA we mark them as reserved and proceed to the
> >> next allowed IOVA region.
> >>
> >> After accounting for the 1Tb hole on AMD hosts, mtree should
> >> look like:
> >>
> >> 0000000100000000-000000fcffffffff (prio 0, i/o):
> >> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> >> 0000010000000000-000001037fffffff (prio 0, i/o):
> >> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff  
> > 
> > why not push whole ram-above-4g above 1Tb mark
> > when RAM is sufficiently large (regardless of used host),
> > instead of creating yet another hole and all complexity it brings along?
> >   
> 
> There's the problem with CMOS which describes memory above 4G, part of the
> reason I cap it to the 1TB minus the reserved range i.e. for AMD, CMOS would
> only describe up to 1T.
> 
> But should we not care about that then it's an option, I suppose.
we probably do not care about CMOS with such large RAM,
as long as QEMU generates a correct E820 (CMOS mattered only with old SeaBIOS,
which used it for generating the memory map)
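
For the 1 TiB example in the commit message, the E820 entries added by
pc_memory_init() with this patch would be roughly:

	0000000000000000 - 000000007fffffff  RAM       (ram-below-4g)
	0000000100000000 - 000000fcffffffff  RAM       (ram-above-4g)
	000000fd00000000 - 000000ffffffffff  RESERVED  (HyperTransport hole)
	0000010000000000 - 000001037fffffff  RAM       (ram-above-1t)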

> We would waste 1Tb of address space because of 12G, and btw the logic here
> is not so different than the 4G hole, in fact could probably share this
> with it.
the main reason I'm looking for an alternative is the complexity
that making another hole brings in. At this point, we can't do anything
about the 4G hole as it's already there, but we can try to avoid that
for high RAM and keep the rules there as simple as they are now.

Also, partitioning/splitting main RAM is one of the things that
gets in the way of converting it to the PC-DIMM model.

Losing 1Tb of address space might be acceptable on a host
that can handle such amounts of RAM
Igor Mammedov June 23, 2021, 12:09 p.m. UTC | #6
On Wed, 23 Jun 2021 10:51:59 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 6/23/21 10:03 AM, Igor Mammedov wrote:
> > On Tue, 22 Jun 2021 16:49:00 +0100
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> It is assumed that the whole GPA space is available to be
> >> DMA addressable, within a given address space limit. Since
> >> v5.4 based that is not true, and VFIO will validate whether
> >> the selected IOVA is indeed valid i.e. not reserved by IOMMU
> >> on behalf of some specific devices or platform-defined.
> >>
> >> AMD systems with an IOMMU are examples of such platforms and
> >> particularly may export only these ranges as allowed:
> >>
> >> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> >> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> >> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> >>
> >> We already know of accounting for the 4G hole, albeit if the
> >> guest is big enough we will fail to allocate a >1010G given
> >> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> >>
> >> When creating the region above 4G, take into account what
> >> IOVAs are allowed by defining the known allowed ranges
> >> and search for the next free IOVA ranges. When finding a
> >> invalid IOVA we mark them as reserved and proceed to the
> >> next allowed IOVA region.
> >>
> >> After accounting for the 1Tb hole on AMD hosts, mtree should
> >> look like:
> >>
> >> 0000000100000000-000000fcffffffff (prio 0, i/o):
> >> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> >> 0000010000000000-000001037fffffff (prio 0, i/o):
> >> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff  
> > 
> > You are talking here about GPA which is guest specific thing
> > and then somehow it becomes tied to host. For bystanders it's
> > not clear from above commit message how both are related.
> > I'd add here an explicit explanation how AMD host is related GPAs
> > and clarify where you are talking about guest/host side.
> >   
> OK, makes sense.
> 
> Perhaps using IOVA makes it easier to understand. I said GPA because
> there's an 1:1 mapping between GPA and IOVA (if you're not using vIOMMU).

IOVA may be too broad a term; maybe explain it in terms of GPA and HPA
and why it matters on each side (host/guest)


> > also what about usecases:
> >  * start QEMU with Intel cpu model on AMD host with intel's iommu  
> 
> In principle it would be less likely to occur. But you would still need
> to mark the same range as reserved. The limitation is on DMA occuring
> on those IOVAs (host or guest) coinciding with that range, so you would
> want to inform the guest that at least those should be avoided.
> 
> >  * start QEMU with AMD cpu model and AMD's iommu on Intel host  
> 
> Here you would probably only mark the range, solely for honoring how hardware
> is usually represented. But really, on Intel, nothing stops you from exposing the
> aforementioned range as RAM.
> 
> >  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
> >   
> This one is tricky. Because you can hotplug a VFIO device later on,
> I opted for always marking the reserved range. If you don't use VFIO you're good, but
> otherwise you would still need reserved. But I am not sure how qtest is used
> today for testing huge guests.
I do not know if there are VFIO tests in qtest (probably nope, since that
could require a host configured for that), but we can add a test
for this memory quirk (assuming phys-bits won't get in the way)
Joao Martins June 23, 2021, 1:04 p.m. UTC | #7
On 6/23/21 12:39 PM, Igor Mammedov wrote:
> On Wed, 23 Jun 2021 10:37:38 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 6/23/21 8:11 AM, Igor Mammedov wrote:
>>> On Tue, 22 Jun 2021 16:49:00 +0100
>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>   
>>>> It is assumed that the whole GPA space is available to be
>>>> DMA addressable, within a given address space limit. Since
>>>> v5.4 based that is not true, and VFIO will validate whether
>>>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
>>>> on behalf of some specific devices or platform-defined.
>>>>
>>>> AMD systems with an IOMMU are examples of such platforms and
>>>> particularly may export only these ranges as allowed:
>>>>
>>>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
>>>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
>>>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
>>>>
>>>> We already know of accounting for the 4G hole, albeit if the
>>>> guest is big enough we will fail to allocate a >1010G given
>>>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
>>>>
>>>> When creating the region above 4G, take into account what
>>>> IOVAs are allowed by defining the known allowed ranges
>>>> and search for the next free IOVA ranges. When finding a
>>>> invalid IOVA we mark them as reserved and proceed to the
>>>> next allowed IOVA region.
>>>>
>>>> After accounting for the 1Tb hole on AMD hosts, mtree should
>>>> look like:
>>>>
>>>> 0000000100000000-000000fcffffffff (prio 0, i/o):
>>>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
>>>> 0000010000000000-000001037fffffff (prio 0, i/o):
>>>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff  
>>>
>>> why not push whole ram-above-4g above 1Tb mark
>>> when RAM is sufficiently large (regardless of used host),
>>> instead of creating yet another hole and all complexity it brings along?
>>>   
>>
>> There's the problem with CMOS which describes memory above 4G, part of the
>> reason I cap it to the 1TB minus the reserved range i.e. for AMD, CMOS would
>> only describe up to 1T.
>>
>> But should we not care about that then it's an option, I suppose.
> we probably do not care about CMOS with so large RAM,
> as long as QEMU generates correct E820 (cmos mattered only with old Seabios
> which used it for generating memory map)
> 
OK, good to know.

Any extension on CMOS would probably also be out of spec.

>> We would waste 1Tb of address space because of 12G, and btw the logic here
>> is not so different than the 4G hole, in fact could probably share this
>> with it.
> the main reason I'm looking for alternative, is complexity
> of making hole brings in. At this point, we can't do anything
> with 4G hole as it's already there, but we can try to avoid that
> for high RAM and keep rules there simple as it's now.
> 
Right. But for what it's worth, that complexity is split across two parts:

1) dealing with a sparse RAM model (with more than one hole)

2) offsetting everything else that assumes a linear RAM map.

I don't think that even if we shift the start of RAM to after the 1TB boundary
we would get away without solving item 2 -- which personally is where I find this
a tad bit more hairy. So it would probably make this patch's complexity smaller, but
not change much how spread out the changes get.

> Also partitioning/splitting main RAM is one of the things that
> gets in the way converting it to PC-DIMMMs model.
> 
Can you expand on that? (a link to a series is enough)

> Loosing 1Tb of address space might be acceptable on a host
> that can handle such amounts of RAM
>
Joao Martins June 23, 2021, 1:07 p.m. UTC | #8
On 6/23/21 1:09 PM, Igor Mammedov wrote:
> On Wed, 23 Jun 2021 10:51:59 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 6/23/21 10:03 AM, Igor Mammedov wrote:
>>> On Tue, 22 Jun 2021 16:49:00 +0100
>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>   
>>>> It is assumed that the whole GPA space is available to be
>>>> DMA addressable, within a given address space limit. Since
>>>> v5.4 based that is not true, and VFIO will validate whether
>>>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
>>>> on behalf of some specific devices or platform-defined.
>>>>
>>>> AMD systems with an IOMMU are examples of such platforms and
>>>> particularly may export only these ranges as allowed:
>>>>
>>>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
>>>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
>>>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
>>>>
>>>> We already know of accounting for the 4G hole, albeit if the
>>>> guest is big enough we will fail to allocate a >1010G given
>>>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
>>>>
>>>> When creating the region above 4G, take into account what
>>>> IOVAs are allowed by defining the known allowed ranges
>>>> and search for the next free IOVA ranges. When finding a
>>>> invalid IOVA we mark them as reserved and proceed to the
>>>> next allowed IOVA region.
>>>>
>>>> After accounting for the 1Tb hole on AMD hosts, mtree should
>>>> look like:
>>>>
>>>> 0000000100000000-000000fcffffffff (prio 0, i/o):
>>>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
>>>> 0000010000000000-000001037fffffff (prio 0, i/o):
>>>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff  
>>>
>>> You are talking here about GPA which is guest specific thing
>>> and then somehow it becomes tied to host. For bystanders it's
>>> not clear from above commit message how both are related.
>>> I'd add here an explicit explanation how AMD host is related GPAs
>>> and clarify where you are talking about guest/host side.
>>>   
>> OK, makes sense.
>>
>> Perhaps using IOVA makes it easier to understand. I said GPA because
>> there's an 1:1 mapping between GPA and IOVA (if you're not using vIOMMU).
> 
> IOVA may be a too broad term, maybe explain it in terms of GPA and HPA
> and why it does matter on each side (host/guest)
> 

I used the term IOVA specifically because it is applicable to Host IOVA or
Guest IOVA (same rules apply as this is not special cased for VMs). So,
regardless of whether we have guest mode page tables, or just host
iommu page tables, this address range should be reserved and not used.

> 
>>> also what about usecases:
>>>  * start QEMU with Intel cpu model on AMD host with intel's iommu  
>>
>> In principle it would be less likely to occur. But you would still need
>> to mark the same range as reserved. The limitation is on DMA occuring
>> on those IOVAs (host or guest) coinciding with that range, so you would
>> want to inform the guest that at least those should be avoided.
>>
>>>  * start QEMU with AMD cpu model and AMD's iommu on Intel host  
>>
>> Here you would probably only mark the range, solely for honoring how hardware
>> is usually represented. But really, on Intel, nothing stops you from exposing the
>> aforementioned range as RAM.
>>
>>>  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
>>>   
>> This one is tricky. Because you can hotplug a VFIO device later on,
>> I opted for always marking the reserved range. If you don't use VFIO you're good, but
>> otherwise you would still need reserved. But I am not sure how qtest is used
>> today for testing huge guests.
> I do not know if there are VFIO tests in qtest (probably nope, since that
> could require a host configured for that), but we can add a test
> for his memory quirk (assuming phys-bits won't get in the way)
> 

	Joao
Dr. David Alan Gilbert June 24, 2021, 9:32 a.m. UTC | #9
* Igor Mammedov (imammedo@redhat.com) wrote:
> On Tue, 22 Jun 2021 16:49:00 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> > It is assumed that the whole GPA space is available to be
> > DMA addressable, within a given address space limit. Since
> > v5.4 based that is not true, and VFIO will validate whether
> > the selected IOVA is indeed valid i.e. not reserved by IOMMU
> > on behalf of some specific devices or platform-defined.
> > 
> > AMD systems with an IOMMU are examples of such platforms and
> > particularly may export only these ranges as allowed:
> > 
> > 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> > 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> > 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> > 
> > We already know of accounting for the 4G hole, albeit if the
> > guest is big enough we will fail to allocate a >1010G given
> > the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> > 
> > When creating the region above 4G, take into account what
> > IOVAs are allowed by defining the known allowed ranges
> > and search for the next free IOVA ranges. When finding a
> > invalid IOVA we mark them as reserved and proceed to the
> > next allowed IOVA region.
> > 
> > After accounting for the 1Tb hole on AMD hosts, mtree should
> > look like:
> > 
> > 0000000100000000-000000fcffffffff (prio 0, i/o):
> > 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> > 0000010000000000-000001037fffffff (prio 0, i/o):
> > 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff
> 
> You are talking here about GPA which is guest specific thing
> and then somehow it becomes tied to host. For bystanders it's
> not clear from above commit message how both are related.
> I'd add here an explicit explanation how AMD host is related GPAs
> and clarify where you are talking about guest/host side.
> 
> also what about usecases:
>  * start QEMU with Intel cpu model on AMD host with intel's iommu

Does that work?

>  * start QEMU with AMD cpu model and AMD's iommu on Intel host

Isn't this the case we commonly do:
   * start QEMU with AMD cpu model on AMD host with Intel iommu?

Dave

>  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
> 
> > Co-developed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> > Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> > Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> > ---
> >  hw/i386/pc.c         | 103 +++++++++++++++++++++++++++++++++++++++----
> >  include/hw/i386/pc.h |  57 ++++++++++++++++++++++++
> >  target/i386/cpu.h    |   3 ++
> >  3 files changed, 154 insertions(+), 9 deletions(-)
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index c6d8d0d84d91..52a5473ba846 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -91,6 +91,7 @@
> >  #include "qapi/qmp/qerror.h"
> >  #include "e820_memory_layout.h"
> >  #include "fw_cfg.h"
> > +#include "target/i386/cpu.h"
> >  #include "trace.h"
> >  #include CONFIG_DEVICES
> >  
> > @@ -860,6 +861,93 @@ void xen_load_linux(PCMachineState *pcms)
> >      x86ms->fw_cfg = fw_cfg;
> >  }
> >  
> > +struct GPARange usable_iova_ranges[] = {
> > +    { .start = 4 * GiB, .end = UINT64_MAX, .name = "ram-above-4g" },
> > +
> > +/*
> > + * AMD systems with an IOMMU have an additional hole close to the
> > + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
> > + * on kernel version, VFIO may or may not let you DMA map those ranges.
> > + * Starting v5.4 we validate it, and can't create guests on AMD machines
> > + * with certain memory sizes. The range is:
> > + *
> > + * FD_0000_0000h - FF_FFFF_FFFFh
> > + *
> > + * The ranges represent the following:
> > + *
> > + * Base Address   Top Address  Use
> > + *
> > + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space
> > + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl
> > + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK
> > + * FD_F910_0000h FD_F91F_FFFFh System Management
> > + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables
> > + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation
> > + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space
> > + * FD_FE00_0000h FD_FFFF_FFFFh Configuration
> > + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages
> > + * FE_2000_0000h FF_FFFF_FFFFh Reserved
> > + *
> > + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology",
> > + * Table 3: Special Address Controls (GPA) for more information.
> > + */
> > +#define DEFAULT_NR_USABLE_IOVAS 1
> > +#define AMD_MAX_PHYSADDR_BELOW_1TB  0xfcffffffff
> > +    { .start = 1 * TiB, .end = UINT64_MAX, .name = "ram-above-1t" },
> > +};
> > +
> > + uint32_t nb_iova_ranges = DEFAULT_NR_USABLE_IOVAS;
> > +
> > +static void init_usable_iova_ranges(void)
> > +{
> > +    uint32_t eax, vendor[3];
> > +
> > +    host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]);
> > +    if (IS_AMD_VENDOR(vendor)) {
> > +        usable_iova_ranges[0].end = AMD_MAX_PHYSADDR_BELOW_1TB;
> > +        nb_iova_ranges++;
> > +    }
> > +}
> > +
> > +static void add_memory_region(MemoryRegion *system_memory, MemoryRegion *ram,
> > +                                hwaddr base, hwaddr size, hwaddr offset)
> > +{
> > +    hwaddr start, region_size, resv_start, resv_end;
> > +    struct GPARange *range;
> > +    MemoryRegion *region;
> > +    uint32_t index;
> > +
> > +    for_each_usable_range(index, base, size, range, start, region_size) {
> > +        region = g_malloc(sizeof(*region));
> > +        memory_region_init_alias(region, NULL, range->name, ram,
> > +                                 offset, region_size);
> > +        memory_region_add_subregion(system_memory, start, region);
> > +        e820_add_entry(start, region_size, E820_RAM);
> > +
> > +        assert(size >= region_size);
> > +        if (size == region_size) {
> > +            return;
> > +        }
> > +
> > +        /*
> > +         * There's memory left to create a region for, so there should be
> > +         * another valid IOVA range left.  Creating the reserved region
> > +         * would also be pointless.
> > +         */
> > +        if (index + 1 == nb_iova_ranges) {
> > +            return;
> > +        }
> > +
> > +        resv_start = start + region_size;
> > +        resv_end = usable_iova_ranges[index + 1].start;
> > +
> > +        /* Create a reserved region in the IOVA hole. */
> > +        e820_add_entry(resv_start, resv_end - resv_start, E820_RESERVED);
> > +
> > +        offset += region_size;
> > +    }
> > +}
> > +
> >  void pc_memory_init(PCMachineState *pcms,
> >                      MemoryRegion *system_memory,
> >                      MemoryRegion *rom_memory,
> > @@ -867,7 +955,7 @@ void pc_memory_init(PCMachineState *pcms,
> >  {
> >      int linux_boot, i;
> >      MemoryRegion *option_rom_mr;
> > -    MemoryRegion *ram_below_4g, *ram_above_4g;
> > +    MemoryRegion *ram_below_4g;
> >      FWCfgState *fw_cfg;
> >      MachineState *machine = MACHINE(pcms);
> >      MachineClass *mc = MACHINE_GET_CLASS(machine);
> > @@ -877,6 +965,8 @@ void pc_memory_init(PCMachineState *pcms,
> >      assert(machine->ram_size == x86ms->below_4g_mem_size +
> >                                  x86ms->above_4g_mem_size);
> >  
> > +    init_usable_iova_ranges();
> > +
> >      linux_boot = (machine->kernel_filename != NULL);
> >  
> >      /*
> > @@ -888,16 +978,11 @@ void pc_memory_init(PCMachineState *pcms,
> >      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
> >                               0, x86ms->below_4g_mem_size);
> >      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> > +
> >      e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
> >      if (x86ms->above_4g_mem_size > 0) {
> > -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> > -                                 machine->ram,
> > -                                 x86ms->below_4g_mem_size,
> > -                                 x86ms->above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > -                                    ram_above_4g);
> > -        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
> > +        add_memory_region(system_memory, machine->ram, 4 * GiB,
> > +                          x86ms->above_4g_mem_size, x86ms->below_4g_mem_size);
> >      }
> >  
> >      if (!pcmc->has_reserved_memory &&
> > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > index 1522a3359a93..73b8e2900c72 100644
> > --- a/include/hw/i386/pc.h
> > +++ b/include/hw/i386/pc.h
> > @@ -151,6 +151,63 @@ void pc_guest_info_init(PCMachineState *pcms);
> >  #define PCI_HOST_BELOW_4G_MEM_SIZE     "below-4g-mem-size"
> >  #define PCI_HOST_ABOVE_4G_MEM_SIZE     "above-4g-mem-size"
> >  
> > +struct GPARange {
> > +    uint64_t start;
> > +    uint64_t end;
> > +#define range_len(r) ((r).end - (r).start + 1)
> > +
> > +    const char *name;
> > +};
> > +
> > +extern uint32_t nb_iova_ranges;
> > +extern struct GPARange usable_iova_ranges[];
> > +
> > +static inline void next_iova_range_index(uint32_t i, hwaddr base, hwaddr size,
> > +                                         hwaddr *start, hwaddr *region_size,
> > +                                         uint32_t *index, struct GPARange **r)
> > +{
> > +    struct GPARange *range;
> > +
> > +    while (i < nb_iova_ranges) {
> > +        range = &usable_iova_ranges[i];
> > +        if (range->end >= base) {
> > +            break;
> > +        }
> > +        i++;
> > +    }
> > +
> > +    *index = i;
> > +    /* index is out of bounds or no region left to process */
> > +    if (i >= nb_iova_ranges || !size) {
> > +        return;
> > +    }
> > +
> > +    *start = MAX(range->start, base);
> > +    *region_size = MIN(range->end - *start + 1, size);
> > +    *r = range;
> > +}
> > +
> > +/*
> > + * for_each_usable_range() - iterates over usable IOVA regions
> > + *
> > + * @i:      range index within usable_iova_ranges
> > + * @base:   requested address we want to use
> > + * @size:   total size of the region with @base
> > + * @r:      range indexed by @i within usable_iova_ranges
> > + * @s:      calculated usable iova address base
> > + * @i_size: calculated usable iova region size starting @s
> > + *
> > + * Use this iterator to walk through usable GPA ranges. Platforms
> > + * such as AMD with IOMMU capable hardware restrict certain address
> > + * ranges for Hyper Transport. This iterator helper lets user avoid
> > + * using those said reserved ranges.
> > + */
> > +#define for_each_usable_range(i, base, size, r, s, i_size) \
> > +    for (s = 0, i_size = 0, r = NULL, \
> > +         next_iova_range_index(0, base, size, &s, &i_size, &i, &r); \
> > +         i < nb_iova_ranges && size != 0; \
> > +         size -= i_size, \
> > +         next_iova_range_index(i + 1, base, size, &s, &i_size, &i, &r))
> >  
> >  void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
> >                              MemoryRegion *pci_address_space);
> > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > index 1e11071d817b..f8f15a4523c6 100644
> > --- a/target/i386/cpu.h
> > +++ b/target/i386/cpu.h
> > @@ -869,6 +869,9 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
> >  #define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> >                           (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> >                           (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> > +#define IS_AMD_VENDOR(vendor) ((vendor[0]) == CPUID_VENDOR_AMD_1 && \
> > +                         (vendor[1]) == CPUID_VENDOR_AMD_2 && \
> > +                         (vendor[2]) == CPUID_VENDOR_AMD_3)
> >  
> >  #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
> >  #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */
> 
>
Igor Mammedov June 28, 2021, 1:25 p.m. UTC | #10
On Wed, 23 Jun 2021 14:07:29 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 6/23/21 1:09 PM, Igor Mammedov wrote:
> > On Wed, 23 Jun 2021 10:51:59 +0100
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> On 6/23/21 10:03 AM, Igor Mammedov wrote:  
> >>> On Tue, 22 Jun 2021 16:49:00 +0100
> >>> Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>     
> >>>> It is assumed that the whole GPA space is available to be
> >>>> DMA addressable, within a given address space limit. Since
> >>>> v5.4 based that is not true, and VFIO will validate whether
> >>>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
> >>>> on behalf of some specific devices or platform-defined.
> >>>>
> >>>> AMD systems with an IOMMU are examples of such platforms and
> >>>> particularly may export only these ranges as allowed:
> >>>>
> >>>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> >>>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> >>>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> >>>>
> >>>> We already know of accounting for the 4G hole, albeit if the
> >>>> guest is big enough we will fail to allocate a >1010G given
> >>>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> >>>>
> >>>> When creating the region above 4G, take into account what
> >>>> IOVAs are allowed by defining the known allowed ranges
> >>>> and search for the next free IOVA ranges. When finding a
> >>>> invalid IOVA we mark them as reserved and proceed to the
> >>>> next allowed IOVA region.
> >>>>
> >>>> After accounting for the 1Tb hole on AMD hosts, mtree should
> >>>> look like:
> >>>>
> >>>> 0000000100000000-000000fcffffffff (prio 0, i/o):
> >>>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> >>>> 0000010000000000-000001037fffffff (prio 0, i/o):
> >>>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff    
> >>>
> >>> You are talking here about GPA which is guest specific thing
> >>> and then somehow it becomes tied to host. For bystanders it's
> >>> not clear from above commit message how both are related.
> >>> I'd add here an explicit explanation how AMD host is related GPAs
> >>> and clarify where you are talking about guest/host side.
> >>>     
> >> OK, makes sense.
> >>
> >> Perhaps using IOVA makes it easier to understand. I said GPA because
> >> there's an 1:1 mapping between GPA and IOVA (if you're not using vIOMMU).  
> > 
> > IOVA may be a too broad term, maybe explain it in terms of GPA and HPA
> > and why it does matter on each side (host/guest)
> >   
> 
> I used the term IOVA specially because that is applicable to Host IOVA or
> Guest IOVA (same rules apply as this is not special cased for VMs). So,
> regardless of whether we have guest mode page tables, or just host
> iommu page tables, this address range should be reserved and not used.

IOVA doesn't make it any clearer; on the contrary, it's more confusing.

And does the host's HPA matter at all? (If the host's firmware isn't broken,
it should never use nor advertise the 1Tb hole.) So we are probably talking
here about GPA only.
   
> >>> also what about usecases:
> >>>  * start QEMU with Intel cpu model on AMD host with intel's iommu    
> >>
> >> In principle it would be less likely to occur. But you would still need
> >> to mark the same range as reserved. The limitation is on DMA occuring
> >> on those IOVAs (host or guest) coinciding with that range, so you would
> >> want to inform the guest that at least those should be avoided.
> >>  
> >>>  * start QEMU with AMD cpu model and AMD's iommu on Intel host    
> >>
> >> Here you would probably only mark the range, solely for honoring how hardware
> >> is usually represented. But really, on Intel, nothing stops you from exposing the
> >> aforementioned range as RAM.
> >>  
> >>>  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
> >>>     
> >> This one is tricky. Because you can hotplug a VFIO device later on,
> >> I opted for always marking the reserved range. If you don't use VFIO you're good, but
> >> otherwise you would still need reserved. But I am not sure how qtest is used
> >> today for testing huge guests.  
> > I do not know if there are VFIO tests in qtest (probably nope, since that
> > could require a host configured for that), but we can add a test
> > for his memory quirk (assuming phys-bits won't get in the way)
> >   
> 
> 	Joao
>
Joao Martins June 28, 2021, 1:43 p.m. UTC | #11
On 6/28/21 2:25 PM, Igor Mammedov wrote:
> On Wed, 23 Jun 2021 14:07:29 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 6/23/21 1:09 PM, Igor Mammedov wrote:
>>> On Wed, 23 Jun 2021 10:51:59 +0100
>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>   
>>>> On 6/23/21 10:03 AM, Igor Mammedov wrote:  
>>>>> On Tue, 22 Jun 2021 16:49:00 +0100
>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>     
>>>>>> It is assumed that the whole GPA space is available to be
>>>>>> DMA addressable, within a given address space limit. Since
>>>>>> v5.4 based that is not true, and VFIO will validate whether
>>>>>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
>>>>>> on behalf of some specific devices or platform-defined.
>>>>>>
>>>>>> AMD systems with an IOMMU are examples of such platforms and
>>>>>> particularly may export only these ranges as allowed:
>>>>>>
>>>>>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
>>>>>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
>>>>>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
>>>>>>
>>>>>> We already know of accounting for the 4G hole, albeit if the
>>>>>> guest is big enough we will fail to allocate a >1010G given
>>>>>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
>>>>>>
>>>>>> When creating the region above 4G, take into account what
>>>>>> IOVAs are allowed by defining the known allowed ranges
>>>>>> and search for the next free IOVA ranges. When finding a
>>>>>> invalid IOVA we mark them as reserved and proceed to the
>>>>>> next allowed IOVA region.
>>>>>>
>>>>>> After accounting for the 1Tb hole on AMD hosts, mtree should
>>>>>> look like:
>>>>>>
>>>>>> 0000000100000000-000000fcffffffff (prio 0, i/o):
>>>>>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
>>>>>> 0000010000000000-000001037fffffff (prio 0, i/o):
>>>>>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff    
>>>>>
>>>>> You are talking here about GPA which is guest specific thing
>>>>> and then somehow it becomes tied to host. For bystanders it's
>>>>> not clear from above commit message how both are related.
>>>>> I'd add here an explicit explanation how AMD host is related GPAs
>>>>> and clarify where you are talking about guest/host side.
>>>>>     
>>>> OK, makes sense.
>>>>
>>>> Perhaps using IOVA makes it easier to understand. I said GPA because
>>>> there's an 1:1 mapping between GPA and IOVA (if you're not using vIOMMU).  
>>>
>>> IOVA may be a too broad term, maybe explain it in terms of GPA and HPA
>>> and why it does matter on each side (host/guest)
>>>   
>>
>> I used the term IOVA specially because that is applicable to Host IOVA or
>> Guest IOVA (same rules apply as this is not special cased for VMs). So,
>> regardless of whether we have guest mode page tables, or just host
>> iommu page tables, this address range should be reserved and not used.
> 
> IOVA doesn't make it any clearer, on contrary it's more confusing.
> 
> And does host's HPA matter at all? (if host's firmware isn't broken,
> it should never use nor advertise 1Tb hole). 
> So we probably talking here only about GPA only.
> 
For the case at hand in this series, yes, it's only the GPA that we care about.

Perhaps I misunderstood your earlier comment where you said how HPAs were
affected, so I was trying to frame the problem statement in a guest/host-agnostic
manner by using IOVA, given this is all related to IOMMU reserved ranges.
I'll stick to GPA to avoid any confusion -- as that's what matters for this series.

>>>>> also what about usecases:
>>>>>  * start QEMU with Intel cpu model on AMD host with intel's iommu    
>>>>
>>>> In principle it would be less likely to occur. But you would still need
>>>> to mark the same range as reserved. The limitation is on DMA occuring
>>>> on those IOVAs (host or guest) coinciding with that range, so you would
>>>> want to inform the guest that at least those should be avoided.
>>>>  
>>>>>  * start QEMU with AMD cpu model and AMD's iommu on Intel host    
>>>>
>>>> Here you would probably only mark the range, solely for honoring how hardware
>>>> is usually represented. But really, on Intel, nothing stops you from exposing the
>>>> aforementioned range as RAM.
>>>>  
>>>>>  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
>>>>>     
>>>> This one is tricky. Because you can hotplug a VFIO device later on,
>>>> I opted for always marking the reserved range. If you don't use VFIO you're good, but
>>>> otherwise you would still need reserved. But I am not sure how qtest is used
>>>> today for testing huge guests.  
>>> I do not know if there are VFIO tests in qtest (probably nope, since that
>>> could require a host configured for that), but we can add a test
>>> for his memory quirk (assuming phys-bits won't get in the way)
>>>   
>>
>> 	Joao
>>
>
Igor Mammedov June 28, 2021, 2:32 p.m. UTC | #12
On Wed, 23 Jun 2021 14:04:19 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 6/23/21 12:39 PM, Igor Mammedov wrote:
> > On Wed, 23 Jun 2021 10:37:38 +0100
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> On 6/23/21 8:11 AM, Igor Mammedov wrote:  
> >>> On Tue, 22 Jun 2021 16:49:00 +0100
> >>> Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>     
> >>>> It is assumed that the whole GPA space is available to be
> >>>> DMA addressable, within a given address space limit. Since
> >>>> v5.4 based that is not true, and VFIO will validate whether
> >>>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
> >>>> on behalf of some specific devices or platform-defined.
> >>>>
> >>>> AMD systems with an IOMMU are examples of such platforms and
> >>>> particularly may export only these ranges as allowed:
> >>>>
> >>>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> >>>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> >>>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> >>>>
> >>>> We already know of accounting for the 4G hole, albeit if the
> >>>> guest is big enough we will fail to allocate a >1010G given
> >>>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> >>>>
> >>>> When creating the region above 4G, take into account what
> >>>> IOVAs are allowed by defining the known allowed ranges
> >>>> and search for the next free IOVA ranges. When finding a
> >>>> invalid IOVA we mark them as reserved and proceed to the
> >>>> next allowed IOVA region.
> >>>>
> >>>> After accounting for the 1Tb hole on AMD hosts, mtree should
> >>>> look like:
> >>>>
> >>>> 0000000100000000-000000fcffffffff (prio 0, i/o):
> >>>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> >>>> 0000010000000000-000001037fffffff (prio 0, i/o):
> >>>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff    
> >>>
> >>> why not push whole ram-above-4g above 1Tb mark
> >>> when RAM is sufficiently large (regardless of used host),
> >>> instead of creating yet another hole and all complexity it brings along?
> >>>     
> >>
> >> There's the problem with CMOS which describes memory above 4G, part of the
> >> reason I cap it to the 1TB minus the reserved range i.e. for AMD, CMOS would
> >> only describe up to 1T.
> >>
> >> But should we not care about that then it's an option, I suppose.  
> > we probably do not care about CMOS with so large RAM,
> > as long as QEMU generates correct E820 (cmos mattered only with old Seabios
> > which used it for generating memory map)
> >   
> OK, good to know.
> 
> Any extension on CMOS would probably also be out of spec.
RAM size in CMOS is approximate at best, as it doesn't account for all holes;
anyway, it's irrelevant if we aren't changing the RAM size.
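
For context, the legacy encoding is roughly the following (paraphrased from
QEMU's pc_cmos_init(); register numbers and helper name are quoted from memory,
so treat them as approximate):

    /* RAM above 4 GiB is reported as a single size in 64 KiB units --
     * there is no way to describe holes in this interface at all. */
    val = x86ms->above_4g_mem_size / 65536;
    rtc_set_memory(s, 0x5b, val);
    rtc_set_memory(s, 0x5c, val >> 8);
    rtc_set_memory(s, 0x5d, val >> 16);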

> >> We would waste 1Tb of address space because of 12G, and btw the logic here
> >> is not so different than the 4G hole, in fact could probably share this
> >> with it.  
> > the main reason I'm looking for alternative, is complexity
> > of making hole brings in. At this point, we can't do anything
> > with 4G hole as it's already there, but we can try to avoid that
> > for high RAM and keep rules there simple as it's now.
> >   
> Right. But for what is worth, that complexity is spread in two parts:
> 
> 1) dealing with a sparse RAM model (with more than one hole)
> 
> 2) offsetting everything else that assumes a linear RAM map.
> 
> I don't think that even if we shift start of RAM to after 1TB boundary that
> we would get away top solving item 2 -- which personally is where I find this
> a tad bit more hairy. So it would probably make this patch complexity smaller, but
> not vary much in how spread the changes get.

you are already shifting the hotplug area behind 1Tb in [2/6],
so to be consistent just do the same for the 4Gb+ alias as well.
That will automatically solve the issues in patch 4/6, and all
that at the cost of several lines in one place vs this 200+ LOC series.
Both approaches are a kludge, but shifting everything over the 1Tb mark
is the simplest one.
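
A minimal sketch of what that could look like in pc_memory_init() -- purely
illustrative, not what either patch implements; host_is_amd() is a hypothetical
helper standing in for the host_cpuid() check this series adds:

    hwaddr above_4g_start = 4 * GiB;

    /* Illustration only: keep a single ram-above-4g alias, but start it
     * at 1TiB when the guest RAM would otherwise overlap the
     * HyperTransport hole, instead of splitting it into two aliases. */
    if (host_is_amd() &&
        above_4g_start + x86ms->above_4g_mem_size - 1 > AMD_MAX_PHYSADDR_BELOW_1TB) {
        above_4g_start = 1 * TiB;
    }

    memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", machine->ram,
                             x86ms->below_4g_mem_size, x86ms->above_4g_mem_size);
    memory_region_add_subregion(system_memory, above_4g_start, ram_above_4g);
    e820_add_entry(above_4g_start, x86ms->above_4g_mem_size, E820_RAM);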

If there were/is actual demand for sparse/non-linear RAM layouts,
I'd switch to pc-dimm-based RAM (i.e. generalize it to handle
RAM below 4G and let users specify their own memory map;
see below for more).

> > Also partitioning/splitting main RAM is one of the things that
> > gets in the way converting it to PC-DIMMMs model.
> >   
> Can you expand on that? (a link to a series is enough)
There is no series so far, only a rough idea.

The current initial RAM, with its rather arbitrary splitting rules,
doesn't map very well onto the pc-dimm device model.
Unfortunately I don't see a way to convert initial RAM to
the pc-dimm model without breaking CLI/ABI/migration.

As a way forward, we can restrict the legacy layout to old
machine versions and switch to pc-dimm-based initial RAM
for new machine versions.

Then users will be able to specify the GPA where each pc-dimm shall
be mapped. For reserving GPA ranges we can change the current GPA
allocator. Alternatively, a somewhat more flexible approach could be
a dummy memory device that consumes a reserved range so
that no one could assign a pc-dimm there; this way users can define
arbitrary (subject to the alignment restrictions we put on them) sparse
memory layouts without modifying QEMU for yet another hole.
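
Something along these lines, purely as an illustration of the idea -- the
pc-dimm "addr" property and memory-backend-ram exist today, but pc-dimm-backed
initial RAM and the "reserved-region" device below are made up:

    # hypothetical future CLI; low RAM (below 4G) omitted for brevity
    -object memory-backend-ram,id=m0,size=1008G \
    -device pc-dimm,id=dimm0,memdev=m0,addr=0x100000000 \
    -device reserved-region,addr=0xfd00000000,size=12G \
    -object memory-backend-ram,id=m1,size=14G \
    -device pc-dimm,id=dimm1,memdev=m1,addr=0x10000000000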


> > Loosing 1Tb of address space might be acceptable on a host
> > that can handle such amounts of RAM
> >   
>
Igor Mammedov June 28, 2021, 2:42 p.m. UTC | #13
On Thu, 24 Jun 2021 10:32:05 +0100
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Igor Mammedov (imammedo@redhat.com) wrote:
> > On Tue, 22 Jun 2021 16:49:00 +0100
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> > > It is assumed that the whole GPA space is available to be
> > > DMA addressable, within a given address space limit. Since
> > > v5.4 based that is not true, and VFIO will validate whether
> > > the selected IOVA is indeed valid i.e. not reserved by IOMMU
> > > on behalf of some specific devices or platform-defined.
> > > 
> > > AMD systems with an IOMMU are examples of such platforms and
> > > particularly may export only these ranges as allowed:
> > > 
> > > 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> > > 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> > > 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> > > 
> > > We already know of accounting for the 4G hole, albeit if the
> > > guest is big enough we will fail to allocate a >1010G given
> > > the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> > > 
> > > When creating the region above 4G, take into account what
> > > IOVAs are allowed by defining the known allowed ranges
> > > and search for the next free IOVA ranges. When finding a
> > > invalid IOVA we mark them as reserved and proceed to the
> > > next allowed IOVA region.
> > > 
> > > After accounting for the 1Tb hole on AMD hosts, mtree should
> > > look like:
> > > 
> > > 0000000100000000-000000fcffffffff (prio 0, i/o):
> > > 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> > > 0000010000000000-000001037fffffff (prio 0, i/o):
> > > 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff  
> > 
> > You are talking here about GPA which is guest specific thing
> > and then somehow it becomes tied to host. For bystanders it's
> > not clear from above commit message how both are related.
> > I'd add here an explicit explanation how AMD host is related GPAs
> > and clarify where you are talking about guest/host side.
> > 
> > also what about usecases:
> >  * start QEMU with Intel cpu model on AMD host with intel's iommu  

I vaguely recall KVM fixing up the vendor_id to the host's CPU type
without redefining the rest of the CPU model, but I can't find it anymore.
Though I'm not sure how the guest kernel would behave in this mixed case;
the guest might get confused.

> Does that work?
> 
> >  * start QEMU with AMD cpu model and AMD's iommu on Intel host  
> 
> Isn't the case we commonly do:
>    * start QEMU with AMD cpu model on AMD host with Intel iommu?
> Dave
> 
> >  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
> >   
> > > Co-developed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> > > Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> > > Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> > > ---
> > >  hw/i386/pc.c         | 103 +++++++++++++++++++++++++++++++++++++++----
> > >  include/hw/i386/pc.h |  57 ++++++++++++++++++++++++
> > >  target/i386/cpu.h    |   3 ++
> > >  3 files changed, 154 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > > index c6d8d0d84d91..52a5473ba846 100644
> > > --- a/hw/i386/pc.c
> > > +++ b/hw/i386/pc.c
> > > @@ -91,6 +91,7 @@
> > >  #include "qapi/qmp/qerror.h"
> > >  #include "e820_memory_layout.h"
> > >  #include "fw_cfg.h"
> > > +#include "target/i386/cpu.h"
> > >  #include "trace.h"
> > >  #include CONFIG_DEVICES
> > >  
> > > @@ -860,6 +861,93 @@ void xen_load_linux(PCMachineState *pcms)
> > >      x86ms->fw_cfg = fw_cfg;
> > >  }
> > >  
> > > +struct GPARange usable_iova_ranges[] = {
> > > +    { .start = 4 * GiB, .end = UINT64_MAX, .name = "ram-above-4g" },
> > > +
> > > +/*
> > > + * AMD systems with an IOMMU have an additional hole close to the
> > > + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
> > > + * on kernel version, VFIO may or may not let you DMA map those ranges.
> > > + * Starting v5.4 we validate it, and can't create guests on AMD machines
> > > + * with certain memory sizes. The range is:
> > > + *
> > > + * FD_0000_0000h - FF_FFFF_FFFFh
> > > + *
> > > + * The ranges represent the following:
> > > + *
> > > + * Base Address   Top Address  Use
> > > + *
> > > + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space
> > > + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl
> > > + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK
> > > + * FD_F910_0000h FD_F91F_FFFFh System Management
> > > + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables
> > > + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation
> > > + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space
> > > + * FD_FE00_0000h FD_FFFF_FFFFh Configuration
> > > + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages
> > > + * FE_2000_0000h FF_FFFF_FFFFh Reserved
> > > + *
> > > + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology",
> > > + * Table 3: Special Address Controls (GPA) for more information.
> > > + */
> > > +#define DEFAULT_NR_USABLE_IOVAS 1
> > > +#define AMD_MAX_PHYSADDR_BELOW_1TB  0xfcffffffff
> > > +    { .start = 1 * TiB, .end = UINT64_MAX, .name = "ram-above-1t" },
> > > +};
> > > +
> > > + uint32_t nb_iova_ranges = DEFAULT_NR_USABLE_IOVAS;
> > > +
> > > +static void init_usable_iova_ranges(void)
> > > +{
> > > +    uint32_t eax, vendor[3];
> > > +
> > > +    host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]);
> > > +    if (IS_AMD_VENDOR(vendor)) {
> > > +        usable_iova_ranges[0].end = AMD_MAX_PHYSADDR_BELOW_1TB;
> > > +        nb_iova_ranges++;
> > > +    }
> > > +}
> > > +
> > > +static void add_memory_region(MemoryRegion *system_memory, MemoryRegion *ram,
> > > +                                hwaddr base, hwaddr size, hwaddr offset)
> > > +{
> > > +    hwaddr start, region_size, resv_start, resv_end;
> > > +    struct GPARange *range;
> > > +    MemoryRegion *region;
> > > +    uint32_t index;
> > > +
> > > +    for_each_usable_range(index, base, size, range, start, region_size) {
> > > +        region = g_malloc(sizeof(*region));
> > > +        memory_region_init_alias(region, NULL, range->name, ram,
> > > +                                 offset, region_size);
> > > +        memory_region_add_subregion(system_memory, start, region);
> > > +        e820_add_entry(start, region_size, E820_RAM);
> > > +
> > > +        assert(size >= region_size);
> > > +        if (size == region_size) {
> > > +            return;
> > > +        }
> > > +
> > > +        /*
> > > +         * There's memory left to create a region for, so there should be
> > > +         * another valid IOVA range left.  Creating the reserved region
> > > +         * would also be pointless.
> > > +         */
> > > +        if (index + 1 == nb_iova_ranges) {
> > > +            return;
> > > +        }
> > > +
> > > +        resv_start = start + region_size;
> > > +        resv_end = usable_iova_ranges[index + 1].start;
> > > +
> > > +        /* Create a reserved region in the IOVA hole. */
> > > +        e820_add_entry(resv_start, resv_end - resv_start, E820_RESERVED);
> > > +
> > > +        offset += region_size;
> > > +    }
> > > +}
> > > +
> > >  void pc_memory_init(PCMachineState *pcms,
> > >                      MemoryRegion *system_memory,
> > >                      MemoryRegion *rom_memory,
> > > @@ -867,7 +955,7 @@ void pc_memory_init(PCMachineState *pcms,
> > >  {
> > >      int linux_boot, i;
> > >      MemoryRegion *option_rom_mr;
> > > -    MemoryRegion *ram_below_4g, *ram_above_4g;
> > > +    MemoryRegion *ram_below_4g;
> > >      FWCfgState *fw_cfg;
> > >      MachineState *machine = MACHINE(pcms);
> > >      MachineClass *mc = MACHINE_GET_CLASS(machine);
> > > @@ -877,6 +965,8 @@ void pc_memory_init(PCMachineState *pcms,
> > >      assert(machine->ram_size == x86ms->below_4g_mem_size +
> > >                                  x86ms->above_4g_mem_size);
> > >  
> > > +    init_usable_iova_ranges();
> > > +
> > >      linux_boot = (machine->kernel_filename != NULL);
> > >  
> > >      /*
> > > @@ -888,16 +978,11 @@ void pc_memory_init(PCMachineState *pcms,
> > >      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
> > >                               0, x86ms->below_4g_mem_size);
> > >      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> > > +
> > >      e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
> > >      if (x86ms->above_4g_mem_size > 0) {
> > > -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> > > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> > > -                                 machine->ram,
> > > -                                 x86ms->below_4g_mem_size,
> > > -                                 x86ms->above_4g_mem_size);
> > > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > > -                                    ram_above_4g);
> > > -        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
> > > +        add_memory_region(system_memory, machine->ram, 4 * GiB,
> > > +                          x86ms->above_4g_mem_size, x86ms->below_4g_mem_size);
> > >      }
> > >  
> > >      if (!pcmc->has_reserved_memory &&
> > > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > > index 1522a3359a93..73b8e2900c72 100644
> > > --- a/include/hw/i386/pc.h
> > > +++ b/include/hw/i386/pc.h
> > > @@ -151,6 +151,63 @@ void pc_guest_info_init(PCMachineState *pcms);
> > >  #define PCI_HOST_BELOW_4G_MEM_SIZE     "below-4g-mem-size"
> > >  #define PCI_HOST_ABOVE_4G_MEM_SIZE     "above-4g-mem-size"
> > >  
> > > +struct GPARange {
> > > +    uint64_t start;
> > > +    uint64_t end;
> > > +#define range_len(r) ((r).end - (r).start + 1)
> > > +
> > > +    const char *name;
> > > +};
> > > +
> > > +extern uint32_t nb_iova_ranges;
> > > +extern struct GPARange usable_iova_ranges[];
> > > +
> > > +static inline void next_iova_range_index(uint32_t i, hwaddr base, hwaddr size,
> > > +                                         hwaddr *start, hwaddr *region_size,
> > > +                                         uint32_t *index, struct GPARange **r)
> > > +{
> > > +    struct GPARange *range;
> > > +
> > > +    while (i < nb_iova_ranges) {
> > > +        range = &usable_iova_ranges[i];
> > > +        if (range->end >= base) {
> > > +            break;
> > > +        }
> > > +        i++;
> > > +    }
> > > +
> > > +    *index = i;
> > > +    /* index is out of bounds or no region left to process */
> > > +    if (i >= nb_iova_ranges || !size) {
> > > +        return;
> > > +    }
> > > +
> > > +    *start = MAX(range->start, base);
> > > +    *region_size = MIN(range->end - *start + 1, size);
> > > +    *r = range;
> > > +}
> > > +
> > > +/*
> > > + * for_each_usable_range() - iterates over usable IOVA regions
> > > + *
> > > + * @i:      range index within usable_iova_ranges
> > > + * @base:   requested address we want to use
> > > + * @size:   total size of the region with @base
> > > + * @r:      range indexed by @i within usable_iova_ranges
> > > + * @s:      calculated usable iova address base
> > > + * @i_size: calculated usable iova region size starting @s
> > > + *
> > > + * Use this iterator to walk through usable GPA ranges. Platforms
> > > + * such as AMD with IOMMU capable hardware restrict certain address
> > > + * ranges for Hyper Transport. This iterator helper lets user avoid
> > > + * using those said reserved ranges.
> > > + */
> > > +#define for_each_usable_range(i, base, size, r, s, i_size) \
> > > +    for (s = 0, i_size = 0, r = NULL, \
> > > +         next_iova_range_index(0, base, size, &s, &i_size, &i, &r); \
> > > +         i < nb_iova_ranges && size != 0; \
> > > +         size -= i_size, \
> > > +         next_iova_range_index(i + 1, base, size, &s, &i_size, &i, &r))
> > >  
> > >  void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
> > >                              MemoryRegion *pci_address_space);
> > > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > > index 1e11071d817b..f8f15a4523c6 100644
> > > --- a/target/i386/cpu.h
> > > +++ b/target/i386/cpu.h
> > > @@ -869,6 +869,9 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
> > >  #define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> > >                           (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> > >                           (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> > > +#define IS_AMD_VENDOR(vendor) ((vendor[0]) == CPUID_VENDOR_AMD_1 && \
> > > +                         (vendor[1]) == CPUID_VENDOR_AMD_2 && \
> > > +                         (vendor[2]) == CPUID_VENDOR_AMD_3)
> > >  
> > >  #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
> > >  #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */  
> > 
> >
Igor Mammedov June 28, 2021, 3:21 p.m. UTC | #14
On Mon, 28 Jun 2021 14:43:48 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 6/28/21 2:25 PM, Igor Mammedov wrote:
> > On Wed, 23 Jun 2021 14:07:29 +0100
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> On 6/23/21 1:09 PM, Igor Mammedov wrote:  
> >>> On Wed, 23 Jun 2021 10:51:59 +0100
> >>> Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>     
> >>>> On 6/23/21 10:03 AM, Igor Mammedov wrote:    
> >>>>> On Tue, 22 Jun 2021 16:49:00 +0100
> >>>>> Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>>>       
> >>>>>> It is assumed that the whole GPA space is available to be
> >>>>>> DMA addressable, within a given address space limit. Since
> >>>>>> v5.4 based that is not true, and VFIO will validate whether
> >>>>>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
> >>>>>> on behalf of some specific devices or platform-defined.
> >>>>>>
> >>>>>> AMD systems with an IOMMU are examples of such platforms and
> >>>>>> particularly may export only these ranges as allowed:
> >>>>>>
> >>>>>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
> >>>>>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
> >>>>>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
> >>>>>>
> >>>>>> We already know of accounting for the 4G hole, albeit if the
> >>>>>> guest is big enough we will fail to allocate a >1010G given
> >>>>>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
> >>>>>>
> >>>>>> When creating the region above 4G, take into account what
> >>>>>> IOVAs are allowed by defining the known allowed ranges
> >>>>>> and search for the next free IOVA ranges. When finding a
> >>>>>> invalid IOVA we mark them as reserved and proceed to the
> >>>>>> next allowed IOVA region.
> >>>>>>
> >>>>>> After accounting for the 1Tb hole on AMD hosts, mtree should
> >>>>>> look like:
> >>>>>>
> >>>>>> 0000000100000000-000000fcffffffff (prio 0, i/o):
> >>>>>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
> >>>>>> 0000010000000000-000001037fffffff (prio 0, i/o):
> >>>>>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff      
> >>>>>
> >>>>> You are talking here about GPA which is guest specific thing
> >>>>> and then somehow it becomes tied to host. For bystanders it's
> >>>>> not clear from above commit message how both are related.
> >>>>> I'd add here an explicit explanation how AMD host is related GPAs
> >>>>> and clarify where you are talking about guest/host side.
> >>>>>       
> >>>> OK, makes sense.
> >>>>
> >>>> Perhaps using IOVA makes it easier to understand. I said GPA because
> >>>> there's an 1:1 mapping between GPA and IOVA (if you're not using vIOMMU).    
> >>>
> >>> IOVA may be a too broad term, maybe explain it in terms of GPA and HPA
> >>> and why it does matter on each side (host/guest)
> >>>     
> >>
> >> I used the term IOVA specially because that is applicable to Host IOVA or
> >> Guest IOVA (same rules apply as this is not special cased for VMs). So,
> >> regardless of whether we have guest mode page tables, or just host
> >> iommu page tables, this address range should be reserved and not used.  
> > 
> > IOVA doesn't make it any clearer, on contrary it's more confusing.
> > 
> > And does host's HPA matter at all? (if host's firmware isn't broken,
> > it should never use nor advertise 1Tb hole). 
> > So we probably talking here only about GPA only.
> >   
> For the case in point for the series, yes it's only GPA that we care about.
> 
> Perhaps I misunderstood your earlier comment where you said how HPAs were
> affected, so I was trying to encompass the problem statement in a Guest/Host
> agnostic manner by using IOVA given this is all related to IOMMU reserved ranges.
> I'll stick to GPA to avoid any confusion -- as that's what matters for this series.

Even better would be to add a reference here to the spec where it says so.

> 
> >>>>> also what about usecases:
> >>>>>  * start QEMU with Intel cpu model on AMD host with intel's iommu      
> >>>>
> >>>> In principle it would be less likely to occur. But you would still need
> >>>> to mark the same range as reserved. The limitation is on DMA occuring
> >>>> on those IOVAs (host or guest) coinciding with that range, so you would
> >>>> want to inform the guest that at least those should be avoided.
> >>>>    
> >>>>>  * start QEMU with AMD cpu model and AMD's iommu on Intel host      
> >>>>
> >>>> Here you would probably only mark the range, solely for honoring how hardware
> >>>> is usually represented. But really, on Intel, nothing stops you from exposing the
> >>>> aforementioned range as RAM.
> >>>>    
> >>>>>  * start QEMU in TCG mode on AMD host (mostly form qtest point ot view)
> >>>>>       
> >>>> This one is tricky. Because you can hotplug a VFIO device later on,
> >>>> I opted for always marking the reserved range. If you don't use VFIO you're good, but
> >>>> otherwise you would still need reserved. But I am not sure how qtest is used
> >>>> today for testing huge guests.    
> >>> I do not know if there are VFIO tests in qtest (probably nope, since that
> >>> could require a host configured for that), but we can add a test
> >>> for his memory quirk (assuming phys-bits won't get in the way)
> >>>     
> >>
> >> 	Joao
> >>  
> >   
>
Joao Martins Aug. 6, 2021, 10:41 a.m. UTC | #15
[sorry for the thread necromancy -- got preempted in other stuff
 and am slowly coming back to this]

On 6/28/21 3:32 PM, Igor Mammedov wrote:
> On Wed, 23 Jun 2021 14:04:19 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 6/23/21 12:39 PM, Igor Mammedov wrote:
>>> On Wed, 23 Jun 2021 10:37:38 +0100
>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>   
>>>> On 6/23/21 8:11 AM, Igor Mammedov wrote:  
>>>>> On Tue, 22 Jun 2021 16:49:00 +0100
>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>     
>>>>>> It is assumed that the whole GPA space is available to be
>>>>>> DMA addressable, within a given address space limit. Since
>>>>>> v5.4 based that is not true, and VFIO will validate whether
>>>>>> the selected IOVA is indeed valid i.e. not reserved by IOMMU
>>>>>> on behalf of some specific devices or platform-defined.
>>>>>>
>>>>>> AMD systems with an IOMMU are examples of such platforms and
>>>>>> particularly may export only these ranges as allowed:
>>>>>>
>>>>>> 	0000000000000000 - 00000000fedfffff (0      .. 3.982G)
>>>>>> 	00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
>>>>>> 	0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)
>>>>>>
>>>>>> We already know of accounting for the 4G hole, albeit if the
>>>>>> guest is big enough we will fail to allocate a >1010G given
>>>>>> the ~12G hole at the 1Tb boundary, reserved for HyperTransport.
>>>>>>
>>>>>> When creating the region above 4G, take into account what
>>>>>> IOVAs are allowed by defining the known allowed ranges
>>>>>> and search for the next free IOVA ranges. When finding a
>>>>>> invalid IOVA we mark them as reserved and proceed to the
>>>>>> next allowed IOVA region.
>>>>>>
>>>>>> After accounting for the 1Tb hole on AMD hosts, mtree should
>>>>>> look like:
>>>>>>
>>>>>> 0000000100000000-000000fcffffffff (prio 0, i/o):
>>>>>> 	alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
>>>>>> 0000010000000000-000001037fffffff (prio 0, i/o):
>>>>>> 	alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff    
>>>>>
>>>>> why not push whole ram-above-4g above 1Tb mark
>>>>> when RAM is sufficiently large (regardless of used host),
>>>>> instead of creating yet another hole and all complexity it brings along?
>>>>>     
>>>>
>>>> There's the problem with CMOS which describes memory above 4G, part of the
>>>> reason I cap it to the 1TB minus the reserved range i.e. for AMD, CMOS would
>>>> only describe up to 1T.
>>>>
>>>> But should we not care about that then it's an option, I suppose.  
>>> we probably do not care about CMOS with so large RAM,
>>> as long as QEMU generates correct E820 (cmos mattered only with old Seabios
>>> which used it for generating memory map)
>>>   
>> OK, good to know.
>>
>> Any extension on CMOS would probably also be out of spec.
> ram size in CMOS is approximate at best as doesn't account for all holes,
> anyways it's irrelevant if we aren't changing RAM size.
> 
/me nods

>>>> We would waste 1Tb of address space because of 12G, and btw the logic here
>>>> is not so different than the 4G hole, in fact could probably share this
>>>> with it.  
>>> the main reason I'm looking for alternative, is complexity
>>> of making hole brings in. At this point, we can't do anything
>>> with 4G hole as it's already there, but we can try to avoid that
>>> for high RAM and keep rules there simple as it's now.
>>>   
>> Right. But for what is worth, that complexity is spread in two parts:
>>
>> 1) dealing with a sparse RAM model (with more than one hole)
>>
>> 2) offsetting everything else that assumes a linear RAM map.
>>
>> I don't think that even if we shift start of RAM to after 1TB boundary that
>> we would get away top solving item 2 -- which personally is where I find this
>> a tad bit more hairy. So it would probably make this patch complexity smaller, but
>> not vary much in how spread the changes get.
> 
> you are already shifting hotplug area behind 1Tb in [2/6],
> so to be consistent just do the same for 4Gb+ alias a well.
> That will automatically solve issues in patch 4/6, and all
> that at cost of several lines in one place vs this 200+ LOC series.
> Both approaches are a kludge but shifting everything over 1Tb mark
> is the simplest one.
> 

I understand that shifting the above-4g-mem region to start at 1T would
be simpler and a lot less to maintain, considering we can eliminate
most of the complexity of finding holes to place RAM ranges in.

Let me try your suggestion.

But I'm still somewhat uneasy about ending up with a gigantic hole from
below 4G all the way up to 1T. I have nothing to base this on *right now*,
so it's nothing more than a vague concern, but hopefully there aren't
assumptions elsewhere in OSes/firmware etc.


> If there were/is actual demand for sparse/non linear RAM layouts,
> I'd switch to pc-dimm based RAM (i.e. generalize it to handle
> RAM below 4G and let users specify their own memory map,
> see below for more).
> 
It seems less error-prone if we learn to deal with these oddities
with pc-dimm-based RAM (long term). At least it would open up the
other avenues Alex suggested in the cover letter. And more importantly,
it would let us represent all this closer to what the hardware actually
does (which is what I was attempting with this series).

>>> Also partitioning/splitting main RAM is one of the things that
>>> gets in the way converting it to PC-DIMMMs model.
>>>   
>> Can you expand on that? (a link to a series is enough)
> There is no series so far, only a rough idea.
> 
> current initial RAM with rather arbitrary splitting rules,
> doesn't map very well with pc-dimm device model.
> Unfortunately I don't see a way to convert initial RAM to
> pc-dimm model without breaking CLI/ABI/migration.
> 
> As a way forward, we can restrict legacy layout to old
> machine versions, and switch to pc-dimm based initial RAM
> for new machine versions.
> 
> Then users will be able to specify GPA where each pc-dimm shall
> be mapped. For reserving GPA ranges we can change current GPA
> allocator. Alternatively a bit more flexible approach could be
> a dummy memory device that would consume a reserved range so
> that no one could assign pc-dimm there, this way user can define
> arbitrary (subject to alignment restrictions we put on it) sparse
> memory layouts without modifying QEMU for yet another hole.
> 
Yeap -- sounds a whole lot more flexible longterm.

> 
>>> Loosing 1Tb of address space might be acceptable on a host
>>> that can handle such amounts of RAM
diff mbox series

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index c6d8d0d84d91..52a5473ba846 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -91,6 +91,7 @@ 
 #include "qapi/qmp/qerror.h"
 #include "e820_memory_layout.h"
 #include "fw_cfg.h"
+#include "target/i386/cpu.h"
 #include "trace.h"
 #include CONFIG_DEVICES
 
@@ -860,6 +861,93 @@  void xen_load_linux(PCMachineState *pcms)
     x86ms->fw_cfg = fw_cfg;
 }
 
+struct GPARange usable_iova_ranges[] = {
+    { .start = 4 * GiB, .end = UINT64_MAX, .name = "ram-above-4g" },
+
+/*
+ * AMD systems with an IOMMU have an additional hole close to the
+ * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
+ * on kernel version, VFIO may or may not let you DMA map those ranges.
+ * Starting v5.4 we validate it, and can't create guests on AMD machines
+ * with certain memory sizes. The range is:
+ *
+ * FD_0000_0000h - FF_FFFF_FFFFh
+ *
+ * The ranges represent the following:
+ *
+ * Base Address   Top Address  Use
+ *
+ * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space
+ * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl
+ * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK
+ * FD_F910_0000h FD_F91F_FFFFh System Management
+ * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables
+ * FD_FB00_0000h FD_FBFF_FFFFh Address Translation
+ * FD_FC00_0000h FD_FDFF_FFFFh I/O Space
+ * FD_FE00_0000h FD_FFFF_FFFFh Configuration
+ * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages
+ * FE_2000_0000h FF_FFFF_FFFFh Reserved
+ *
+ * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology",
+ * Table 3: Special Address Controls (GPA) for more information.
+ */
+#define DEFAULT_NR_USABLE_IOVAS 1
+#define AMD_MAX_PHYSADDR_BELOW_1TB  0xfcffffffff
+    { .start = 1 * TiB, .end = UINT64_MAX, .name = "ram-above-1t" },
+};
+
+ uint32_t nb_iova_ranges = DEFAULT_NR_USABLE_IOVAS;
+
+static void init_usable_iova_ranges(void)
+{
+    uint32_t eax, vendor[3];
+
+    host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]);
+    if (IS_AMD_VENDOR(vendor)) {
+        usable_iova_ranges[0].end = AMD_MAX_PHYSADDR_BELOW_1TB;
+        nb_iova_ranges++;
+    }
+}
+
+static void add_memory_region(MemoryRegion *system_memory, MemoryRegion *ram,
+                                hwaddr base, hwaddr size, hwaddr offset)
+{
+    hwaddr start, region_size, resv_start, resv_end;
+    struct GPARange *range;
+    MemoryRegion *region;
+    uint32_t index;
+
+    for_each_usable_range(index, base, size, range, start, region_size) {
+        region = g_malloc(sizeof(*region));
+        memory_region_init_alias(region, NULL, range->name, ram,
+                                 offset, region_size);
+        memory_region_add_subregion(system_memory, start, region);
+        e820_add_entry(start, region_size, E820_RAM);
+
+        assert(size >= region_size);
+        if (size == region_size) {
+            return;
+        }
+
+        /*
+         * There's memory left to create a region for, so there should be
+         * another valid IOVA range left.  Creating the reserved region
+         * would also be pointless.
+         */
+        if (index + 1 == nb_iova_ranges) {
+            return;
+        }
+
+        resv_start = start + region_size;
+        resv_end = usable_iova_ranges[index + 1].start;
+
+        /* Create a reserved region in the IOVA hole. */
+        e820_add_entry(resv_start, resv_end - resv_start, E820_RESERVED);
+
+        offset += region_size;
+    }
+}
+
 void pc_memory_init(PCMachineState *pcms,
                     MemoryRegion *system_memory,
                     MemoryRegion *rom_memory,
@@ -867,7 +955,7 @@  void pc_memory_init(PCMachineState *pcms,
 {
     int linux_boot, i;
     MemoryRegion *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g;
     FWCfgState *fw_cfg;
     MachineState *machine = MACHINE(pcms);
     MachineClass *mc = MACHINE_GET_CLASS(machine);
@@ -877,6 +965,8 @@  void pc_memory_init(PCMachineState *pcms,
     assert(machine->ram_size == x86ms->below_4g_mem_size +
                                 x86ms->above_4g_mem_size);
 
+    init_usable_iova_ranges();
+
     linux_boot = (machine->kernel_filename != NULL);
 
     /*
@@ -888,16 +978,11 @@  void pc_memory_init(PCMachineState *pcms,
     memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
                              0, x86ms->below_4g_mem_size);
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
+
     e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
     if (x86ms->above_4g_mem_size > 0) {
-        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
-                                 machine->ram,
-                                 x86ms->below_4g_mem_size,
-                                 x86ms->above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
-                                    ram_above_4g);
-        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
+        add_memory_region(system_memory, machine->ram, 4 * GiB,
+                          x86ms->above_4g_mem_size, x86ms->below_4g_mem_size);
     }
 
     if (!pcmc->has_reserved_memory &&
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 1522a3359a93..73b8e2900c72 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -151,6 +151,63 @@  void pc_guest_info_init(PCMachineState *pcms);
 #define PCI_HOST_BELOW_4G_MEM_SIZE     "below-4g-mem-size"
 #define PCI_HOST_ABOVE_4G_MEM_SIZE     "above-4g-mem-size"
 
+struct GPARange {
+    uint64_t start;
+    uint64_t end;
+#define range_len(r) ((r).end - (r).start + 1)
+
+    const char *name;
+};
+
+extern uint32_t nb_iova_ranges;
+extern struct GPARange usable_iova_ranges[];
+
+static inline void next_iova_range_index(uint32_t i, hwaddr base, hwaddr size,
+                                         hwaddr *start, hwaddr *region_size,
+                                         uint32_t *index, struct GPARange **r)
+{
+    struct GPARange *range;
+
+    while (i < nb_iova_ranges) {
+        range = &usable_iova_ranges[i];
+        if (range->end >= base) {
+            break;
+        }
+        i++;
+    }
+
+    *index = i;
+    /* index is out of bounds or no region left to process */
+    if (i >= nb_iova_ranges || !size) {
+        return;
+    }
+
+    *start = MAX(range->start, base);
+    *region_size = MIN(range->end - *start + 1, size);
+    *r = range;
+}
+
+/*
+ * for_each_usable_range() - iterates over usable IOVA regions
+ *
+ * @i:      range index within usable_iova_ranges
+ * @base:   requested address we want to use
+ * @size:   total size of the region with @base
+ * @r:      range indexed by @i within usable_iova_ranges
+ * @s:      calculated usable iova address base
+ * @i_size: calculated usable iova region size starting @s
+ *
+ * Use this iterator to walk through usable GPA ranges. Platforms
+ * such as AMD with IOMMU capable hardware restrict certain address
+ * ranges for Hyper Transport. This iterator helper lets user avoid
+ * using those said reserved ranges.
+ */
+#define for_each_usable_range(i, base, size, r, s, i_size) \
+    for (s = 0, i_size = 0, r = NULL, \
+         next_iova_range_index(0, base, size, &s, &i_size, &i, &r); \
+         i < nb_iova_ranges && size != 0; \
+         size -= i_size, \
+         next_iova_range_index(i + 1, base, size, &s, &i_size, &i, &r))
 
 void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
                             MemoryRegion *pci_address_space);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 1e11071d817b..f8f15a4523c6 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -869,6 +869,9 @@  typedef uint64_t FeatureWordArray[FEATURE_WORDS];
 #define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
                          (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
                          (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
+#define IS_AMD_VENDOR(vendor) ((vendor[0]) == CPUID_VENDOR_AMD_1 && \
+                         (vendor[1]) == CPUID_VENDOR_AMD_2 && \
+                         (vendor[2]) == CPUID_VENDOR_AMD_3)
 
 #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
 #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */
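
For readers who want to double-check the splitting arithmetic, below is a
standalone sketch that mirrors the iterator above but is re-implemented
without QEMU headers so it compiles on its own. The guest sizes are
assumptions chosen to reproduce the mtree from the commit message (1 TiB of
RAM, 2 GiB of it below 4G), and the output format only approximates mtree:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define GiB (1ULL << 30)
    #define TiB (1ULL << 40)
    #define AMD_MAX_PHYSADDR_BELOW_1TB 0xfcffffffffULL

    struct range { uint64_t start, end; const char *name; };

    /* usable GPA ranges on an AMD host, as defined by the patch */
    static struct range usable[] = {
        { 4 * GiB, AMD_MAX_PHYSADDR_BELOW_1TB, "ram-above-4g" },
        { 1 * TiB, UINT64_MAX,                 "ram-above-1t" },
    };

    int main(void)
    {
        uint64_t below_4g = 2 * GiB;         /* assumed low-RAM split */
        uint64_t size = 1 * TiB - below_4g;  /* RAM to place above 4G */
        uint64_t base = 4 * GiB;
        uint64_t offset = below_4g;          /* offset into pc.ram */
        unsigned i;

        for (i = 0; i < 2 && size; i++) {
            uint64_t start = base > usable[i].start ? base : usable[i].start;
            uint64_t len = usable[i].end - start + 1;

            if (len > size) {
                len = size;
            }
            printf("%016" PRIx64 "-%016" PRIx64 " alias %s @pc.ram "
                   "%016" PRIx64 "-%016" PRIx64 "\n",
                   start, start + len - 1, usable[i].name,
                   offset, offset + len - 1);
            size -= len;
            offset += len;
        }
        return 0;
    }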