diff mbox series

[v1,3/6] xen/arm: create dom0less virtio-pci DT node

Message ID 20240924162359.1390487-4-edgar.iglesias@gmail.com (mailing list archive)
State New
Headers show
Series xen/arm: Add Virtio-PCI for dom0less on ARM | expand

Commit Message

Edgar E. Iglesias Sept. 24, 2024, 4:23 p.m. UTC
From: Stewart Hildebrand <stewart.hildebrand@amd.com>

When virtio-pci is specified in the dom0less domU properties, create a
virtio-pci node in the guest's device tree. Set up an mmio handler with
a register for the guest to poll when the backend has connected and
virtio-pci bus is ready to be probed. Grant tables may be used by
specifying virtio-pci = "grants";.

[Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
 Make grants iommu-map cover the entire PCI bus.
 Add virtio-pci-ranges to specify memory-map for direct-mapped guests.
 Document virtio-pci dom0less fdt bindings.]
Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
---
 docs/misc/arm/device-tree/booting.txt |  21 +++
 xen/arch/arm/dom0less-build.c         | 238 ++++++++++++++++++++++++++
 xen/arch/arm/include/asm/kernel.h     |  15 ++
 3 files changed, 274 insertions(+)

Comments

Stefano Stabellini Sept. 24, 2024, 11:55 p.m. UTC | #1
On Tue, 24 Sep 2024, Edgar E. Iglesias wrote:
> From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> 
> When virtio-pci is specified in the dom0less domU properties, create a
> virtio-pci node in the guest's device tree. Set up an mmio handler with
> a register for the guest to poll when the backend has connected and
> virtio-pci bus is ready to be probed. Grant tables may be used by
> specifying virtio-pci = "grants";.
> 
> [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
>  Make grants iommu-map cover the entire PCI bus.
>  Add virtio-pci-ranges to specify memory-map for direct-mapped guests.
>  Document virtio-pci dom0less fdt bindings.]
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> ---
>  docs/misc/arm/device-tree/booting.txt |  21 +++
>  xen/arch/arm/dom0less-build.c         | 238 ++++++++++++++++++++++++++
>  xen/arch/arm/include/asm/kernel.h     |  15 ++
>  3 files changed, 274 insertions(+)
> 
> diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
> index 3a04f5c57f..82f3bd7026 100644
> --- a/docs/misc/arm/device-tree/booting.txt
> +++ b/docs/misc/arm/device-tree/booting.txt
> @@ -276,6 +276,27 @@ with the following properties:
>      passed through. This option is the default if this property is missing
>      and the user does not provide the device partial device tree for the domain.
>  
> +- virtio-pci
> +
> +    A string property specifying whether virtio-pci is enabled for the
> +    domain and if grant table mappings should be used. If no value is set
> +    this property is treated as a boolean and the same way as if set to
> +    "enabled".
> +    Possible property values are:
> +
> +    - "enabled"
> +    Virtio-pci is enabled for the domain.
> +
> +    - "grants"
> +    Virtio-pci is enabled for the domain and an grants IOMMU node will be
> +    generated in the domains device-tree.
> +
> +- virtio-pci-ranges
> +
> +    An optional array of 6 u32 values specifying the 2 cells base addresses of
> +    the ECAM, Memory and Prefetchable-Memory regions for virtio-pci. This is
> +    useful to avoid memory-map collisions when using direct-mapped guests.
> +
>  Under the "xen,domain" compatible node, one or more sub-nodes are present
>  for the DomU kernel and ramdisk.
>  
> diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
> index 09b65e44ae..dab24fa9e2 100644
> --- a/xen/arch/arm/dom0less-build.c
> +++ b/xen/arch/arm/dom0less-build.c
> @@ -586,6 +586,189 @@ static int __init domain_handle_dtb_bootmodule(struct domain *d,
>      return res;
>  }
>  
> +static int __init make_virtio_pci_domU_node(const struct kernel_info *kinfo)
> +{
> +    void *fdt = kinfo->fdt;
> +    /* reg is sized to be used for all the needed properties below */
> +    __be32 reg[(1 + (GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS)
> +               * 2];
> +    __be32 irq_map[4 * 4 * 8];
> +    __be32 *cells;
> +    char buf[22]; /* pcie@ + max 16 char address + '\0' */
> +    int res;
> +    int devfn, intx_pin;
> +    static const char compat[] = "pci-host-ecam-generic";
> +    static const char reg_names[] = "ecam";
> +
> +    if ( p2m_ipa_bits <= 40 ) {
> +        printk("PA bits %d is too small!\nvirtio-pci is only supported "
> +               "on platforms with PA bits > 40\n", p2m_ipa_bits);
> +        return -EINVAL;
> +    }
> +
> +    snprintf(buf, sizeof(buf), "pcie@%lx", kinfo->virtio_pci.ecam.base);
> +    dt_dprintk("Create virtio-pci node\n");
> +    res = fdt_begin_node(fdt, buf);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property(fdt, "compatible", compat, sizeof(compat));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_string(fdt, "device_type", "pci");
> +    if ( res )
> +        return res;
> +
> +    /* Create reg property */
> +    cells = &reg[0];
> +    dt_child_set_range(&cells, GUEST_ROOT_ADDRESS_CELLS, GUEST_ROOT_SIZE_CELLS,
> +                       kinfo->virtio_pci.ecam.base, GUEST_VIRTIO_PCI_ECAM_SIZE);
> +
> +    res = fdt_property(fdt, "reg", reg,
> +                       (GUEST_ROOT_ADDRESS_CELLS +
> +                        GUEST_ROOT_SIZE_CELLS) * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property(fdt, "reg-names", reg_names, sizeof(reg_names));
> +    if ( res )
> +        return res;
> +
> +    /* Create bus-range property */
> +    cells = &reg[0];
> +    dt_set_cell(&cells, 1, 0);
> +    dt_set_cell(&cells, 1, 0xff);
> +    res = fdt_property(fdt, "bus-range", reg, 2 * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_cell(fdt, "#address-cells", 3);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_cell(fdt, "#size-cells", 2);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_string(fdt, "status", "okay");
> +    if ( res )
> +        return res;
> +
> +    /*
> +     * Create ranges property as:
> +     * <(PCI bitfield) (PCI address) (CPU address) (Size)>
> +     */
> +    cells = &reg[0];
> +    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_MEM_TYPE);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_MEM_SIZE);
> +    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_PREFETCH_MEM_TYPE);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_PREFETCH_MEM_SIZE);
> +    res = fdt_property(fdt, "ranges", reg, 14 * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property(fdt, "dma-coherent", "", 0);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_cell(fdt, "#interrupt-cells", 1);
> +    if ( res )
> +        return res;
> +
> +    /*
> +     * PCI-to-PCI bridge specification
> +     * 9.1: Interrupt routing. Table 9-1
> +     *
> +     * the PCI Express Base Specification, Revision 2.1
> +     * 2.2.8.1: INTx interrupt signaling - Rules
> +     *          the Implementation Note
> +     *          Table 2-20
> +     */
> +    cells = &irq_map[0];
> +    for (devfn = 0; devfn <= 0x18; devfn += 8) {
> +        for (intx_pin = 0; intx_pin < 4; intx_pin++) {
> +            int irq = GUEST_VIRTIO_PCI_SPI_FIRST - 32;
> +            irq += ((intx_pin + PCI_SLOT(devfn)) % 4);
> +
> +            dt_set_cell(&cells, 1, devfn << 8);
> +            dt_set_cell(&cells, 1, 0);
> +            dt_set_cell(&cells, 1, 0);
> +            dt_set_cell(&cells, 1, intx_pin + 1);
> +            dt_set_cell(&cells, 1, kinfo->phandle_gic);
> +            /* 3 GIC cells.  */
> +            dt_set_cell(&cells, 1, 0);
> +            dt_set_cell(&cells, 1, irq);
> +            dt_set_cell(&cells, 1, DT_IRQ_TYPE_LEVEL_HIGH);
> +        }
> +    }
> +
> +    /* Assert we've sized irq_map correctly.  */
> +    BUG_ON(cells - &irq_map[0] != ARRAY_SIZE(irq_map));
> +
> +    res = fdt_property(fdt, "interrupt-map", irq_map, sizeof(irq_map));
> +    if ( res )
> +        return res;
> +
> +    cells = &reg[0];
> +    dt_set_cell(&cells, 1, cpu_to_be16(PCI_DEVFN(3, 0)));

Hi Edgar, what is this PCI_DEVFN(3, 0)  device?


> +    dt_set_cell(&cells, 1, 0x0);
> +    dt_set_cell(&cells, 1, 0x0);
> +    dt_set_cell(&cells, 1, 0x7);
> +    res = fdt_property(fdt, "interrupt-map-mask", reg, 4 * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
> +    {
> +        cells = &reg[0];
> +        dt_set_cell(&cells, 1, 0x0);
> +        dt_set_cell(&cells, 1, GUEST_PHANDLE_IOMMU);
> +        dt_set_cell(&cells, 1, 0x0);
> +        dt_set_cell(&cells, 1, 0x10000);
> +        res = fdt_property(fdt, "iommu-map", reg, 4 * sizeof(*reg));
> +        if ( res )
> +            return res;
> +    }
> +
> +    res = fdt_property_cell(fdt, "linux,pci-domain", 1);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_end_node(fdt);
> +    if ( res )
> +        return res;
> +
> +    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
> +    {
> +        snprintf(buf, sizeof(buf), "xen_iommu");

I don't think underscores are allowed in device tree node names



> +        res = fdt_begin_node(fdt, buf);
> +        if ( res )
> +            return res;
> +
> +        res = fdt_property_string(fdt, "compatible", "xen,grant-dma");
> +        if ( res )
> +            return res;
> +
> +        res = fdt_property_cell(fdt, "#iommu-cells", 1);
> +        if ( res )
> +            return res;
> +
> +        res = fdt_property_cell(fdt, "phandle", GUEST_PHANDLE_IOMMU);
> +        if ( res )
> +            return res;
> +
> +        res = fdt_end_node(fdt);
> +    }
> +
> +    return res;
> +}
> +
>  /*
>   * The max size for DT is 2MB. However, the generated DT is small (not including
>   * domU passthrough DT nodes whose size we account separately), 4KB are enough
> @@ -693,6 +876,13 @@ static int __init prepare_dtb_domU(struct domain *d, struct kernel_info *kinfo)
>              goto err;
>      }
>  
> +    if ( kinfo->virtio_pci.mode )
> +    {
> +        ret = make_virtio_pci_domU_node(kinfo);
> +        if ( ret )
> +            goto err;
> +    }
> +
>      ret = fdt_end_node(kinfo->fdt);
>      if ( ret < 0 )
>          goto err;
> @@ -744,11 +934,24 @@ static int __init alloc_xenstore_evtchn(struct domain *d)
>      return 0;
>  }
>  
> +static u64 combine_u64(u32 v[2])

Let's make this a static inline or a macro. I can't believe we don't
have one already.


> +{
> +    u64 v64;
> +
> +    v64 = v[0];
> +    v64 <<= 32;
> +    v64 |= v[1];
> +    return v64;
> +}
> +
>  static int __init construct_domU(struct domain *d,
>                                   const struct dt_device_node *node)
>  {
>      struct kernel_info kinfo = KERNEL_INFO_INIT;
>      const char *dom0less_enhanced;
> +    const char *virtio_pci;
> +    /* virtio-pci ECAM, MEM, PF-MEM each carrying 2 x Address cells.  */
> +    u32 virtio_pci_ranges[3 * 2];
>      int rc;
>      u64 mem;
>      u32 p2m_mem_mb;
> @@ -779,6 +982,41 @@ static int __init construct_domU(struct domain *d,
>  
>      kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
>  
> +    rc = dt_property_read_string(node, "virtio-pci", &virtio_pci);
> +    if ( !rc )
> +    {
> +        if ( !strcmp(virtio_pci, "enabled") )
> +            kinfo.virtio_pci.mode = VIRTIO_PCI;
> +        else if ( !strcmp(virtio_pci, "grants") )
> +            kinfo.virtio_pci.mode = VIRTIO_PCI_GRANTS;
> +        else
> +        {
> +            printk("Invalid \"virtio-pci\" property value (%s)\n", virtio_pci);
> +            return -EINVAL;
> +        }
> +    }
> +    else if ( rc == -ENODATA )
> +    {
> +        /* Handle missing property value */
> +        kinfo.virtio_pci.mode = dt_property_read_bool(node, "virtio-pci");
> +    }
> +
> +    if ( kinfo.virtio_pci.mode != VIRTIO_PCI_NONE )
> +    {
> +        rc = dt_property_read_u32_array(node, "virtio-pci-ranges",
> +                                        virtio_pci_ranges,
> +                                        ARRAY_SIZE(virtio_pci_ranges));
> +        if ( rc == 0 ) {
> +            kinfo.virtio_pci.ecam.base = combine_u64(&virtio_pci_ranges[0]);
> +            kinfo.virtio_pci.mem.base = combine_u64(&virtio_pci_ranges[2]);
> +            kinfo.virtio_pci.pf_mem.base = combine_u64(&virtio_pci_ranges[4]);
> +        } else {
> +            kinfo.virtio_pci.ecam.base = GUEST_VIRTIO_PCI_ECAM_BASE;
> +            kinfo.virtio_pci.mem.base = GUEST_VIRTIO_PCI_MEM_BASE;
> +            kinfo.virtio_pci.pf_mem.base = GUEST_VIRTIO_PCI_PREFETCH_MEM_BASE;
> +        }
> +    }
> +
>      rc = dt_property_read_string(node, "xen,enhanced", &dom0less_enhanced);
>      if ( rc == -EILSEQ ||
>           rc == -ENODATA ||
> diff --git a/xen/arch/arm/include/asm/kernel.h b/xen/arch/arm/include/asm/kernel.h
> index 7e6e3c82a4..2dab2ac88f 100644
> --- a/xen/arch/arm/include/asm/kernel.h
> +++ b/xen/arch/arm/include/asm/kernel.h
> @@ -29,6 +29,13 @@
>  #define DOM0LESS_XENSTORE        BIT(1, U)
>  #define DOM0LESS_ENHANCED        (DOM0LESS_ENHANCED_NO_XS | DOM0LESS_XENSTORE)
>  
> +/* virtio-pci types */
> +enum virtio_pci_type {
> +    VIRTIO_PCI_NONE,
> +    VIRTIO_PCI,
> +    VIRTIO_PCI_GRANTS,
> +};
> +
>  struct kernel_info {
>  #ifdef CONFIG_ARM_64
>      enum domain_type type;
> @@ -62,6 +69,14 @@ struct kernel_info {
>      /* Enable/Disable PV drivers interfaces */
>      uint16_t dom0less_feature;
>  
> +    struct {
> +        enum virtio_pci_type mode;
> +        struct {
> +            u64 base;
> +        } ecam, mem, pf_mem;
> +        u32 pci_intx_irq_base;
> +    } virtio_pci;
> +
>      /* GIC phandle */
>      uint32_t phandle_gic;
>  
> -- 
> 2.43.0
>
Stewart Hildebrand Sept. 25, 2024, 2:48 a.m. UTC | #2
On 9/24/24 12:23, Edgar E. Iglesias wrote:
> diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
> index 09b65e44ae..dab24fa9e2 100644
> --- a/xen/arch/arm/dom0less-build.c
> +++ b/xen/arch/arm/dom0less-build.c
> @@ -586,6 +586,189 @@ static int __init domain_handle_dtb_bootmodule(struct domain *d,
>      return res;
>  }
>  
> +static int __init make_virtio_pci_domU_node(const struct kernel_info *kinfo)
> +{
> +    void *fdt = kinfo->fdt;
> +    /* reg is sized to be used for all the needed properties below */
> +    __be32 reg[(1 + (GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS)
> +               * 2];
> +    __be32 irq_map[4 * 4 * 8];
> +    __be32 *cells;
> +    char buf[22]; /* pcie@ + max 16 char address + '\0' */
> +    int res;
> +    int devfn, intx_pin;
> +    static const char compat[] = "pci-host-ecam-generic";
> +    static const char reg_names[] = "ecam";
> +
> +    if ( p2m_ipa_bits <= 40 ) {
> +        printk("PA bits %d is too small!\nvirtio-pci is only supported "
> +               "on platforms with PA bits > 40\n", p2m_ipa_bits);
> +        return -EINVAL;
> +    }
> +
> +    snprintf(buf, sizeof(buf), "pcie@%lx", kinfo->virtio_pci.ecam.base);
> +    dt_dprintk("Create virtio-pci node\n");
> +    res = fdt_begin_node(fdt, buf);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property(fdt, "compatible", compat, sizeof(compat));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_string(fdt, "device_type", "pci");
> +    if ( res )
> +        return res;
> +
> +    /* Create reg property */
> +    cells = &reg[0];
> +    dt_child_set_range(&cells, GUEST_ROOT_ADDRESS_CELLS, GUEST_ROOT_SIZE_CELLS,
> +                       kinfo->virtio_pci.ecam.base, GUEST_VIRTIO_PCI_ECAM_SIZE);
> +
> +    res = fdt_property(fdt, "reg", reg,
> +                       (GUEST_ROOT_ADDRESS_CELLS +
> +                        GUEST_ROOT_SIZE_CELLS) * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property(fdt, "reg-names", reg_names, sizeof(reg_names));
> +    if ( res )
> +        return res;
> +
> +    /* Create bus-range property */
> +    cells = &reg[0];
> +    dt_set_cell(&cells, 1, 0);
> +    dt_set_cell(&cells, 1, 0xff);
> +    res = fdt_property(fdt, "bus-range", reg, 2 * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_cell(fdt, "#address-cells", 3);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_cell(fdt, "#size-cells", 2);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_string(fdt, "status", "okay");
> +    if ( res )
> +        return res;
> +
> +    /*
> +     * Create ranges property as:
> +     * <(PCI bitfield) (PCI address) (CPU address) (Size)>
> +     */
> +    cells = &reg[0];
> +    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_MEM_TYPE);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_MEM_SIZE);
> +    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_PREFETCH_MEM_TYPE);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
> +    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_PREFETCH_MEM_SIZE);
> +    res = fdt_property(fdt, "ranges", reg, 14 * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property(fdt, "dma-coherent", "", 0);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_property_cell(fdt, "#interrupt-cells", 1);
> +    if ( res )
> +        return res;
> +
> +    /*
> +     * PCI-to-PCI bridge specification
> +     * 9.1: Interrupt routing. Table 9-1
> +     *
> +     * the PCI Express Base Specification, Revision 2.1
> +     * 2.2.8.1: INTx interrupt signaling - Rules
> +     *          the Implementation Note
> +     *          Table 2-20
> +     */
> +    cells = &irq_map[0];
> +    for (devfn = 0; devfn <= 0x18; devfn += 8) {
> +        for (intx_pin = 0; intx_pin < 4; intx_pin++) {
> +            int irq = GUEST_VIRTIO_PCI_SPI_FIRST - 32;
> +            irq += ((intx_pin + PCI_SLOT(devfn)) % 4);
> +
> +            dt_set_cell(&cells, 1, devfn << 8);
> +            dt_set_cell(&cells, 1, 0);
> +            dt_set_cell(&cells, 1, 0);
> +            dt_set_cell(&cells, 1, intx_pin + 1);
> +            dt_set_cell(&cells, 1, kinfo->phandle_gic);

Here we will also want a parent unit address (GIC unit address). The
number of cells is determined by the vGIC node's #address-cells. See
section 2.4.3 in [1].

Also take a look at EPAM's libxl implementation, in the function
create_virtio_pci_irq_map() [2].

[1] https://github.com/devicetree-org/devicetree-specification/releases/download/v0.4/devicetree-specification-v0.4.pdf
[2] https://lore.kernel.org/xen-devel/20231115112611.3865905-4-Sergiy_Kibrik@epam.com/

> +            /* 3 GIC cells.  */
> +            dt_set_cell(&cells, 1, 0);
> +            dt_set_cell(&cells, 1, irq);
> +            dt_set_cell(&cells, 1, DT_IRQ_TYPE_LEVEL_HIGH);
> +        }
> +    }
> +
> +    /* Assert we've sized irq_map correctly.  */
> +    BUG_ON(cells - &irq_map[0] != ARRAY_SIZE(irq_map));
> +
> +    res = fdt_property(fdt, "interrupt-map", irq_map, sizeof(irq_map));
> +    if ( res )
> +        return res;
> +
> +    cells = &reg[0];
> +    dt_set_cell(&cells, 1, cpu_to_be16(PCI_DEVFN(3, 0)));
> +    dt_set_cell(&cells, 1, 0x0);
> +    dt_set_cell(&cells, 1, 0x0);
> +    dt_set_cell(&cells, 1, 0x7);
> +    res = fdt_property(fdt, "interrupt-map-mask", reg, 4 * sizeof(*reg));
> +    if ( res )
> +        return res;
> +
> +    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
> +    {
> +        cells = &reg[0];
> +        dt_set_cell(&cells, 1, 0x0);
> +        dt_set_cell(&cells, 1, GUEST_PHANDLE_IOMMU);
> +        dt_set_cell(&cells, 1, 0x0);
> +        dt_set_cell(&cells, 1, 0x10000);
> +        res = fdt_property(fdt, "iommu-map", reg, 4 * sizeof(*reg));
> +        if ( res )
> +            return res;
> +    }
> +
> +    res = fdt_property_cell(fdt, "linux,pci-domain", 1);
> +    if ( res )
> +        return res;
> +
> +    res = fdt_end_node(fdt);
> +    if ( res )
> +        return res;
> +
> +    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
> +    {
> +        snprintf(buf, sizeof(buf), "xen_iommu");
> +
> +        res = fdt_begin_node(fdt, buf);
> +        if ( res )
> +            return res;
> +
> +        res = fdt_property_string(fdt, "compatible", "xen,grant-dma");
> +        if ( res )
> +            return res;
> +
> +        res = fdt_property_cell(fdt, "#iommu-cells", 1);
> +        if ( res )
> +            return res;
> +
> +        res = fdt_property_cell(fdt, "phandle", GUEST_PHANDLE_IOMMU);
> +        if ( res )
> +            return res;
> +
> +        res = fdt_end_node(fdt);
> +    }
> +
> +    return res;
> +}
Julien Grall Sept. 25, 2024, 7:44 a.m. UTC | #3
Hi,

On 24/09/2024 17:23, Edgar E. Iglesias wrote:
> From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> 
> When virtio-pci is specified in the dom0less domU properties, create a
> virtio-pci node in the guest's device tree. Set up an mmio handler with
> a register for the guest to poll when the backend has connected and
> virtio-pci bus is ready to be probed. Grant tables may be used by
> specifying virtio-pci = "grants";.
> 
> [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
>   Make grants iommu-map cover the entire PCI bus.
>   Add virtio-pci-ranges to specify memory-map for direct-mapped guests.
>   Document virtio-pci dom0less fdt bindings.]
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> ---
>   docs/misc/arm/device-tree/booting.txt |  21 +++
>   xen/arch/arm/dom0less-build.c         | 238 ++++++++++++++++++++++++++
>   xen/arch/arm/include/asm/kernel.h     |  15 ++
>   3 files changed, 274 insertions(+)
> 
> diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
> index 3a04f5c57f..82f3bd7026 100644
> --- a/docs/misc/arm/device-tree/booting.txt
> +++ b/docs/misc/arm/device-tree/booting.txt
> @@ -276,6 +276,27 @@ with the following properties:
>       passed through. This option is the default if this property is missing
>       and the user does not provide the device partial device tree for the domain.
>   
> +- virtio-pci

Similar question to the other patches, why is this specific to virtio 
PCI? QEMU (or another device module) is free to emulate whatever it 
wants behind the PCI hosbtridge.

> +
> +    A string property specifying whether virtio-pci is enabled for the
> +    domain and if grant table mappings should be used. If no value is set
> +    this property is treated as a boolean and the same way as if set to
> +    "enabled".
> +    Possible property values are:
> +
> +    - "enabled"
> +    Virtio-pci is enabled for the domain.
> +
> +    - "grants"
> +    Virtio-pci is enabled for the domain and an grants IOMMU node will be
> +    generated in the domains device-tree.
> +
> +- virtio-pci-ranges
> +
> +    An optional array of 6 u32 values specifying the 2 cells base addresses of
> +    the ECAM, Memory and Prefetchable-Memory regions for virtio-pci. This is
> +    useful to avoid memory-map collisions when using direct-mapped guests.
> +
>   Under the "xen,domain" compatible node, one or more sub-nodes are present
>   for the DomU kernel and ramdisk.
>   
> diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
> index 09b65e44ae..dab24fa9e2 100644
> --- a/xen/arch/arm/dom0less-build.c
> +++ b/xen/arch/arm/dom0less-build.c
> @@ -586,6 +586,189 @@ static int __init domain_handle_dtb_bootmodule(struct domain *d,
>       return res;
>   }
>   
> +static int __init make_virtio_pci_domU_node(const struct kernel_info *kinfo)
> +{
> +    void *fdt = kinfo->fdt;
> +    /* reg is sized to be used for all the needed properties below */
> +    __be32 reg[(1 + (GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS)
> +               * 2];
> +    __be32 irq_map[4 * 4 * 8];
> +    __be32 *cells;
> +    char buf[22]; /* pcie@ + max 16 char address + '\0' */
> +    int res;
> +    int devfn, intx_pin;
> +    static const char compat[] = "pci-host-ecam-generic";
> +    static const char reg_names[] = "ecam";
> +
> +    if ( p2m_ipa_bits <= 40 ) {
> +        printk("PA bits %d is too small!\nvirtio-pci is only supported "
> +               "on platforms with PA bits > 40\n", p2m_ipa_bits);
> +        return -EINVAL;
> +    }

Please add a comment explaining where does this requires come from. If 
this is the Address layout, then probably be to avoid relying on 
hardcoded number of bits.

Cheers,
Stewart Hildebrand Sept. 25, 2024, 1:03 p.m. UTC | #4
On 9/24/24 19:55, Stefano Stabellini wrote:
> On Tue, 24 Sep 2024, Edgar E. Iglesias wrote:
>> diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
>> index 09b65e44ae..dab24fa9e2 100644
>> --- a/xen/arch/arm/dom0less-build.c
>> +++ b/xen/arch/arm/dom0less-build.c
>> @@ -744,11 +934,24 @@ static int __init alloc_xenstore_evtchn(struct domain *d)
>>      return 0;
>>  }
>>  
>> +static u64 combine_u64(u32 v[2])
> 
> Let's make this a static inline or a macro. I can't believe we don't
> have one already.

We have dt_read_number(). You'd need to use it with dt_get_property().
In this case, it might look something like:

__be32 *val = dt_get_property(node, "virtio-pci-ranges", &len);

/* TODO: check len */

...ecam.base = dt_read_number(&val[0], 2);
...mem.base = dt_read_number(&val[2], 2);
...pf_mem.base = dt_read_number(&val[4], 2);

>> +{
>> +    u64 v64;
>> +
>> +    v64 = v[0];
>> +    v64 <<= 32;
>> +    v64 |= v[1];
>> +    return v64;
>> +}
>> +
>>  static int __init construct_domU(struct domain *d,
>>                                   const struct dt_device_node *node)
>>  {
>>      struct kernel_info kinfo = KERNEL_INFO_INIT;
>>      const char *dom0less_enhanced;
>> +    const char *virtio_pci;
>> +    /* virtio-pci ECAM, MEM, PF-MEM each carrying 2 x Address cells.  */
>> +    u32 virtio_pci_ranges[3 * 2];
>>      int rc;
>>      u64 mem;
>>      u32 p2m_mem_mb;
>> @@ -779,6 +982,41 @@ static int __init construct_domU(struct domain *d,
>>  
>>      kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
>>  
>> +    rc = dt_property_read_string(node, "virtio-pci", &virtio_pci);
>> +    if ( !rc )
>> +    {
>> +        if ( !strcmp(virtio_pci, "enabled") )
>> +            kinfo.virtio_pci.mode = VIRTIO_PCI;
>> +        else if ( !strcmp(virtio_pci, "grants") )
>> +            kinfo.virtio_pci.mode = VIRTIO_PCI_GRANTS;
>> +        else
>> +        {
>> +            printk("Invalid \"virtio-pci\" property value (%s)\n", virtio_pci);
>> +            return -EINVAL;
>> +        }
>> +    }
>> +    else if ( rc == -ENODATA )
>> +    {
>> +        /* Handle missing property value */
>> +        kinfo.virtio_pci.mode = dt_property_read_bool(node, "virtio-pci");
>> +    }
>> +
>> +    if ( kinfo.virtio_pci.mode != VIRTIO_PCI_NONE )
>> +    {
>> +        rc = dt_property_read_u32_array(node, "virtio-pci-ranges",
>> +                                        virtio_pci_ranges,
>> +                                        ARRAY_SIZE(virtio_pci_ranges));
>> +        if ( rc == 0 ) {
>> +            kinfo.virtio_pci.ecam.base = combine_u64(&virtio_pci_ranges[0]);
>> +            kinfo.virtio_pci.mem.base = combine_u64(&virtio_pci_ranges[2]);
>> +            kinfo.virtio_pci.pf_mem.base = combine_u64(&virtio_pci_ranges[4]);
>> +        } else {
>> +            kinfo.virtio_pci.ecam.base = GUEST_VIRTIO_PCI_ECAM_BASE;
>> +            kinfo.virtio_pci.mem.base = GUEST_VIRTIO_PCI_MEM_BASE;
>> +            kinfo.virtio_pci.pf_mem.base = GUEST_VIRTIO_PCI_PREFETCH_MEM_BASE;
>> +        }
>> +    }
>> +
>>      rc = dt_property_read_string(node, "xen,enhanced", &dom0less_enhanced);
>>      if ( rc == -EILSEQ ||
>>           rc == -ENODATA ||
Edgar E. Iglesias Sept. 25, 2024, 4:34 p.m. UTC | #5
On Wed, Sep 25, 2024 at 08:44:41AM +0100, Julien Grall wrote:
> Hi,
> 

Hi Julien,


> On 24/09/2024 17:23, Edgar E. Iglesias wrote:
> > From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > 
> > When virtio-pci is specified in the dom0less domU properties, create a
> > virtio-pci node in the guest's device tree. Set up an mmio handler with
> > a register for the guest to poll when the backend has connected and
> > virtio-pci bus is ready to be probed. Grant tables may be used by
> > specifying virtio-pci = "grants";.
> > 
> > [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
> >   Make grants iommu-map cover the entire PCI bus.
> >   Add virtio-pci-ranges to specify memory-map for direct-mapped guests.
> >   Document virtio-pci dom0less fdt bindings.]
> > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> > ---
> >   docs/misc/arm/device-tree/booting.txt |  21 +++
> >   xen/arch/arm/dom0less-build.c         | 238 ++++++++++++++++++++++++++
> >   xen/arch/arm/include/asm/kernel.h     |  15 ++
> >   3 files changed, 274 insertions(+)
> > 
> > diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
> > index 3a04f5c57f..82f3bd7026 100644
> > --- a/docs/misc/arm/device-tree/booting.txt
> > +++ b/docs/misc/arm/device-tree/booting.txt
> > @@ -276,6 +276,27 @@ with the following properties:
> >       passed through. This option is the default if this property is missing
> >       and the user does not provide the device partial device tree for the domain.
> > +- virtio-pci
> 
> Similar question to the other patches, why is this specific to virtio PCI?
> QEMU (or another device module) is free to emulate whatever it wants behind
> the PCI hosbtridge.

There's no hard limitatino to only virtio-pci devices it's more of a
recommendation that PVH guests should not use "emulated" devices but
there's nothing stopping it.

Perhaps the names of these properties are missleading, I'm happy to
change them if there are better suggestions!

I can also clarify it in the documentation.


> 
> > +
> > +    A string property specifying whether virtio-pci is enabled for the
> > +    domain and if grant table mappings should be used. If no value is set
> > +    this property is treated as a boolean and the same way as if set to
> > +    "enabled".
> > +    Possible property values are:
> > +
> > +    - "enabled"
> > +    Virtio-pci is enabled for the domain.
> > +
> > +    - "grants"
> > +    Virtio-pci is enabled for the domain and an grants IOMMU node will be
> > +    generated in the domains device-tree.
> > +
> > +- virtio-pci-ranges
> > +
> > +    An optional array of 6 u32 values specifying the 2 cells base addresses of
> > +    the ECAM, Memory and Prefetchable-Memory regions for virtio-pci. This is
> > +    useful to avoid memory-map collisions when using direct-mapped guests.
> > +
> >   Under the "xen,domain" compatible node, one or more sub-nodes are present
> >   for the DomU kernel and ramdisk.
> > diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
> > index 09b65e44ae..dab24fa9e2 100644
> > --- a/xen/arch/arm/dom0less-build.c
> > +++ b/xen/arch/arm/dom0less-build.c
> > @@ -586,6 +586,189 @@ static int __init domain_handle_dtb_bootmodule(struct domain *d,
> >       return res;
> >   }
> > +static int __init make_virtio_pci_domU_node(const struct kernel_info *kinfo)
> > +{
> > +    void *fdt = kinfo->fdt;
> > +    /* reg is sized to be used for all the needed properties below */
> > +    __be32 reg[(1 + (GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS)
> > +               * 2];
> > +    __be32 irq_map[4 * 4 * 8];
> > +    __be32 *cells;
> > +    char buf[22]; /* pcie@ + max 16 char address + '\0' */
> > +    int res;
> > +    int devfn, intx_pin;
> > +    static const char compat[] = "pci-host-ecam-generic";
> > +    static const char reg_names[] = "ecam";
> > +
> > +    if ( p2m_ipa_bits <= 40 ) {
> > +        printk("PA bits %d is too small!\nvirtio-pci is only supported "
> > +               "on platforms with PA bits > 40\n", p2m_ipa_bits);
> > +        return -EINVAL;
> > +    }
> 
> Please add a comment explaining where does this requires come from. If this
> is the Address layout, then probably be to avoid relying on hardcoded number
> of bits.
>

Oops, sorry, I had actually removed this part from my git branch but I
forgot to regenerated the patch series.

I'll remove it.

Best regards,
Edgar
Julien Grall Sept. 25, 2024, 4:38 p.m. UTC | #6
Hi Edgar,

On 25/09/2024 17:34, Edgar E. Iglesias wrote:
> On Wed, Sep 25, 2024 at 08:44:41AM +0100, Julien Grall wrote:
>> Hi,
>> On 24/09/2024 17:23, Edgar E. Iglesias wrote:
>>> From: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>
>>> When virtio-pci is specified in the dom0less domU properties, create a
>>> virtio-pci node in the guest's device tree. Set up an mmio handler with
>>> a register for the guest to poll when the backend has connected and
>>> virtio-pci bus is ready to be probed. Grant tables may be used by
>>> specifying virtio-pci = "grants";.
>>>
>>> [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
>>>    Make grants iommu-map cover the entire PCI bus.
>>>    Add virtio-pci-ranges to specify memory-map for direct-mapped guests.
>>>    Document virtio-pci dom0less fdt bindings.]
>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>> Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
>>> ---
>>>    docs/misc/arm/device-tree/booting.txt |  21 +++
>>>    xen/arch/arm/dom0less-build.c         | 238 ++++++++++++++++++++++++++
>>>    xen/arch/arm/include/asm/kernel.h     |  15 ++
>>>    3 files changed, 274 insertions(+)
>>>
>>> diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
>>> index 3a04f5c57f..82f3bd7026 100644
>>> --- a/docs/misc/arm/device-tree/booting.txt
>>> +++ b/docs/misc/arm/device-tree/booting.txt
>>> @@ -276,6 +276,27 @@ with the following properties:
>>>        passed through. This option is the default if this property is missing
>>>        and the user does not provide the device partial device tree for the domain.
>>> +- virtio-pci
>>
>> Similar question to the other patches, why is this specific to virtio PCI?
>> QEMU (or another device module) is free to emulate whatever it wants behind
>> the PCI hosbtridge.
> 
> There's no hard limitatino to only virtio-pci devices it's more of a
> recommendation that PVH guests should not use "emulated" devices but
> there's nothing stopping it.

Could you provide a bit more details where this requirement is coming 
from? For instance, I would expect we would need to do some emulation to 
boot Windows on Arm.

Cheers,
Edgar E. Iglesias Sept. 25, 2024, 4:44 p.m. UTC | #7
On Wed, Sep 25, 2024 at 05:38:13PM +0100, Julien Grall wrote:
> Hi Edgar,
> 
> On 25/09/2024 17:34, Edgar E. Iglesias wrote:
> > On Wed, Sep 25, 2024 at 08:44:41AM +0100, Julien Grall wrote:
> > > Hi,
> > > On 24/09/2024 17:23, Edgar E. Iglesias wrote:
> > > > From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > 
> > > > When virtio-pci is specified in the dom0less domU properties, create a
> > > > virtio-pci node in the guest's device tree. Set up an mmio handler with
> > > > a register for the guest to poll when the backend has connected and
> > > > virtio-pci bus is ready to be probed. Grant tables may be used by
> > > > specifying virtio-pci = "grants";.
> > > > 
> > > > [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
> > > >    Make grants iommu-map cover the entire PCI bus.
> > > >    Add virtio-pci-ranges to specify memory-map for direct-mapped guests.
> > > >    Document virtio-pci dom0less fdt bindings.]
> > > > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> > > > ---
> > > >    docs/misc/arm/device-tree/booting.txt |  21 +++
> > > >    xen/arch/arm/dom0less-build.c         | 238 ++++++++++++++++++++++++++
> > > >    xen/arch/arm/include/asm/kernel.h     |  15 ++
> > > >    3 files changed, 274 insertions(+)
> > > > 
> > > > diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
> > > > index 3a04f5c57f..82f3bd7026 100644
> > > > --- a/docs/misc/arm/device-tree/booting.txt
> > > > +++ b/docs/misc/arm/device-tree/booting.txt
> > > > @@ -276,6 +276,27 @@ with the following properties:
> > > >        passed through. This option is the default if this property is missing
> > > >        and the user does not provide the device partial device tree for the domain.
> > > > +- virtio-pci
> > > 
> > > Similar question to the other patches, why is this specific to virtio PCI?
> > > QEMU (or another device module) is free to emulate whatever it wants behind
> > > the PCI hosbtridge.
> > 
> > There's no hard limitatino to only virtio-pci devices it's more of a
> > recommendation that PVH guests should not use "emulated" devices but
> > there's nothing stopping it.
> 
> Could you provide a bit more details where this requirement is coming from?
> For instance, I would expect we would need to do some emulation to boot
> Windows on Arm.
>

I see. I guess it just came from my mental model, I thought part of the
philosophy behind PVH was to avoid emulated devices and use
paravirualized (virtio or something else) or passthrough whereever
possible (except for the basic set of devices needed like vGIC, vuart,
MMU).

Cheers,
Edgar
Edgar E. Iglesias Sept. 25, 2024, 4:49 p.m. UTC | #8
On Wed, Sep 25, 2024 at 10:44 AM Edgar E. Iglesias <edgar.iglesias@amd.com>
wrote:

> On Wed, Sep 25, 2024 at 05:38:13PM +0100, Julien Grall wrote:
> > Hi Edgar,
> >
> > On 25/09/2024 17:34, Edgar E. Iglesias wrote:
> > > On Wed, Sep 25, 2024 at 08:44:41AM +0100, Julien Grall wrote:
> > > > Hi,
> > > > On 24/09/2024 17:23, Edgar E. Iglesias wrote:
> > > > > From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > >
> > > > > When virtio-pci is specified in the dom0less domU properties,
> create a
> > > > > virtio-pci node in the guest's device tree. Set up an mmio handler
> with
> > > > > a register for the guest to poll when the backend has connected and
> > > > > virtio-pci bus is ready to be probed. Grant tables may be used by
> > > > > specifying virtio-pci = "grants";.
> > > > >
> > > > > [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
> > > > >    Make grants iommu-map cover the entire PCI bus.
> > > > >    Add virtio-pci-ranges to specify memory-map for direct-mapped
> guests.
> > > > >    Document virtio-pci dom0less fdt bindings.]
> > > > > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > > Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> > > > > ---
> > > > >    docs/misc/arm/device-tree/booting.txt |  21 +++
> > > > >    xen/arch/arm/dom0less-build.c         | 238
> ++++++++++++++++++++++++++
> > > > >    xen/arch/arm/include/asm/kernel.h     |  15 ++
> > > > >    3 files changed, 274 insertions(+)
> > > > >
> > > > > diff --git a/docs/misc/arm/device-tree/booting.txt
> b/docs/misc/arm/device-tree/booting.txt
> > > > > index 3a04f5c57f..82f3bd7026 100644
> > > > > --- a/docs/misc/arm/device-tree/booting.txt
> > > > > +++ b/docs/misc/arm/device-tree/booting.txt
> > > > > @@ -276,6 +276,27 @@ with the following properties:
> > > > >        passed through. This option is the default if this property
> is missing
> > > > >        and the user does not provide the device partial device
> tree for the domain.
> > > > > +- virtio-pci
> > > >
> > > > Similar question to the other patches, why is this specific to
> virtio PCI?
> > > > QEMU (or another device module) is free to emulate whatever it wants
> behind
> > > > the PCI hosbtridge.
> > >
> > > There's no hard limitatino to only virtio-pci devices it's more of a
> > > recommendation that PVH guests should not use "emulated" devices but
> > > there's nothing stopping it.
> >
> > Could you provide a bit more details where this requirement is coming
> from?
> > For instance, I would expect we would need to do some emulation to boot
> > Windows on Arm.
> >
>
> I see. I guess it just came from my mental model, I thought part of the
> philosophy behind PVH was to avoid emulated devices and use
> paravirualized (virtio or something else) or passthrough whereever
> possible (except for the basic set of devices needed like vGIC, vuart,
> MMU).
>

For  example, we would recommend users to use virtio-net in favor of an
emulated eepro1000 or whatever other NIC models available in QEMU.
But there is no hard requirement nor limitation, a user can connect any
available PCI device from the QEMU set.

Another thing we're looking to do is to minimize the QEMU build (Kconfig +
configure flags) to create a small build with only the stuff needed for
virtio-pci.

Best regards,
Edgar
Julien Grall Sept. 25, 2024, 5:45 p.m. UTC | #9
Hi Edgar,

On 25/09/2024 17:49, Edgar E. Iglesias wrote:
> On Wed, Sep 25, 2024 at 10:44 AM Edgar E. Iglesias <edgar.iglesias@amd.com>
> wrote:
> 
>> On Wed, Sep 25, 2024 at 05:38:13PM +0100, Julien Grall wrote:
>>> Hi Edgar,
>>>
>>> On 25/09/2024 17:34, Edgar E. Iglesias wrote:
>>>> On Wed, Sep 25, 2024 at 08:44:41AM +0100, Julien Grall wrote:
>>>>> Hi,
>>>>> On 24/09/2024 17:23, Edgar E. Iglesias wrote:
>>>>>> From: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>>>
>>>>>> When virtio-pci is specified in the dom0less domU properties,
>> create a
>>>>>> virtio-pci node in the guest's device tree. Set up an mmio handler
>> with
>>>>>> a register for the guest to poll when the backend has connected and
>>>>>> virtio-pci bus is ready to be probed. Grant tables may be used by
>>>>>> specifying virtio-pci = "grants";.
>>>>>>
>>>>>> [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
>>>>>>     Make grants iommu-map cover the entire PCI bus.
>>>>>>     Add virtio-pci-ranges to specify memory-map for direct-mapped
>> guests.
>>>>>>     Document virtio-pci dom0less fdt bindings.]
>>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>>> Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
>>>>>> ---
>>>>>>     docs/misc/arm/device-tree/booting.txt |  21 +++
>>>>>>     xen/arch/arm/dom0less-build.c         | 238
>> ++++++++++++++++++++++++++
>>>>>>     xen/arch/arm/include/asm/kernel.h     |  15 ++
>>>>>>     3 files changed, 274 insertions(+)
>>>>>>
>>>>>> diff --git a/docs/misc/arm/device-tree/booting.txt
>> b/docs/misc/arm/device-tree/booting.txt
>>>>>> index 3a04f5c57f..82f3bd7026 100644
>>>>>> --- a/docs/misc/arm/device-tree/booting.txt
>>>>>> +++ b/docs/misc/arm/device-tree/booting.txt
>>>>>> @@ -276,6 +276,27 @@ with the following properties:
>>>>>>         passed through. This option is the default if this property
>> is missing
>>>>>>         and the user does not provide the device partial device
>> tree for the domain.
>>>>>> +- virtio-pci
>>>>>
>>>>> Similar question to the other patches, why is this specific to
>> virtio PCI?
>>>>> QEMU (or another device module) is free to emulate whatever it wants
>> behind
>>>>> the PCI hosbtridge.
>>>>
>>>> There's no hard limitatino to only virtio-pci devices it's more of a
>>>> recommendation that PVH guests should not use "emulated" devices but
>>>> there's nothing stopping it.
>>>
>>> Could you provide a bit more details where this requirement is coming
>> from?
>>> For instance, I would expect we would need to do some emulation to boot
>>> Windows on Arm.
>>>
>>
>> I see. I guess it just came from my mental model, I thought part of the
>> philosophy behind PVH was to avoid emulated devices and use
>> paravirualized (virtio or something else) or passthrough whereever
>> possible (except for the basic set of devices needed like vGIC, vuart,
>> MMU).
>>
> 
> For  example, we would recommend users to use virtio-net in favor of an
> emulated eepro1000 or whatever other NIC models available in QEMU.

Indeed. I would always recommend user to use virtio-net over eepro1000.

> But there is no hard requirement nor limitation, a user can connect any
> available PCI device from the QEMU set.

We need to be clear about what we are exposing to the guest. With this 
patch we will describe a PCI hostbridge in Device Tree (well it is an 
empty region we hope the Device Model to emulate at some point). But the 
hypervisor will not create the device model. Instead, you expect the 
user/integrator to have extra script to launch a Device Model (So it may 
not even be a hostbridge).

>
> Another thing we're looking to do is to minimize the QEMU build (Kconfig +
> configure flags) to create a small build with only the stuff needed for
> virtio-pci.

It is nice to have a cut down version of QEMU :). However, Xen doesn't 
care about the device model used for the emulation. I have seen some 
specialized DM in the wild (and used them while I was working on 
disaggregating the DM).

Anyway, while I understand this approach works in tailored environment, 
I am not convinced this works for a more general approach. The two 
options I would rather consider are:
   1. Allow the device model to receive access for a single PCI device 
(IOW hook into vPCI).
   2. Find a way to let the user provide the binding (maybe in a partial 
device-tree) + the list of Interrupts/MMIO that would be emulated by QEMU.

The second approach might be another way to get a second hostbridge in 
your use case while giving a bit more flexibility in what can be done 
(thinking about disagreggated environment).

Cheers,
Edgar E. Iglesias Sept. 25, 2024, 6:42 p.m. UTC | #10
On Wed, Sep 25, 2024 at 06:45:19PM +0100, Julien Grall wrote:
> Hi Edgar,
> 
> On 25/09/2024 17:49, Edgar E. Iglesias wrote:
> > On Wed, Sep 25, 2024 at 10:44 AM Edgar E. Iglesias <edgar.iglesias@amd.com>
> > wrote:
> > 
> > > On Wed, Sep 25, 2024 at 05:38:13PM +0100, Julien Grall wrote:
> > > > Hi Edgar,
> > > > 
> > > > On 25/09/2024 17:34, Edgar E. Iglesias wrote:
> > > > > On Wed, Sep 25, 2024 at 08:44:41AM +0100, Julien Grall wrote:
> > > > > > Hi,
> > > > > > On 24/09/2024 17:23, Edgar E. Iglesias wrote:
> > > > > > > From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > > > > 
> > > > > > > When virtio-pci is specified in the dom0less domU properties,
> > > create a
> > > > > > > virtio-pci node in the guest's device tree. Set up an mmio handler
> > > with
> > > > > > > a register for the guest to poll when the backend has connected and
> > > > > > > virtio-pci bus is ready to be probed. Grant tables may be used by
> > > > > > > specifying virtio-pci = "grants";.
> > > > > > > 
> > > > > > > [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
> > > > > > >     Make grants iommu-map cover the entire PCI bus.
> > > > > > >     Add virtio-pci-ranges to specify memory-map for direct-mapped
> > > guests.
> > > > > > >     Document virtio-pci dom0less fdt bindings.]
> > > > > > > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > > > > Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> > > > > > > ---
> > > > > > >     docs/misc/arm/device-tree/booting.txt |  21 +++
> > > > > > >     xen/arch/arm/dom0less-build.c         | 238
> > > ++++++++++++++++++++++++++
> > > > > > >     xen/arch/arm/include/asm/kernel.h     |  15 ++
> > > > > > >     3 files changed, 274 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/docs/misc/arm/device-tree/booting.txt
> > > b/docs/misc/arm/device-tree/booting.txt
> > > > > > > index 3a04f5c57f..82f3bd7026 100644
> > > > > > > --- a/docs/misc/arm/device-tree/booting.txt
> > > > > > > +++ b/docs/misc/arm/device-tree/booting.txt
> > > > > > > @@ -276,6 +276,27 @@ with the following properties:
> > > > > > >         passed through. This option is the default if this property
> > > is missing
> > > > > > >         and the user does not provide the device partial device
> > > tree for the domain.
> > > > > > > +- virtio-pci
> > > > > > 
> > > > > > Similar question to the other patches, why is this specific to
> > > virtio PCI?
> > > > > > QEMU (or another device module) is free to emulate whatever it wants
> > > behind
> > > > > > the PCI hosbtridge.
> > > > > 
> > > > > There's no hard limitatino to only virtio-pci devices it's more of a
> > > > > recommendation that PVH guests should not use "emulated" devices but
> > > > > there's nothing stopping it.
> > > > 
> > > > Could you provide a bit more details where this requirement is coming
> > > from?
> > > > For instance, I would expect we would need to do some emulation to boot
> > > > Windows on Arm.
> > > > 
> > > 
> > > I see. I guess it just came from my mental model, I thought part of the
> > > philosophy behind PVH was to avoid emulated devices and use
> > > paravirualized (virtio or something else) or passthrough whereever
> > > possible (except for the basic set of devices needed like vGIC, vuart,
> > > MMU).
> > > 
> > 
> > For  example, we would recommend users to use virtio-net in favor of an
> > emulated eepro1000 or whatever other NIC models available in QEMU.
> 
> Indeed. I would always recommend user to use virtio-net over eepro1000.
> 
> > But there is no hard requirement nor limitation, a user can connect any
> > available PCI device from the QEMU set.
> 
> We need to be clear about what we are exposing to the guest. With this patch
> we will describe a PCI hostbridge in Device Tree (well it is an empty region
> we hope the Device Model to emulate at some point). But the hypervisor will
> not create the device model. Instead, you expect the user/integrator to have
> extra script to launch a Device Model (So it may not even be a hostbridge).

Right OK, I'll try make it clearer in the next versions allthough the
interface may change a bit if we go with your second suggestion below.

> 
> > 
> > Another thing we're looking to do is to minimize the QEMU build (Kconfig +
> > configure flags) to create a small build with only the stuff needed for
> > virtio-pci.
> 
> It is nice to have a cut down version of QEMU :). However, Xen doesn't care
> about the device model used for the emulation. I have seen some specialized
> DM in the wild (and used them while I was working on disaggregating the DM).
> 
> Anyway, while I understand this approach works in tailored environment, I am
> not convinced this works for a more general approach. The two options I
> would rather consider are:
>   1. Allow the device model to receive access for a single PCI device (IOW
> hook into vPCI).
>   2. Find a way to let the user provide the binding (maybe in a partial
> device-tree) + the list of Interrupts/MMIO that would be emulated by QEMU.
> 
> The second approach might be another way to get a second hostbridge in your
> use case while giving a bit more flexibility in what can be done (thinking
> about disagreggated environment).
>

Thanks, it makes sense and I like that it makes the FDT setup more explicit,
i.e user providing an FDT fragment rather than Xen implicitly generating
it behind the scene.

I'll wait for more comments but if folks don't disagree I will explore this
second option.

Cheers,
Edgar
Stefano Stabellini Sept. 25, 2024, 10:20 p.m. UTC | #11
On Wed, 25 Sep 2024, Julien Grall wrote:
> Hi Edgar,
> 
> On 25/09/2024 17:49, Edgar E. Iglesias wrote:
> > On Wed, Sep 25, 2024 at 10:44 AM Edgar E. Iglesias <edgar.iglesias@amd.com>
> > wrote:
> > 
> > > On Wed, Sep 25, 2024 at 05:38:13PM +0100, Julien Grall wrote:
> > > > Hi Edgar,
> > > > 
> > > > On 25/09/2024 17:34, Edgar E. Iglesias wrote:
> > > > > On Wed, Sep 25, 2024 at 08:44:41AM +0100, Julien Grall wrote:
> > > > > > Hi,
> > > > > > On 24/09/2024 17:23, Edgar E. Iglesias wrote:
> > > > > > > From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > > > > 
> > > > > > > When virtio-pci is specified in the dom0less domU properties,
> > > create a
> > > > > > > virtio-pci node in the guest's device tree. Set up an mmio handler
> > > with
> > > > > > > a register for the guest to poll when the backend has connected
> > > > > > > and
> > > > > > > virtio-pci bus is ready to be probed. Grant tables may be used by
> > > > > > > specifying virtio-pci = "grants";.
> > > > > > > 
> > > > > > > [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
> > > > > > >     Make grants iommu-map cover the entire PCI bus.
> > > > > > >     Add virtio-pci-ranges to specify memory-map for direct-mapped
> > > guests.
> > > > > > >     Document virtio-pci dom0less fdt bindings.]
> > > > > > > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > > > > Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> > > > > > > ---
> > > > > > >     docs/misc/arm/device-tree/booting.txt |  21 +++
> > > > > > >     xen/arch/arm/dom0less-build.c         | 238
> > > ++++++++++++++++++++++++++
> > > > > > >     xen/arch/arm/include/asm/kernel.h     |  15 ++
> > > > > > >     3 files changed, 274 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/docs/misc/arm/device-tree/booting.txt
> > > b/docs/misc/arm/device-tree/booting.txt
> > > > > > > index 3a04f5c57f..82f3bd7026 100644
> > > > > > > --- a/docs/misc/arm/device-tree/booting.txt
> > > > > > > +++ b/docs/misc/arm/device-tree/booting.txt
> > > > > > > @@ -276,6 +276,27 @@ with the following properties:
> > > > > > >         passed through. This option is the default if this
> > > > > > > property
> > > is missing
> > > > > > >         and the user does not provide the device partial device
> > > tree for the domain.
> > > > > > > +- virtio-pci
> > > > > > 
> > > > > > Similar question to the other patches, why is this specific to
> > > virtio PCI?
> > > > > > QEMU (or another device module) is free to emulate whatever it wants
> > > behind
> > > > > > the PCI hosbtridge.
> > > > > 
> > > > > There's no hard limitatino to only virtio-pci devices it's more of a
> > > > > recommendation that PVH guests should not use "emulated" devices but
> > > > > there's nothing stopping it.
> > > > 
> > > > Could you provide a bit more details where this requirement is coming
> > > from?
> > > > For instance, I would expect we would need to do some emulation to boot
> > > > Windows on Arm.
> > > > 
> > > 
> > > I see. I guess it just came from my mental model, I thought part of the
> > > philosophy behind PVH was to avoid emulated devices and use
> > > paravirualized (virtio or something else) or passthrough whereever
> > > possible (except for the basic set of devices needed like vGIC, vuart,
> > > MMU).
> > > 
> > 
> > For  example, we would recommend users to use virtio-net in favor of an
> > emulated eepro1000 or whatever other NIC models available in QEMU.
> 
> Indeed. I would always recommend user to use virtio-net over eepro1000.
> 
> > But there is no hard requirement nor limitation, a user can connect any
> > available PCI device from the QEMU set.
> 
> We need to be clear about what we are exposing to the guest. With this patch
> we will describe a PCI hostbridge in Device Tree (well it is an empty region
> we hope the Device Model to emulate at some point). But the hypervisor will
> not create the device model. Instead, you expect the user/integrator to have
> extra script to launch a Device Model (So it may not even be a hostbridge).
> 
> > 
> > Another thing we're looking to do is to minimize the QEMU build (Kconfig +
> > configure flags) to create a small build with only the stuff needed for
> > virtio-pci.
> 
> It is nice to have a cut down version of QEMU :). However, Xen doesn't care
> about the device model used for the emulation. I have seen some specialized DM
> in the wild (and used them while I was working on disaggregating the DM).
> 
> Anyway, while I understand this approach works in tailored environment, I am
> not convinced this works for a more general approach. The two options I would
> rather consider are:
>   1. Allow the device model to receive access for a single PCI device (IOW
> hook into vPCI).
>   2. Find a way to let the user provide the binding (maybe in a partial
> device-tree) + the list of Interrupts/MMIO that would be emulated by QEMU.
> 
> The second approach might be another way to get a second hostbridge in your
> use case while giving a bit more flexibility in what can be done (thinking
> about disagreggated environment).

Thank you for the suggestion on the second option, I think that is
close to what we intended. Let me add a few more details.

There has been a significant trend toward using virtio for all virtual
interfaces in automotive and other industries for several years now.
While I'm not entirely sure about Windows, all the operating systems we
work with (e.g. Android, RTOSes) are optimizing for virtio interfaces.
The expectation is that guests will either access physical devices or
virtio devices. I mention this in response to the specialized vs.
general approach - virtio is becoming (or has already become) the
standard, at least in automotive and embedded sectors. This is why we
have introduced the new specialized QEMU machine for virtio only on both
ARM and x86. However, you are right that the solution is somewhat
dependent on the QEMU emulation provided, meaning it isn't fully
generalized and may not work with other device models. Let's see if we
can improve this. 

I agree that a single PCI root complex is the cleanest solution from a
Xen perspective. However, aside from the level of effort required, it's
also important to consider QEMU integration. The separate root complex
integrates very well into QEMU's own view of the world, and that is
important too because the more we deviate the more we are at risk of
triggering unwanted bugs in QEMU. Bugs that would only show up in a Xen
configuration and we would responsible to fix. The two PCI RCs approach
is simple because it is low complexity from a QEMU point of view. The
trade-off is having the two PCI RCs exposed to the VM instead of one,
but in our tests two PCI RCs work well on both ARM and even x86. So I
think the two PCI RCs approach is viable. (Also I believe that
technically is a single PCI RC with two host bridges.)

For the second option, I'll let Edgar investigate but I think that would
work, thank you for being flexible. We would still need patches 4-6 from
this series.

Let's assume we'll proceed with patches 4-6 and, as agreed, skip patches
1-2. Then my first thought would be to rely on ImageBuilder to generate
the complete virtio DT node. While I usually like using ImageBuilder,
in this case, I lean toward having Xen generate the domU nodes. There
are a few reasons for this: the partial DTB is typically used for
passthrough and related information, which isn't the case here. Although
ImageBuilder can merge multiple partial DTBs, I think it's best not to
depend on that more delicate feature, for scenarios where a user wants
both a passthrough device and a virtio device.

But we could change the DT properties to be more explicitly related to
an emulated PCI root complex, which could be provided by any device
model and not only QEMU. Also we can avoid saying "virtio" in the
property name because although our use-case is virtio, as you wrote
there is nothing that ties this to virtio today. So what about the
following dom0less device tree properties instead?

secondary-emulated-pci-host-bridge = <ecam_address ecam_size
                                      memory_address memory_size
                                      prefetch_address prefetch_size
                                      irq_start, irq_how_many,
                                      flags>;
one of the special flags could be grants enabled/disabled.

This way:

- The list of interrupts and MMIOs is explicit
- The fact that we are talking about a seconday emulated host bridge is
  explicit, in the description we can say we expect it to be provided by
  an ioreq device model. We can call it secondary-ioreq-pci-host-bridge
- The Xen-generated domU device tree description is still generic and
  reusable. Let's say that someone comes up with a different use-case
  and a different device model but still wants a PCI host bridge, they
  can reuse this. The DomU DT is standard for a generic PCI host bridge.
- We don't need ImageBuilder to generate/edit complex device tree nodes
  and merge partial device trees
Edgar E. Iglesias Sept. 26, 2024, 10:15 p.m. UTC | #12
On Tue, Sep 24, 2024 at 04:55:15PM -0700, Stefano Stabellini wrote:
> On Tue, 24 Sep 2024, Edgar E. Iglesias wrote:
> > From: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > 
> > When virtio-pci is specified in the dom0less domU properties, create a
> > virtio-pci node in the guest's device tree. Set up an mmio handler with
> > a register for the guest to poll when the backend has connected and
> > virtio-pci bus is ready to be probed. Grant tables may be used by
> > specifying virtio-pci = "grants";.
> > 
> > [Edgar: Use GPEX PCI INTX interrupt swizzling (from PCI specs).
> >  Make grants iommu-map cover the entire PCI bus.
> >  Add virtio-pci-ranges to specify memory-map for direct-mapped guests.
> >  Document virtio-pci dom0less fdt bindings.]
> > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > Signed-off-by: Edgar E. Iglesias <edgar.iglesias@amd.com>
> > ---
> >  docs/misc/arm/device-tree/booting.txt |  21 +++
> >  xen/arch/arm/dom0less-build.c         | 238 ++++++++++++++++++++++++++
> >  xen/arch/arm/include/asm/kernel.h     |  15 ++
> >  3 files changed, 274 insertions(+)
> > 
> > diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
> > index 3a04f5c57f..82f3bd7026 100644
> > --- a/docs/misc/arm/device-tree/booting.txt
> > +++ b/docs/misc/arm/device-tree/booting.txt
> > @@ -276,6 +276,27 @@ with the following properties:
> >      passed through. This option is the default if this property is missing
> >      and the user does not provide the device partial device tree for the domain.
> >  
> > +- virtio-pci
> > +
> > +    A string property specifying whether virtio-pci is enabled for the
> > +    domain and if grant table mappings should be used. If no value is set
> > +    this property is treated as a boolean and the same way as if set to
> > +    "enabled".
> > +    Possible property values are:
> > +
> > +    - "enabled"
> > +    Virtio-pci is enabled for the domain.
> > +
> > +    - "grants"
> > +    Virtio-pci is enabled for the domain and an grants IOMMU node will be
> > +    generated in the domains device-tree.
> > +
> > +- virtio-pci-ranges
> > +
> > +    An optional array of 6 u32 values specifying the 2 cells base addresses of
> > +    the ECAM, Memory and Prefetchable-Memory regions for virtio-pci. This is
> > +    useful to avoid memory-map collisions when using direct-mapped guests.
> > +
> >  Under the "xen,domain" compatible node, one or more sub-nodes are present
> >  for the DomU kernel and ramdisk.
> >  
> > diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
> > index 09b65e44ae..dab24fa9e2 100644
> > --- a/xen/arch/arm/dom0less-build.c
> > +++ b/xen/arch/arm/dom0less-build.c
> > @@ -586,6 +586,189 @@ static int __init domain_handle_dtb_bootmodule(struct domain *d,
> >      return res;
> >  }
> >  
> > +static int __init make_virtio_pci_domU_node(const struct kernel_info *kinfo)
> > +{
> > +    void *fdt = kinfo->fdt;
> > +    /* reg is sized to be used for all the needed properties below */
> > +    __be32 reg[(1 + (GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS)
> > +               * 2];
> > +    __be32 irq_map[4 * 4 * 8];
> > +    __be32 *cells;
> > +    char buf[22]; /* pcie@ + max 16 char address + '\0' */
> > +    int res;
> > +    int devfn, intx_pin;
> > +    static const char compat[] = "pci-host-ecam-generic";
> > +    static const char reg_names[] = "ecam";
> > +
> > +    if ( p2m_ipa_bits <= 40 ) {
> > +        printk("PA bits %d is too small!\nvirtio-pci is only supported "
> > +               "on platforms with PA bits > 40\n", p2m_ipa_bits);
> > +        return -EINVAL;
> > +    }
> > +
> > +    snprintf(buf, sizeof(buf), "pcie@%lx", kinfo->virtio_pci.ecam.base);
> > +    dt_dprintk("Create virtio-pci node\n");
> > +    res = fdt_begin_node(fdt, buf);
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property(fdt, "compatible", compat, sizeof(compat));
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property_string(fdt, "device_type", "pci");
> > +    if ( res )
> > +        return res;
> > +
> > +    /* Create reg property */
> > +    cells = &reg[0];
> > +    dt_child_set_range(&cells, GUEST_ROOT_ADDRESS_CELLS, GUEST_ROOT_SIZE_CELLS,
> > +                       kinfo->virtio_pci.ecam.base, GUEST_VIRTIO_PCI_ECAM_SIZE);
> > +
> > +    res = fdt_property(fdt, "reg", reg,
> > +                       (GUEST_ROOT_ADDRESS_CELLS +
> > +                        GUEST_ROOT_SIZE_CELLS) * sizeof(*reg));
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property(fdt, "reg-names", reg_names, sizeof(reg_names));
> > +    if ( res )
> > +        return res;
> > +
> > +    /* Create bus-range property */
> > +    cells = &reg[0];
> > +    dt_set_cell(&cells, 1, 0);
> > +    dt_set_cell(&cells, 1, 0xff);
> > +    res = fdt_property(fdt, "bus-range", reg, 2 * sizeof(*reg));
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property_cell(fdt, "#address-cells", 3);
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property_cell(fdt, "#size-cells", 2);
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property_string(fdt, "status", "okay");
> > +    if ( res )
> > +        return res;
> > +
> > +    /*
> > +     * Create ranges property as:
> > +     * <(PCI bitfield) (PCI address) (CPU address) (Size)>
> > +     */
> > +    cells = &reg[0];
> > +    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_MEM_TYPE);
> > +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
> > +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
> > +    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_MEM_SIZE);
> > +    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_PREFETCH_MEM_TYPE);
> > +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
> > +    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
> > +    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_PREFETCH_MEM_SIZE);
> > +    res = fdt_property(fdt, "ranges", reg, 14 * sizeof(*reg));
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property(fdt, "dma-coherent", "", 0);
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_property_cell(fdt, "#interrupt-cells", 1);
> > +    if ( res )
> > +        return res;
> > +
> > +    /*
> > +     * PCI-to-PCI bridge specification
> > +     * 9.1: Interrupt routing. Table 9-1
> > +     *
> > +     * the PCI Express Base Specification, Revision 2.1
> > +     * 2.2.8.1: INTx interrupt signaling - Rules
> > +     *          the Implementation Note
> > +     *          Table 2-20
> > +     */
> > +    cells = &irq_map[0];
> > +    for (devfn = 0; devfn <= 0x18; devfn += 8) {
> > +        for (intx_pin = 0; intx_pin < 4; intx_pin++) {
> > +            int irq = GUEST_VIRTIO_PCI_SPI_FIRST - 32;
> > +            irq += ((intx_pin + PCI_SLOT(devfn)) % 4);
> > +
> > +            dt_set_cell(&cells, 1, devfn << 8);
> > +            dt_set_cell(&cells, 1, 0);
> > +            dt_set_cell(&cells, 1, 0);
> > +            dt_set_cell(&cells, 1, intx_pin + 1);
> > +            dt_set_cell(&cells, 1, kinfo->phandle_gic);
> > +            /* 3 GIC cells.  */
> > +            dt_set_cell(&cells, 1, 0);
> > +            dt_set_cell(&cells, 1, irq);
> > +            dt_set_cell(&cells, 1, DT_IRQ_TYPE_LEVEL_HIGH);
> > +        }
> > +    }
> > +
> > +    /* Assert we've sized irq_map correctly.  */
> > +    BUG_ON(cells - &irq_map[0] != ARRAY_SIZE(irq_map));
> > +
> > +    res = fdt_property(fdt, "interrupt-map", irq_map, sizeof(irq_map));
> > +    if ( res )
> > +        return res;
> > +
> > +    cells = &reg[0];
> > +    dt_set_cell(&cells, 1, cpu_to_be16(PCI_DEVFN(3, 0)));
> 
> Hi Edgar, what is this PCI_DEVFN(3, 0)  device?


Hi Stefano,

This is for the interrupt-map-mask, since the swizzling pattern
repeats itself for every fourth device, we only need to describe
entries 0 - 3 and can mask out the rest.

I the next version (which will be quite different) I'll try to make
this clearer.


> 
> 
> > +    dt_set_cell(&cells, 1, 0x0);
> > +    dt_set_cell(&cells, 1, 0x0);
> > +    dt_set_cell(&cells, 1, 0x7);
> > +    res = fdt_property(fdt, "interrupt-map-mask", reg, 4 * sizeof(*reg));
> > +    if ( res )
> > +        return res;
> > +
> > +    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
> > +    {
> > +        cells = &reg[0];
> > +        dt_set_cell(&cells, 1, 0x0);
> > +        dt_set_cell(&cells, 1, GUEST_PHANDLE_IOMMU);
> > +        dt_set_cell(&cells, 1, 0x0);
> > +        dt_set_cell(&cells, 1, 0x10000);
> > +        res = fdt_property(fdt, "iommu-map", reg, 4 * sizeof(*reg));
> > +        if ( res )
> > +            return res;
> > +    }
> > +
> > +    res = fdt_property_cell(fdt, "linux,pci-domain", 1);
> > +    if ( res )
> > +        return res;
> > +
> > +    res = fdt_end_node(fdt);
> > +    if ( res )
> > +        return res;
> > +
> > +    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
> > +    {
> > +        snprintf(buf, sizeof(buf), "xen_iommu");
> 
> I don't think underscores are allowed in device tree node names


Will fix, thanks.


> 
> 
> 
> > +        res = fdt_begin_node(fdt, buf);
> > +        if ( res )
> > +            return res;
> > +
> > +        res = fdt_property_string(fdt, "compatible", "xen,grant-dma");
> > +        if ( res )
> > +            return res;
> > +
> > +        res = fdt_property_cell(fdt, "#iommu-cells", 1);
> > +        if ( res )
> > +            return res;
> > +
> > +        res = fdt_property_cell(fdt, "phandle", GUEST_PHANDLE_IOMMU);
> > +        if ( res )
> > +            return res;
> > +
> > +        res = fdt_end_node(fdt);
> > +    }
> > +
> > +    return res;
> > +}
> > +
> >  /*
> >   * The max size for DT is 2MB. However, the generated DT is small (not including
> >   * domU passthrough DT nodes whose size we account separately), 4KB are enough
> > @@ -693,6 +876,13 @@ static int __init prepare_dtb_domU(struct domain *d, struct kernel_info *kinfo)
> >              goto err;
> >      }
> >  
> > +    if ( kinfo->virtio_pci.mode )
> > +    {
> > +        ret = make_virtio_pci_domU_node(kinfo);
> > +        if ( ret )
> > +            goto err;
> > +    }
> > +
> >      ret = fdt_end_node(kinfo->fdt);
> >      if ( ret < 0 )
> >          goto err;
> > @@ -744,11 +934,24 @@ static int __init alloc_xenstore_evtchn(struct domain *d)
> >      return 0;
> >  }
> >  
> > +static u64 combine_u64(u32 v[2])
> 
> Let's make this a static inline or a macro. I can't believe we don't
> have one already.

Yes, I'll try what Stewart suggested.

Cheers,
Edgar
Stewart Hildebrand Oct. 1, 2024, 7:30 p.m. UTC | #13
On 9/24/24 12:23, Edgar E. Iglesias wrote:
> diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
> index 09b65e44ae..dab24fa9e2 100644
> --- a/xen/arch/arm/dom0less-build.c
> +++ b/xen/arch/arm/dom0less-build.c
> @@ -586,6 +586,189 @@ static int __init domain_handle_dtb_bootmodule(struct domain *d,
>      return res;
>  }
>  
> +static int __init make_virtio_pci_domU_node(const struct kernel_info *kinfo)
> +{
> +    void *fdt = kinfo->fdt;
> +    /* reg is sized to be used for all the needed properties below */
> +    __be32 reg[(1 + (GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS)
> +               * 2];
> +    __be32 irq_map[4 * 4 * 8];

Why two separate arrays on the stack? It looks like they're used for the
same purpose, so I think we may as well keep the bigger one and get rid
of the smaller.
diff mbox series

Patch

diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
index 3a04f5c57f..82f3bd7026 100644
--- a/docs/misc/arm/device-tree/booting.txt
+++ b/docs/misc/arm/device-tree/booting.txt
@@ -276,6 +276,27 @@  with the following properties:
     passed through. This option is the default if this property is missing
     and the user does not provide the device partial device tree for the domain.
 
+- virtio-pci
+
+    A string property specifying whether virtio-pci is enabled for the
+    domain and if grant table mappings should be used. If no value is set
+    this property is treated as a boolean and the same way as if set to
+    "enabled".
+    Possible property values are:
+
+    - "enabled"
+    Virtio-pci is enabled for the domain.
+
+    - "grants"
+    Virtio-pci is enabled for the domain and an grants IOMMU node will be
+    generated in the domains device-tree.
+
+- virtio-pci-ranges
+
+    An optional array of 6 u32 values specifying the 2 cells base addresses of
+    the ECAM, Memory and Prefetchable-Memory regions for virtio-pci. This is
+    useful to avoid memory-map collisions when using direct-mapped guests.
+
 Under the "xen,domain" compatible node, one or more sub-nodes are present
 for the DomU kernel and ramdisk.
 
diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c
index 09b65e44ae..dab24fa9e2 100644
--- a/xen/arch/arm/dom0less-build.c
+++ b/xen/arch/arm/dom0less-build.c
@@ -586,6 +586,189 @@  static int __init domain_handle_dtb_bootmodule(struct domain *d,
     return res;
 }
 
+static int __init make_virtio_pci_domU_node(const struct kernel_info *kinfo)
+{
+    void *fdt = kinfo->fdt;
+    /* reg is sized to be used for all the needed properties below */
+    __be32 reg[(1 + (GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS)
+               * 2];
+    __be32 irq_map[4 * 4 * 8];
+    __be32 *cells;
+    char buf[22]; /* pcie@ + max 16 char address + '\0' */
+    int res;
+    int devfn, intx_pin;
+    static const char compat[] = "pci-host-ecam-generic";
+    static const char reg_names[] = "ecam";
+
+    if ( p2m_ipa_bits <= 40 ) {
+        printk("PA bits %d is too small!\nvirtio-pci is only supported "
+               "on platforms with PA bits > 40\n", p2m_ipa_bits);
+        return -EINVAL;
+    }
+
+    snprintf(buf, sizeof(buf), "pcie@%lx", kinfo->virtio_pci.ecam.base);
+    dt_dprintk("Create virtio-pci node\n");
+    res = fdt_begin_node(fdt, buf);
+    if ( res )
+        return res;
+
+    res = fdt_property(fdt, "compatible", compat, sizeof(compat));
+    if ( res )
+        return res;
+
+    res = fdt_property_string(fdt, "device_type", "pci");
+    if ( res )
+        return res;
+
+    /* Create reg property */
+    cells = &reg[0];
+    dt_child_set_range(&cells, GUEST_ROOT_ADDRESS_CELLS, GUEST_ROOT_SIZE_CELLS,
+                       kinfo->virtio_pci.ecam.base, GUEST_VIRTIO_PCI_ECAM_SIZE);
+
+    res = fdt_property(fdt, "reg", reg,
+                       (GUEST_ROOT_ADDRESS_CELLS +
+                        GUEST_ROOT_SIZE_CELLS) * sizeof(*reg));
+    if ( res )
+        return res;
+
+    res = fdt_property(fdt, "reg-names", reg_names, sizeof(reg_names));
+    if ( res )
+        return res;
+
+    /* Create bus-range property */
+    cells = &reg[0];
+    dt_set_cell(&cells, 1, 0);
+    dt_set_cell(&cells, 1, 0xff);
+    res = fdt_property(fdt, "bus-range", reg, 2 * sizeof(*reg));
+    if ( res )
+        return res;
+
+    res = fdt_property_cell(fdt, "#address-cells", 3);
+    if ( res )
+        return res;
+
+    res = fdt_property_cell(fdt, "#size-cells", 2);
+    if ( res )
+        return res;
+
+    res = fdt_property_string(fdt, "status", "okay");
+    if ( res )
+        return res;
+
+    /*
+     * Create ranges property as:
+     * <(PCI bitfield) (PCI address) (CPU address) (Size)>
+     */
+    cells = &reg[0];
+    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_MEM_TYPE);
+    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
+    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.mem.base);
+    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_MEM_SIZE);
+    dt_set_cell(&cells, 1, GUEST_VIRTIO_PCI_PREFETCH_MEM_TYPE);
+    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
+    dt_set_cell(&cells, GUEST_ROOT_ADDRESS_CELLS, kinfo->virtio_pci.pf_mem.base);
+    dt_set_cell(&cells, GUEST_ROOT_SIZE_CELLS, GUEST_VIRTIO_PCI_PREFETCH_MEM_SIZE);
+    res = fdt_property(fdt, "ranges", reg, 14 * sizeof(*reg));
+    if ( res )
+        return res;
+
+    res = fdt_property(fdt, "dma-coherent", "", 0);
+    if ( res )
+        return res;
+
+    res = fdt_property_cell(fdt, "#interrupt-cells", 1);
+    if ( res )
+        return res;
+
+    /*
+     * PCI-to-PCI bridge specification
+     * 9.1: Interrupt routing. Table 9-1
+     *
+     * the PCI Express Base Specification, Revision 2.1
+     * 2.2.8.1: INTx interrupt signaling - Rules
+     *          the Implementation Note
+     *          Table 2-20
+     */
+    cells = &irq_map[0];
+    for (devfn = 0; devfn <= 0x18; devfn += 8) {
+        for (intx_pin = 0; intx_pin < 4; intx_pin++) {
+            int irq = GUEST_VIRTIO_PCI_SPI_FIRST - 32;
+            irq += ((intx_pin + PCI_SLOT(devfn)) % 4);
+
+            dt_set_cell(&cells, 1, devfn << 8);
+            dt_set_cell(&cells, 1, 0);
+            dt_set_cell(&cells, 1, 0);
+            dt_set_cell(&cells, 1, intx_pin + 1);
+            dt_set_cell(&cells, 1, kinfo->phandle_gic);
+            /* 3 GIC cells.  */
+            dt_set_cell(&cells, 1, 0);
+            dt_set_cell(&cells, 1, irq);
+            dt_set_cell(&cells, 1, DT_IRQ_TYPE_LEVEL_HIGH);
+        }
+    }
+
+    /* Assert we've sized irq_map correctly.  */
+    BUG_ON(cells - &irq_map[0] != ARRAY_SIZE(irq_map));
+
+    res = fdt_property(fdt, "interrupt-map", irq_map, sizeof(irq_map));
+    if ( res )
+        return res;
+
+    cells = &reg[0];
+    dt_set_cell(&cells, 1, cpu_to_be16(PCI_DEVFN(3, 0)));
+    dt_set_cell(&cells, 1, 0x0);
+    dt_set_cell(&cells, 1, 0x0);
+    dt_set_cell(&cells, 1, 0x7);
+    res = fdt_property(fdt, "interrupt-map-mask", reg, 4 * sizeof(*reg));
+    if ( res )
+        return res;
+
+    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
+    {
+        cells = &reg[0];
+        dt_set_cell(&cells, 1, 0x0);
+        dt_set_cell(&cells, 1, GUEST_PHANDLE_IOMMU);
+        dt_set_cell(&cells, 1, 0x0);
+        dt_set_cell(&cells, 1, 0x10000);
+        res = fdt_property(fdt, "iommu-map", reg, 4 * sizeof(*reg));
+        if ( res )
+            return res;
+    }
+
+    res = fdt_property_cell(fdt, "linux,pci-domain", 1);
+    if ( res )
+        return res;
+
+    res = fdt_end_node(fdt);
+    if ( res )
+        return res;
+
+    if ( kinfo->virtio_pci.mode == VIRTIO_PCI_GRANTS )
+    {
+        snprintf(buf, sizeof(buf), "xen_iommu");
+
+        res = fdt_begin_node(fdt, buf);
+        if ( res )
+            return res;
+
+        res = fdt_property_string(fdt, "compatible", "xen,grant-dma");
+        if ( res )
+            return res;
+
+        res = fdt_property_cell(fdt, "#iommu-cells", 1);
+        if ( res )
+            return res;
+
+        res = fdt_property_cell(fdt, "phandle", GUEST_PHANDLE_IOMMU);
+        if ( res )
+            return res;
+
+        res = fdt_end_node(fdt);
+    }
+
+    return res;
+}
+
 /*
  * The max size for DT is 2MB. However, the generated DT is small (not including
  * domU passthrough DT nodes whose size we account separately), 4KB are enough
@@ -693,6 +876,13 @@  static int __init prepare_dtb_domU(struct domain *d, struct kernel_info *kinfo)
             goto err;
     }
 
+    if ( kinfo->virtio_pci.mode )
+    {
+        ret = make_virtio_pci_domU_node(kinfo);
+        if ( ret )
+            goto err;
+    }
+
     ret = fdt_end_node(kinfo->fdt);
     if ( ret < 0 )
         goto err;
@@ -744,11 +934,24 @@  static int __init alloc_xenstore_evtchn(struct domain *d)
     return 0;
 }
 
+static u64 combine_u64(u32 v[2])
+{
+    u64 v64;
+
+    v64 = v[0];
+    v64 <<= 32;
+    v64 |= v[1];
+    return v64;
+}
+
 static int __init construct_domU(struct domain *d,
                                  const struct dt_device_node *node)
 {
     struct kernel_info kinfo = KERNEL_INFO_INIT;
     const char *dom0less_enhanced;
+    const char *virtio_pci;
+    /* virtio-pci ECAM, MEM, PF-MEM each carrying 2 x Address cells.  */
+    u32 virtio_pci_ranges[3 * 2];
     int rc;
     u64 mem;
     u32 p2m_mem_mb;
@@ -779,6 +982,41 @@  static int __init construct_domU(struct domain *d,
 
     kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
 
+    rc = dt_property_read_string(node, "virtio-pci", &virtio_pci);
+    if ( !rc )
+    {
+        if ( !strcmp(virtio_pci, "enabled") )
+            kinfo.virtio_pci.mode = VIRTIO_PCI;
+        else if ( !strcmp(virtio_pci, "grants") )
+            kinfo.virtio_pci.mode = VIRTIO_PCI_GRANTS;
+        else
+        {
+            printk("Invalid \"virtio-pci\" property value (%s)\n", virtio_pci);
+            return -EINVAL;
+        }
+    }
+    else if ( rc == -ENODATA )
+    {
+        /* Handle missing property value */
+        kinfo.virtio_pci.mode = dt_property_read_bool(node, "virtio-pci");
+    }
+
+    if ( kinfo.virtio_pci.mode != VIRTIO_PCI_NONE )
+    {
+        rc = dt_property_read_u32_array(node, "virtio-pci-ranges",
+                                        virtio_pci_ranges,
+                                        ARRAY_SIZE(virtio_pci_ranges));
+        if ( rc == 0 ) {
+            kinfo.virtio_pci.ecam.base = combine_u64(&virtio_pci_ranges[0]);
+            kinfo.virtio_pci.mem.base = combine_u64(&virtio_pci_ranges[2]);
+            kinfo.virtio_pci.pf_mem.base = combine_u64(&virtio_pci_ranges[4]);
+        } else {
+            kinfo.virtio_pci.ecam.base = GUEST_VIRTIO_PCI_ECAM_BASE;
+            kinfo.virtio_pci.mem.base = GUEST_VIRTIO_PCI_MEM_BASE;
+            kinfo.virtio_pci.pf_mem.base = GUEST_VIRTIO_PCI_PREFETCH_MEM_BASE;
+        }
+    }
+
     rc = dt_property_read_string(node, "xen,enhanced", &dom0less_enhanced);
     if ( rc == -EILSEQ ||
          rc == -ENODATA ||
diff --git a/xen/arch/arm/include/asm/kernel.h b/xen/arch/arm/include/asm/kernel.h
index 7e6e3c82a4..2dab2ac88f 100644
--- a/xen/arch/arm/include/asm/kernel.h
+++ b/xen/arch/arm/include/asm/kernel.h
@@ -29,6 +29,13 @@ 
 #define DOM0LESS_XENSTORE        BIT(1, U)
 #define DOM0LESS_ENHANCED        (DOM0LESS_ENHANCED_NO_XS | DOM0LESS_XENSTORE)
 
+/* virtio-pci types */
+enum virtio_pci_type {
+    VIRTIO_PCI_NONE,
+    VIRTIO_PCI,
+    VIRTIO_PCI_GRANTS,
+};
+
 struct kernel_info {
 #ifdef CONFIG_ARM_64
     enum domain_type type;
@@ -62,6 +69,14 @@  struct kernel_info {
     /* Enable/Disable PV drivers interfaces */
     uint16_t dom0less_feature;
 
+    struct {
+        enum virtio_pci_type mode;
+        struct {
+            u64 base;
+        } ecam, mem, pf_mem;
+        u32 pci_intx_irq_base;
+    } virtio_pci;
+
     /* GIC phandle */
     uint32_t phandle_gic;