diff mbox series

[v1,1/4] vfio: new command line params for device memory NUMA nodes

Message ID 20230915024559.6565-2-ankita@nvidia.com (mailing list archive)
State New, archived
Headers show
Series vfio: report NUMA nodes for device memory | expand

Commit Message

Ankit Agrawal Sept. 15, 2023, 2:45 a.m. UTC
From: Ankit Agrawal <ankita@nvidia.com>

The CPU cache coherent device memory can be added as a set of
NUMA nodes distinct from the system memory nodes. QEMU currently
does not provide a mechanism to support node creation for a vfio-pci
device.

Introduce new command line parameters to allow the host admin to
provide the desired starting NUMA node id (pxm-ns) and the number of
such nodes (pxm-nc) associated with the device. In this implementation,
numerically consecutive nodes from pxm-ns to pxm-ns + pxm-nc
are created. Also validate the requested range of nodes to check
for conflicts with other nodes and to ensure that the ids do not
exceed the QEMU limit.

Since QEMU's SRAT and DSDT builder code needs the proximity
domain (PXM) id range, expose the PXM start and count as device object
properties.

The device driver module communicates support for such a feature through
sysfs. Check for the presence of the feature to activate the code.

E.g. the following argument adds 8 PXM nodes starting from id 0x10.
-device vfio-pci-nohotplug,host=<pci-bdf>,pxm-ns=0x10,pxm-nc=8

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 hw/vfio/pci.c               | 144 ++++++++++++++++++++++++++++++++++++
 hw/vfio/pci.h               |   2 +
 include/hw/pci/pci_device.h |   3 +
 3 files changed, 149 insertions(+)

Comments

Jonathan Cameron Sept. 15, 2023, 2:25 p.m. UTC | #1
On Thu, 14 Sep 2023 19:45:56 -0700
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The CPU cache coherent device memory can be added as a set of
> NUMA nodes distinct from the system memory nodes. The Qemu currently
> do not provide a mechanism to support node creation for a vfio-pci
> device.
> 
> Introduce new command line parameters to allow host admin provide
> the desired starting NUMA node id (pxm-ns) and the number of such
> nodes (pxm-nc) associated with the device. In this implementation,
> a numerically consecutive nodes from pxm-ns to pxm-ns + pxm-nc
> is created. Also validate the requested range of nodes to check


Hi Ankit,

That's not a particularly intuitive bit of interface!

pxm-start
pxm-number
perhaps?  However, in QEMU command lines the node terminology is used so
numa-node-start
numa-node-number

Though in general this feels backwards compared to how the rest of
the numa definition is currently done.

Could the current interface be extended.

-numa node,vfio-device=X

at the cost of a bit of house keeping and lookup.

We need something similar for generic initiators, so maybe
vfio-mem=X? (might not even need to be vfio specific - even
if we only support it for now on VFIO devices).
leaving
initiator=X available for later...

Also, good to say why multiple nodes per device are needed.

Jonathan

> for conflict with other nodes and to ensure that the id do not cross
> QEMU limit.
> 
> Since the QEMU's SRAT and DST builder code needs the proximity
> domain (PXM) id range, expose PXM start and count as device object
> properties.
> 
> The device driver module communicates support for such feature through
> sysfs. Check the presence of the feature to activate the code.
> 
> E.g. the following argument adds 8 PXM nodes starting from id 0x10.
> -device vfio-pci-nohotplug,host=<pci-bdf>,pxm-ns=0x10,pxm-nc=8
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  hw/vfio/pci.c               | 144 ++++++++++++++++++++++++++++++++++++
>  hw/vfio/pci.h               |   2 +
>  include/hw/pci/pci_device.h |   3 +
>  3 files changed, 149 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a205c6b113..cc0c516161 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -42,6 +42,8 @@
>  #include "qapi/error.h"
>  #include "migration/blocker.h"
>  #include "migration/qemu-file.h"
> +#include "qapi/visitor.h"
> +#include "include/hw/boards.h"
>  
>  #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
>  
> @@ -2955,6 +2957,22 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
>      }
>  }
>  
> +static void vfio_pci_get_dev_mem_pxm_start(Object *obj, Visitor *v,
> +                                           const char *name,
> +                                           void *opaque, Error **errp)
> +{
> +    uint64_t pxm_start = (uintptr_t) opaque;
> +    visit_type_uint64(v, name, &pxm_start, errp);
> +}
> +
> +static void vfio_pci_get_dev_mem_pxm_count(Object *obj, Visitor *v,
> +                                           const char *name,
> +                                           void *opaque, Error **errp)
> +{
> +    uint64_t pxm_count = (uintptr_t) opaque;
> +    visit_type_uint64(v, name, &pxm_count, errp);
> +}
> +
>  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>  {
>      Error *err = NULL;
> @@ -2974,6 +2992,125 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>      vdev->req_enabled = false;
>  }
>  
> +static int validate_dev_numa(uint32_t dev_node_start, uint32_t num_nodes)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +    unsigned int i;
> +
> +    if (num_nodes >= MAX_NODES) {
> +        return -EINVAL;
> +    }
> +
> +    for (i = 0; i < num_nodes; i++) {
> +        if (ms->numa_state->nodes[dev_node_start + i].present) {
> +            return -EBUSY;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int mark_dev_node_present(uint32_t dev_node_start, uint32_t num_nodes)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +    unsigned int i;
> +
> +    for (i = 0; i < num_nodes; i++) {
> +        ms->numa_state->nodes[dev_node_start + i].present = true;
> +    }
> +
> +    return 0;
> +}
> +
> +
> +static bool vfio_pci_read_cohmem_support_sysfs(VFIODevice *vdev)
> +{
> +    gchar *contents = NULL;
> +    gsize length;
> +    char *path;
> +    bool ret = false;
> +    uint32_t supported;
> +
> +    path = g_strdup_printf("%s/coherent_mem", vdev->sysfsdev);

If someone has asked for it, why should we care if they specify it on a
device where it doesn't do anything useful?  This to me feels like something
to check at a higher level of the stack.

> +    if (g_file_get_contents(path, &contents, &length, NULL) && length > 0) {
> +        if ((sscanf(contents, "%u", &supported) == 1) && supported) {
> +            ret = true;
> +        }
> +    }
> +
> +    if (length) {
> +        g_free(contents);

g_autofree will clean this up for you I think

> +    }
> +    g_free(path);

and this.

> +
> +    return ret;
> +}
> +
> +static int vfio_pci_dev_mem_probe(VFIOPCIDevice *vPciDev,
> +                                         Error **errp)
> +{
> +    Object *obj = NULL;
> +    VFIODevice *vdev = &vPciDev->vbasedev;
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +    int ret = 0;
> +    uint32_t dev_node_start = vPciDev->dev_node_start;
> +    uint32_t dev_node_count = vPciDev->dev_nodes;
> +
> +    if (!vdev->sysfsdev || !vfio_pci_read_cohmem_support_sysfs(vdev)) {
> +        ret = -ENODEV;
return -ENODEV; 

and similar in all the other cases as no cleanup to do.

> +        goto done;
> +    }
> +
> +    if (vdev->type == VFIO_DEVICE_TYPE_PCI) {

nicer to handle one condition at a time.

    if (vdev->type != VFIO_DEVICE_TYPE_PCI) {
        return -EINVAL;
    }

    obj = vfio_pci_get_object(vdev);
this can't fail

Also get rid of assigning it to NULL above.

   if (DEVICE_CLASS(object...)) {
      return -EINVAL;
   }


> +        obj = vfio_pci_get_object(vdev);
> +    }
> +
> +    /* Since this device creates new NUMA node, hotplug is not supported. */
> +    if (!obj || DEVICE_CLASS(object_get_class(obj))->hotpluggable) {
> +        ret = -EINVAL;
> +        goto done;
> +    }
> +
> +    /*
> +     * This device has memory that is coherently accessible from the CPU.
> +     * The memory can be represented seperate memory-only NUMA nodes.
> +     */
> +    vPciDev->pdev.has_coherent_memory = true;
> +
> +    /*
> +     * The device can create several NUMA nodes with consecutive IDs
> +     * from dev_node_start to dev_node_start + dev_node_count.
> +     * Verify
> +     * - whether any node ID is occupied in the desired range.
> +     * - Node ID is not crossing MAX_NODE.
> +     */
> +    ret = validate_dev_numa(dev_node_start, dev_node_count);
> +    if (ret) {
> +        goto done;
        return ret;

> +    }
> +
> +    /* Reserve the node by marking as present */
> +    mark_dev_node_present(dev_node_start, dev_node_count);
> +
> +    /*
> +     * To have multiple unique nodes in the VM, a series of PXM nodes are
> +     * required to be added to VM's SRAT. Send the information about the
> +     * starting node ID and the node count to the ACPI builder code.
> +     */
> +    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_start", "uint64",
> +                        vfio_pci_get_dev_mem_pxm_start, NULL, NULL,
> +                        (void *) (uintptr_t) dev_node_start);
> +
> +    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_count", "uint64",
> +                        vfio_pci_get_dev_mem_pxm_count, NULL, NULL,
> +                        (void *) (uintptr_t) dev_node_count);
> +
> +    ms->numa_state->num_nodes += dev_node_count;
> +
> +done:
> +    return ret;
> +}
> +
>  static void vfio_realize(PCIDevice *pdev, Error **errp)
>  {
>      VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> @@ -3291,6 +3428,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>          }
>      }
>  
> +    ret = vfio_pci_dev_mem_probe(vdev, errp);
> +    if (ret && ret != -ENODEV) {
> +        error_report("Failed to setup device memory with error %d", ret);
> +    }
> +
>      vfio_register_err_notifier(vdev);
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
> @@ -3454,6 +3596,8 @@ static Property vfio_pci_dev_properties[] = {
>      DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
>                         sub_device_id, PCI_ANY_ID),
>      DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
> +    DEFINE_PROP_UINT32("pxm-ns", VFIOPCIDevice, dev_node_start, 0),
> +    DEFINE_PROP_UINT32("pxm-nc", VFIOPCIDevice, dev_nodes, 0),
>      DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
>                                     nv_gpudirect_clique,
>                                     qdev_prop_nv_gpudirect_clique, uint8_t),
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index a2771b9ff3..eef5ddfd06 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -158,6 +158,8 @@ struct VFIOPCIDevice {
>      uint32_t display_yres;
>      int32_t bootindex;
>      uint32_t igd_gms;
> +    uint32_t dev_node_start;
> +    uint32_t dev_nodes;
>      OffAutoPCIBAR msix_relo;
>      uint8_t pm_cap;
>      uint8_t nv_gpudirect_clique;
> diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> index d3dd0f64b2..aacd2279ae 100644
> --- a/include/hw/pci/pci_device.h
> +++ b/include/hw/pci/pci_device.h
> @@ -157,6 +157,9 @@ struct PCIDevice {
>      MSIVectorReleaseNotifier msix_vector_release_notifier;
>      MSIVectorPollNotifier msix_vector_poll_notifier;
>  
> +    /* GPU coherent memory */
> +    bool has_coherent_memory;
> +
>      /* ID of standby device in net_failover pair */
>      char *failover_pair_id;
>      uint32_t acpi_index;
Igor Mammedov Sept. 15, 2023, 2:48 p.m. UTC | #2
On Fri, 15 Sep 2023 15:25:09 +0100
Jonathan Cameron via <qemu-devel@nongnu.org> wrote:

> On Thu, 14 Sep 2023 19:45:56 -0700
> <ankita@nvidia.com> wrote:
> 
> > From: Ankit Agrawal <ankita@nvidia.com>
> > 
> > The CPU cache coherent device memory can be added as a set of
> > NUMA nodes distinct from the system memory nodes. The Qemu currently
> > do not provide a mechanism to support node creation for a vfio-pci
> > device.
> > 
> > Introduce new command line parameters to allow host admin provide
> > the desired starting NUMA node id (pxm-ns) and the number of such
> > nodes (pxm-nc) associated with the device. In this implementation,
> > a numerically consecutive nodes from pxm-ns to pxm-ns + pxm-nc
> > is created. Also validate the requested range of nodes to check  
> 
> 
> Hi Ankit,
> 
> That's not a particularly intuitive bit of interface!
> 
> pxm-start
> pxm-number
> perhaps?  However, in QEMU commmand lines the node terminology is used so
> numa-node-start
> numa-node-number
> 
> Though in general this feels backwards compared to how the rest of
> the numa definition is currently done.

QEMU already has the means to assign NUMA node affinity
to PCI hierarchies in a generic way by using a PXB (PCI expander
bridge) per node (also done 'backwards') by setting the node option on it.
So every device behind it should belong to that node as well,
and the guest OS shall pick up device affinity from the PCI tree it belongs to.
(I'd suspect that CXL is supposed to work the same way.)

PS:
But then, I don't know much about PCI
(ccing Michael)

> 
> Could the current interface be extended.
> 
> -numa node,vfio-device=X
> 
> at the cost of a bit of house keeping and lookup.
> 
> We need something similar for generic initiators, so maybe
> vfio-mem=X? (might not even need to be vfio specific - even
> if we only support it for now on VFIO devices).
> leaving
> initiator=X available for later...
> 
> Also, good to say why multiple nodes per device are needed.
> 
> Jonathan
> 
> > for conflict with other nodes and to ensure that the id do not cross
> > QEMU limit.
> > 
> > Since the QEMU's SRAT and DST builder code needs the proximity
> > domain (PXM) id range, expose PXM start and count as device object
> > properties.
> > 
> > The device driver module communicates support for such feature through
> > sysfs. Check the presence of the feature to activate the code.
> > 
> > E.g. the following argument adds 8 PXM nodes starting from id 0x10.
> > -device vfio-pci-nohotplug,host=<pci-bdf>,pxm-ns=0x10,pxm-nc=8
> > 
> > Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> > ---
> >  hw/vfio/pci.c               | 144 ++++++++++++++++++++++++++++++++++++
> >  hw/vfio/pci.h               |   2 +
> >  include/hw/pci/pci_device.h |   3 +
> >  3 files changed, 149 insertions(+)
> > 
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index a205c6b113..cc0c516161 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -42,6 +42,8 @@
> >  #include "qapi/error.h"
> >  #include "migration/blocker.h"
> >  #include "migration/qemu-file.h"
> > +#include "qapi/visitor.h"
> > +#include "include/hw/boards.h"
> >  
> >  #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
> >  
> > @@ -2955,6 +2957,22 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
> >      }
> >  }
> >  
> > +static void vfio_pci_get_dev_mem_pxm_start(Object *obj, Visitor *v,
> > +                                           const char *name,
> > +                                           void *opaque, Error **errp)
> > +{
> > +    uint64_t pxm_start = (uintptr_t) opaque;
> > +    visit_type_uint64(v, name, &pxm_start, errp);
> > +}
> > +
> > +static void vfio_pci_get_dev_mem_pxm_count(Object *obj, Visitor *v,
> > +                                           const char *name,
> > +                                           void *opaque, Error **errp)
> > +{
> > +    uint64_t pxm_count = (uintptr_t) opaque;
> > +    visit_type_uint64(v, name, &pxm_count, errp);
> > +}
> > +
> >  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> >  {
> >      Error *err = NULL;
> > @@ -2974,6 +2992,125 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> >      vdev->req_enabled = false;
> >  }
> >  
> > +static int validate_dev_numa(uint32_t dev_node_start, uint32_t num_nodes)
> > +{
> > +    MachineState *ms = MACHINE(qdev_get_machine());
> > +    unsigned int i;
> > +
> > +    if (num_nodes >= MAX_NODES) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    for (i = 0; i < num_nodes; i++) {
> > +        if (ms->numa_state->nodes[dev_node_start + i].present) {
> > +            return -EBUSY;
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int mark_dev_node_present(uint32_t dev_node_start, uint32_t num_nodes)
> > +{
> > +    MachineState *ms = MACHINE(qdev_get_machine());
> > +    unsigned int i;
> > +
> > +    for (i = 0; i < num_nodes; i++) {
> > +        ms->numa_state->nodes[dev_node_start + i].present = true;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +
> > +static bool vfio_pci_read_cohmem_support_sysfs(VFIODevice *vdev)
> > +{
> > +    gchar *contents = NULL;
> > +    gsize length;
> > +    char *path;
> > +    bool ret = false;
> > +    uint32_t supported;
> > +
> > +    path = g_strdup_printf("%s/coherent_mem", vdev->sysfsdev);  
> 
> If someone has asked for it, why should we care if they specify it on a
> device where it doesn't do anything useful?  This to me feels like something
> to check at a higher level of the stack.
> 
> > +    if (g_file_get_contents(path, &contents, &length, NULL) && length > 0) {
> > +        if ((sscanf(contents, "%u", &supported) == 1) && supported) {
> > +            ret = true;
> > +        }
> > +    }
> > +
> > +    if (length) {
> > +        g_free(contents);  
> 
> g_autofree will clean this up for you I think
> 
> > +    }
> > +    g_free(path);  
> 
> and this.
> 
> > +
> > +    return ret;
> > +}
> > +
> > +static int vfio_pci_dev_mem_probe(VFIOPCIDevice *vPciDev,
> > +                                         Error **errp)
> > +{
> > +    Object *obj = NULL;
> > +    VFIODevice *vdev = &vPciDev->vbasedev;
> > +    MachineState *ms = MACHINE(qdev_get_machine());
> > +    int ret = 0;
> > +    uint32_t dev_node_start = vPciDev->dev_node_start;
> > +    uint32_t dev_node_count = vPciDev->dev_nodes;
> > +
> > +    if (!vdev->sysfsdev || !vfio_pci_read_cohmem_support_sysfs(vdev)) {
> > +        ret = -ENODEV;  
> return -ENODEV; 
> 
> and similar in all the other cases as no cleanup to do.
> 
> > +        goto done;
> > +    }
> > +
> > +    if (vdev->type == VFIO_DEVICE_TYPE_PCI) {  
> 
> nicer to handle one condition at a time.
> 
>     if (vdev->type != VFIO_DEVICE_TYPE_PCI) {
>         return -EINVAL;
>     }
> 
>     obj = vfio_pci_get_object(vdev);
> this can't fail
> 
> Also get rid of assigning it to NULL above.
> 
>    if (DEVICE_CLASS(object...)) {
>       return -EINVAL;
>    }
> 
> 
> > +        obj = vfio_pci_get_object(vdev);
> > +    }
> > +
> > +    /* Since this device creates new NUMA node, hotplug is not supported. */
> > +    if (!obj || DEVICE_CLASS(object_get_class(obj))->hotpluggable) {
> > +        ret = -EINVAL;
> > +        goto done;
> > +    }
> > +
> > +    /*
> > +     * This device has memory that is coherently accessible from the CPU.
> > +     * The memory can be represented seperate memory-only NUMA nodes.
> > +     */
> > +    vPciDev->pdev.has_coherent_memory = true;
> > +
> > +    /*
> > +     * The device can create several NUMA nodes with consecutive IDs
> > +     * from dev_node_start to dev_node_start + dev_node_count.
> > +     * Verify
> > +     * - whether any node ID is occupied in the desired range.
> > +     * - Node ID is not crossing MAX_NODE.
> > +     */
> > +    ret = validate_dev_numa(dev_node_start, dev_node_count);
> > +    if (ret) {
> > +        goto done;  
>         return ret;
> 
> > +    }
> > +
> > +    /* Reserve the node by marking as present */
> > +    mark_dev_node_present(dev_node_start, dev_node_count);
> > +
> > +    /*
> > +     * To have multiple unique nodes in the VM, a series of PXM nodes are
> > +     * required to be added to VM's SRAT. Send the information about the
> > +     * starting node ID and the node count to the ACPI builder code.
> > +     */
> > +    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_start", "uint64",
> > +                        vfio_pci_get_dev_mem_pxm_start, NULL, NULL,
> > +                        (void *) (uintptr_t) dev_node_start);
> > +
> > +    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_count", "uint64",
> > +                        vfio_pci_get_dev_mem_pxm_count, NULL, NULL,
> > +                        (void *) (uintptr_t) dev_node_count);
> > +
> > +    ms->numa_state->num_nodes += dev_node_count;
> > +
> > +done:
> > +    return ret;
> > +}
> > +
> >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> >  {
> >      VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> > @@ -3291,6 +3428,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> >          }
> >      }
> >  
> > +    ret = vfio_pci_dev_mem_probe(vdev, errp);
> > +    if (ret && ret != -ENODEV) {
> > +        error_report("Failed to setup device memory with error %d", ret);
> > +    }
> > +
> >      vfio_register_err_notifier(vdev);
> >      vfio_register_req_notifier(vdev);
> >      vfio_setup_resetfn_quirk(vdev);
> > @@ -3454,6 +3596,8 @@ static Property vfio_pci_dev_properties[] = {
> >      DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
> >                         sub_device_id, PCI_ANY_ID),
> >      DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
> > +    DEFINE_PROP_UINT32("pxm-ns", VFIOPCIDevice, dev_node_start, 0),
> > +    DEFINE_PROP_UINT32("pxm-nc", VFIOPCIDevice, dev_nodes, 0),
> >      DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
> >                                     nv_gpudirect_clique,
> >                                     qdev_prop_nv_gpudirect_clique, uint8_t),
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > index a2771b9ff3..eef5ddfd06 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -158,6 +158,8 @@ struct VFIOPCIDevice {
> >      uint32_t display_yres;
> >      int32_t bootindex;
> >      uint32_t igd_gms;
> > +    uint32_t dev_node_start;
> > +    uint32_t dev_nodes;
> >      OffAutoPCIBAR msix_relo;
> >      uint8_t pm_cap;
> >      uint8_t nv_gpudirect_clique;
> > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > index d3dd0f64b2..aacd2279ae 100644
> > --- a/include/hw/pci/pci_device.h
> > +++ b/include/hw/pci/pci_device.h
> > @@ -157,6 +157,9 @@ struct PCIDevice {
> >      MSIVectorReleaseNotifier msix_vector_release_notifier;
> >      MSIVectorPollNotifier msix_vector_poll_notifier;
> >  
> > +    /* GPU coherent memory */
> > +    bool has_coherent_memory;
> > +
> >      /* ID of standby device in net_failover pair */
> >      char *failover_pair_id;
> >      uint32_t acpi_index;  
> 
>
Ankit Agrawal Sept. 22, 2023, 5:44 a.m. UTC | #3
> Also, good to say why multiple nodes per device are needed.
This is to support the GPU's MIG (Multi-Instance GPU) feature
(https://www.nvidia.com/en-in/technologies/multi-instance-gpu/), which
allows partitioning of the GPU device resources (including device memory) into
several isolated instances. We are creating multiple NUMA nodes to give
each partition its own node. Now the partitions are not fixed and they
can be created/deleted and resized (in memory) at runtime. This is
the reason these nodes are tagged as MEM_AFFINITY_HOTPLUGGABLE. Such a
setting allows flexibility in the VM to associate a desired partition/range
of device memory with a node (that is adjustable). Note that we are replicating
the behavior on bare metal here.

I will also put this detail on the cover letter in the next version.

> QEMU have already means to assign NUMA node affinity
> to PCI hierarchies in generic way by using a PBX per node
> (also done 'backwards') by setting node option on it.
> So every device behind it should belong to that node as well
> and guest OS shall pickup device affinity from PCI tree it belongs to.

Yes, but the problem is that only one node may be associated this way
and we have several.
Jonathan Cameron Sept. 25, 2023, 2:08 p.m. UTC | #4
On Fri, 15 Sep 2023 16:48:41 +0200
Igor Mammedov <imammedo@redhat.com> wrote:

> On Fri, 15 Sep 2023 15:25:09 +0100
> Jonathan Cameron via <qemu-devel@nongnu.org> wrote:
> 
> > On Thu, 14 Sep 2023 19:45:56 -0700
> > <ankita@nvidia.com> wrote:
> >   
> > > From: Ankit Agrawal <ankita@nvidia.com>
> > > 
> > > The CPU cache coherent device memory can be added as a set of
> > > NUMA nodes distinct from the system memory nodes. The Qemu currently
> > > do not provide a mechanism to support node creation for a vfio-pci
> > > device.
> > > 
> > > Introduce new command line parameters to allow host admin provide
> > > the desired starting NUMA node id (pxm-ns) and the number of such
> > > nodes (pxm-nc) associated with the device. In this implementation,
> > > a numerically consecutive nodes from pxm-ns to pxm-ns + pxm-nc
> > > is created. Also validate the requested range of nodes to check    
> > 
> > 
> > Hi Ankit,
> > 
> > That's not a particularly intuitive bit of interface!
> > 
> > pxm-start
> > pxm-number
> > perhaps?  However, in QEMU commmand lines the node terminology is used so
> > numa-node-start
> > numa-node-number
> > 
> > Though in general this feels backwards compared to how the rest of
> > the numa definition is currently done.  
> 
> QEMU have already means to assign NUMA node affinity
> to PCI hierarchies in generic way by using a PBX per node
> (also done 'backwards') by setting node option on it.
> So every device behind it should belong to that node as well
> and guest OS shall pickup device affinity from PCI tree it belongs to.
> (I'd suspect that CXL is supposed to work the same way).

Whilst QEMU doesn't yet support it (I think), there is nothing stopping
people using ACPI to assign _PXM to PCI devices. The early CXL specs specifically
call out that you should do this (before things were more discoverable). Note that the
node has to be defined in SRAT though. For most PCI devices that means
using a Generic Initiator entry.

CXL 2+ has some extra toys to play with, some of which apply to PCI as well:
* Generic Port Affinity Structures in ACPI.  The idea is you can
  define latency and bandwidth to the head of a discoverable PCI/CXL topology.
* Discoverable properties for all the CXL components via a mailbox interface:
  - Latency and bandwidth across switches
  - Latency and bandwidth to separate regions of memory.
  Note that whilst it came from CXL that stuff is in a UEFI maintained spec
  so 'could' be implemented on non CXL Devices.  See the CDAT specification.

Idea is that the OS can discover actual properties and then create appropriate
NUMA description if it wants to.

The Linux kernel currently relies on what I consider a hack: We also
have host memory windows (CXL Fixed Memory Windows) that have QoS associated
with them along with interleave etc. So a rough assumption is that if you map
CXL memory into a particular CFMWS then it is good enough to group it with
other memory mapped in that CFMWS and use a single NUMA node to describe it.
Hence the kernel creates a NUMA node per CFMWS entry and the properties can
be established by exploring the route to one of the devices being accessed
(and interleave etc).

This is an approximation I think we'll need to fix long term, but it seems
good enough for first generation or two of CXL devices where people tend
not to mix devices of different performances, or put some but not all of
them behind a switch.

So the problem for the devices being discussed here is to either fix
it properly (figure out how to do dynamic creation of NUMA nodes in Linux)
or find a good proxy to ensure enough spare nodes can be created early in boot.
That's early enough that it pretty much has to come from a static ACPI table.

Jonathan

> 
> PS:
> But then, I don't know much about PCI
> (ccing Michael)
> 
> > 
> > Could the current interface be extended.
> > 
> > -numa node,vfio-device=X
> > 
> > at the cost of a bit of house keeping and lookup.
> > 
> > We need something similar for generic initiators, so maybe
> > vfio-mem=X? (might not even need to be vfio specific - even
> > if we only support it for now on VFIO devices).
> > leaving
> > initiator=X available for later...
> > 
> > Also, good to say why multiple nodes per device are needed.
> > 
> > Jonathan
> >   
> > > for conflict with other nodes and to ensure that the id do not cross
> > > QEMU limit.
> > > 
> > > Since the QEMU's SRAT and DST builder code needs the proximity
> > > domain (PXM) id range, expose PXM start and count as device object
> > > properties.
> > > 
> > > The device driver module communicates support for such feature through
> > > sysfs. Check the presence of the feature to activate the code.
> > > 
> > > E.g. the following argument adds 8 PXM nodes starting from id 0x10.
> > > -device vfio-pci-nohotplug,host=<pci-bdf>,pxm-ns=0x10,pxm-nc=8
> > > 
> > > Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> > > ---
> > >  hw/vfio/pci.c               | 144 ++++++++++++++++++++++++++++++++++++
> > >  hw/vfio/pci.h               |   2 +
> > >  include/hw/pci/pci_device.h |   3 +
> > >  3 files changed, 149 insertions(+)
> > > 
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index a205c6b113..cc0c516161 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -42,6 +42,8 @@
> > >  #include "qapi/error.h"
> > >  #include "migration/blocker.h"
> > >  #include "migration/qemu-file.h"
> > > +#include "qapi/visitor.h"
> > > +#include "include/hw/boards.h"
> > >  
> > >  #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
> > >  
> > > @@ -2955,6 +2957,22 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
> > >      }
> > >  }
> > >  
> > > +static void vfio_pci_get_dev_mem_pxm_start(Object *obj, Visitor *v,
> > > +                                           const char *name,
> > > +                                           void *opaque, Error **errp)
> > > +{
> > > +    uint64_t pxm_start = (uintptr_t) opaque;
> > > +    visit_type_uint64(v, name, &pxm_start, errp);
> > > +}
> > > +
> > > +static void vfio_pci_get_dev_mem_pxm_count(Object *obj, Visitor *v,
> > > +                                           const char *name,
> > > +                                           void *opaque, Error **errp)
> > > +{
> > > +    uint64_t pxm_count = (uintptr_t) opaque;
> > > +    visit_type_uint64(v, name, &pxm_count, errp);
> > > +}
> > > +
> > >  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> > >  {
> > >      Error *err = NULL;
> > > @@ -2974,6 +2992,125 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> > >      vdev->req_enabled = false;
> > >  }
> > >  
> > > +static int validate_dev_numa(uint32_t dev_node_start, uint32_t num_nodes)
> > > +{
> > > +    MachineState *ms = MACHINE(qdev_get_machine());
> > > +    unsigned int i;
> > > +
> > > +    if (num_nodes >= MAX_NODES) {
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    for (i = 0; i < num_nodes; i++) {
> > > +        if (ms->numa_state->nodes[dev_node_start + i].present) {
> > > +            return -EBUSY;
> > > +        }
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int mark_dev_node_present(uint32_t dev_node_start, uint32_t num_nodes)
> > > +{
> > > +    MachineState *ms = MACHINE(qdev_get_machine());
> > > +    unsigned int i;
> > > +
> > > +    for (i = 0; i < num_nodes; i++) {
> > > +        ms->numa_state->nodes[dev_node_start + i].present = true;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +
> > > +static bool vfio_pci_read_cohmem_support_sysfs(VFIODevice *vdev)
> > > +{
> > > +    gchar *contents = NULL;
> > > +    gsize length;
> > > +    char *path;
> > > +    bool ret = false;
> > > +    uint32_t supported;
> > > +
> > > +    path = g_strdup_printf("%s/coherent_mem", vdev->sysfsdev);    
> > 
> > If someone has asked for it, why should we care if they specify it on a
> > device where it doesn't do anything useful?  This to me feels like something
> > to check at a higher level of the stack.
> >   
> > > +    if (g_file_get_contents(path, &contents, &length, NULL) && length > 0) {
> > > +        if ((sscanf(contents, "%u", &supported) == 1) && supported) {
> > > +            ret = true;
> > > +        }
> > > +    }
> > > +
> > > +    if (length) {
> > > +        g_free(contents);    
> > 
> > g_autofree will clean this up for you I think
> >   
> > > +    }
> > > +    g_free(path);    
> > 
> > and this.
> >   
> > > +
> > > +    return ret;
> > > +}
> > > +
> > > +static int vfio_pci_dev_mem_probe(VFIOPCIDevice *vPciDev,
> > > +                                         Error **errp)
> > > +{
> > > +    Object *obj = NULL;
> > > +    VFIODevice *vdev = &vPciDev->vbasedev;
> > > +    MachineState *ms = MACHINE(qdev_get_machine());
> > > +    int ret = 0;
> > > +    uint32_t dev_node_start = vPciDev->dev_node_start;
> > > +    uint32_t dev_node_count = vPciDev->dev_nodes;
> > > +
> > > +    if (!vdev->sysfsdev || !vfio_pci_read_cohmem_support_sysfs(vdev)) {
> > > +        ret = -ENODEV;    
> > return -ENODEV; 
> > 
> > and similar in all the other cases as no cleanup to do.
> >   
> > > +        goto done;
> > > +    }
> > > +
> > > +    if (vdev->type == VFIO_DEVICE_TYPE_PCI) {    
> > 
> > nicer to handle one condition at a time.
> > 
> >     if (vdev->type != VFIO_DEVICE_TYPE_PCI) {
> >         return -EINVAL;
> >     }
> > 
> >     obj = vfio_pci_get_object(vdev);
> > this can't fail
> > 
> > Also get rid of assigning it to NULL above.
> > 
> >    if (DEVICE_CLASS(object...)) {
> >       return -EINVAL;
> >    }
> > 
> >   
> > > +        obj = vfio_pci_get_object(vdev);
> > > +    }
> > > +
> > > +    /* Since this device creates new NUMA node, hotplug is not supported. */
> > > +    if (!obj || DEVICE_CLASS(object_get_class(obj))->hotpluggable) {
> > > +        ret = -EINVAL;
> > > +        goto done;
> > > +    }
> > > +
> > > +    /*
> > > +     * This device has memory that is coherently accessible from the CPU.
> > > +     * The memory can be represented seperate memory-only NUMA nodes.
> > > +     */
> > > +    vPciDev->pdev.has_coherent_memory = true;
> > > +
> > > +    /*
> > > +     * The device can create several NUMA nodes with consecutive IDs
> > > +     * from dev_node_start to dev_node_start + dev_node_count.
> > > +     * Verify
> > > +     * - whether any node ID is occupied in the desired range.
> > > +     * - Node ID is not crossing MAX_NODE.
> > > +     */
> > > +    ret = validate_dev_numa(dev_node_start, dev_node_count);
> > > +    if (ret) {
> > > +        goto done;    
> >         return ret;
> >   
> > > +    }
> > > +
> > > +    /* Reserve the node by marking as present */
> > > +    mark_dev_node_present(dev_node_start, dev_node_count);
> > > +
> > > +    /*
> > > +     * To have multiple unique nodes in the VM, a series of PXM nodes are
> > > +     * required to be added to VM's SRAT. Send the information about the
> > > +     * starting node ID and the node count to the ACPI builder code.
> > > +     */
> > > +    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_start", "uint64",
> > > +                        vfio_pci_get_dev_mem_pxm_start, NULL, NULL,
> > > +                        (void *) (uintptr_t) dev_node_start);
> > > +
> > > +    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_count", "uint64",
> > > +                        vfio_pci_get_dev_mem_pxm_count, NULL, NULL,
> > > +                        (void *) (uintptr_t) dev_node_count);
> > > +
> > > +    ms->numa_state->num_nodes += dev_node_count;
> > > +
> > > +done:
> > > +    return ret;
> > > +}
> > > +
> > >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> > >  {
> > >      VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> > > @@ -3291,6 +3428,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> > >          }
> > >      }
> > >  
> > > +    ret = vfio_pci_dev_mem_probe(vdev, errp);
> > > +    if (ret && ret != -ENODEV) {
> > > +        error_report("Failed to setup device memory with error %d", ret);
> > > +    }
> > > +
> > >      vfio_register_err_notifier(vdev);
> > >      vfio_register_req_notifier(vdev);
> > >      vfio_setup_resetfn_quirk(vdev);
> > > @@ -3454,6 +3596,8 @@ static Property vfio_pci_dev_properties[] = {
> > >      DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
> > >                         sub_device_id, PCI_ANY_ID),
> > >      DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
> > > +    DEFINE_PROP_UINT32("pxm-ns", VFIOPCIDevice, dev_node_start, 0),
> > > +    DEFINE_PROP_UINT32("pxm-nc", VFIOPCIDevice, dev_nodes, 0),
> > >      DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
> > >                                     nv_gpudirect_clique,
> > >                                     qdev_prop_nv_gpudirect_clique, uint8_t),
> > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > > index a2771b9ff3..eef5ddfd06 100644
> > > --- a/hw/vfio/pci.h
> > > +++ b/hw/vfio/pci.h
> > > @@ -158,6 +158,8 @@ struct VFIOPCIDevice {
> > >      uint32_t display_yres;
> > >      int32_t bootindex;
> > >      uint32_t igd_gms;
> > > +    uint32_t dev_node_start;
> > > +    uint32_t dev_nodes;
> > >      OffAutoPCIBAR msix_relo;
> > >      uint8_t pm_cap;
> > >      uint8_t nv_gpudirect_clique;
> > > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > > index d3dd0f64b2..aacd2279ae 100644
> > > --- a/include/hw/pci/pci_device.h
> > > +++ b/include/hw/pci/pci_device.h
> > > @@ -157,6 +157,9 @@ struct PCIDevice {
> > >      MSIVectorReleaseNotifier msix_vector_release_notifier;
> > >      MSIVectorPollNotifier msix_vector_poll_notifier;
> > >  
> > > +    /* GPU coherent memory */
> > > +    bool has_coherent_memory;
> > > +
> > >      /* ID of standby device in net_failover pair */
> > >      char *failover_pair_id;
> > >      uint32_t acpi_index;    
> > 
> >   
>
diff mbox series

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a205c6b113..cc0c516161 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -42,6 +42,8 @@ 
 #include "qapi/error.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
+#include "qapi/visitor.h"
+#include "include/hw/boards.h"
 
 #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
 
@@ -2955,6 +2957,22 @@  static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
     }
 }
 
+static void vfio_pci_get_dev_mem_pxm_start(Object *obj, Visitor *v,
+                                           const char *name,
+                                           void *opaque, Error **errp)
+{
+    uint64_t pxm_start = (uintptr_t) opaque;
+    visit_type_uint64(v, name, &pxm_start, errp);
+}
+
+static void vfio_pci_get_dev_mem_pxm_count(Object *obj, Visitor *v,
+                                           const char *name,
+                                           void *opaque, Error **errp)
+{
+    uint64_t pxm_count = (uintptr_t) opaque;
+    visit_type_uint64(v, name, &pxm_count, errp);
+}
+
 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
 {
     Error *err = NULL;
@@ -2974,6 +2992,125 @@  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static int validate_dev_numa(uint32_t dev_node_start, uint32_t num_nodes)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    unsigned int i;
+
+    if (num_nodes >= MAX_NODES) {
+        return -EINVAL;
+    }
+
+    for (i = 0; i < num_nodes; i++) {
+        if (ms->numa_state->nodes[dev_node_start + i].present) {
+            return -EBUSY;
+        }
+    }
+
+    return 0;
+}
+
+static int mark_dev_node_present(uint32_t dev_node_start, uint32_t num_nodes)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    unsigned int i;
+
+    for (i = 0; i < num_nodes; i++) {
+        ms->numa_state->nodes[dev_node_start + i].present = true;
+    }
+
+    return 0;
+}
+
+
+static bool vfio_pci_read_cohmem_support_sysfs(VFIODevice *vdev)
+{
+    gchar *contents = NULL;
+    gsize length;
+    char *path;
+    bool ret = false;
+    uint32_t supported;
+
+    path = g_strdup_printf("%s/coherent_mem", vdev->sysfsdev);
+    if (g_file_get_contents(path, &contents, &length, NULL) && length > 0) {
+        if ((sscanf(contents, "%u", &supported) == 1) && supported) {
+            ret = true;
+        }
+    }
+
+    if (length) {
+        g_free(contents);
+    }
+    g_free(path);
+
+    return ret;
+}
+
+static int vfio_pci_dev_mem_probe(VFIOPCIDevice *vPciDev,
+                                         Error **errp)
+{
+    Object *obj = NULL;
+    VFIODevice *vdev = &vPciDev->vbasedev;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    int ret = 0;
+    uint32_t dev_node_start = vPciDev->dev_node_start;
+    uint32_t dev_node_count = vPciDev->dev_nodes;
+
+    if (!vdev->sysfsdev || !vfio_pci_read_cohmem_support_sysfs(vdev)) {
+        ret = -ENODEV;
+        goto done;
+    }
+
+    if (vdev->type == VFIO_DEVICE_TYPE_PCI) {
+        obj = vfio_pci_get_object(vdev);
+    }
+
+    /* Since this device creates new NUMA node, hotplug is not supported. */
+    if (!obj || DEVICE_CLASS(object_get_class(obj))->hotpluggable) {
+        ret = -EINVAL;
+        goto done;
+    }
+
+    /*
+     * This device has memory that is coherently accessible from the CPU.
+     * The memory can be represented seperate memory-only NUMA nodes.
+     */
+    vPciDev->pdev.has_coherent_memory = true;
+
+    /*
+     * The device can create several NUMA nodes with consecutive IDs
+     * from dev_node_start to dev_node_start + dev_node_count.
+     * Verify
+     * - whether any node ID is occupied in the desired range.
+     * - Node ID is not crossing MAX_NODE.
+     */
+    ret = validate_dev_numa(dev_node_start, dev_node_count);
+    if (ret) {
+        goto done;
+    }
+
+    /* Reserve the node by marking as present */
+    mark_dev_node_present(dev_node_start, dev_node_count);
+
+    /*
+     * To have multiple unique nodes in the VM, a series of PXM nodes are
+     * required to be added to VM's SRAT. Send the information about the
+     * starting node ID and the node count to the ACPI builder code.
+     */
+    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_start", "uint64",
+                        vfio_pci_get_dev_mem_pxm_start, NULL, NULL,
+                        (void *) (uintptr_t) dev_node_start);
+
+    object_property_add(OBJECT(vPciDev), "dev_mem_pxm_count", "uint64",
+                        vfio_pci_get_dev_mem_pxm_count, NULL, NULL,
+                        (void *) (uintptr_t) dev_node_count);
+
+    ms->numa_state->num_nodes += dev_node_count;
+
+done:
+    return ret;
+}
+
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
@@ -3291,6 +3428,11 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
         }
     }
 
+    ret = vfio_pci_dev_mem_probe(vdev, errp);
+    if (ret && ret != -ENODEV) {
+        error_report("Failed to setup device memory with error %d", ret);
+    }
+
     vfio_register_err_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
@@ -3454,6 +3596,8 @@  static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                        sub_device_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
+    DEFINE_PROP_UINT32("pxm-ns", VFIOPCIDevice, dev_node_start, 0),
+    DEFINE_PROP_UINT32("pxm-nc", VFIOPCIDevice, dev_nodes, 0),
     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
                                    nv_gpudirect_clique,
                                    qdev_prop_nv_gpudirect_clique, uint8_t),
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a2771b9ff3..eef5ddfd06 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -158,6 +158,8 @@  struct VFIOPCIDevice {
     uint32_t display_yres;
     int32_t bootindex;
     uint32_t igd_gms;
+    uint32_t dev_node_start;
+    uint32_t dev_nodes;
     OffAutoPCIBAR msix_relo;
     uint8_t pm_cap;
     uint8_t nv_gpudirect_clique;
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index d3dd0f64b2..aacd2279ae 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -157,6 +157,9 @@  struct PCIDevice {
     MSIVectorReleaseNotifier msix_vector_release_notifier;
     MSIVectorPollNotifier msix_vector_poll_notifier;
 
+    /* GPU coherent memory */
+    bool has_coherent_memory;
+
     /* ID of standby device in net_failover pair */
     char *failover_pair_id;
     uint32_t acpi_index;