Message ID | 1458554926-7844-7-git-send-email-caoj.fnst@cn.fujitsu.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, 21 Mar 2016 18:08:42 +0800 Cao jin <caoj.fnst@cn.fujitsu.com> wrote: > From: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > when boot up a VM that assigning vfio devices with aer enabled, we > must check the vfio device whether support host bus reset. because > when one error occur. OS driver always recover the device by do a > bus reset, in order to recover the vfio device, qemu must able to do > a host bus reset to recover the device to default status. and for all > affected devices by the bus reset. we must check them whether all > are assigned to the VM and on the same virtual bus. meanwhile, for > simply done, the devices which don't affected by the host bus reset > are not allowed to assign to the same virtual bus. Rewording/expansion suggestion: When assigning a vfio device with AER enabled, we must check whether the device supports a host bus reset (ie. hot reset) as this may be used by the guest OS in order to recover the device from an AER error. QEMU must therefore have the ability to perform a physical host bus reset using the existing vfio APIs in response to a virtual bus reset in the VM. A physical bus reset affects all of the devices on the host bus, therefore we place a few simplifying configuration restriction on the VM: - All physical devices affected by a bus reset must be assigned to the VM with AER enabled on each and be configured on the same virtual bus in the VM. - No devices unaffected by the bus reset, be they physical, emulated, or paravirtual may be configured on the same virtual bus as a device supporting AER signaling through vfio. In other words users wishing to enable AER on a multifunction device need to assign all functions of the device to the same virtual bus and enable AER support for each device. The easiest way to accomplish this is to identity map the physical functions to virtual functions with multifunction enabled on the virtual device. > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > --- > hw/vfio/pci.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > hw/vfio/pci.h | 1 + > 2 files changed, 205 insertions(+), 1 deletion(-) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 8842b7f..dce3b6d 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -38,6 +38,10 @@ > static void vfio_disable_interrupts(VFIOPCIDevice *vdev); > static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); > > +#define HOST_CMP_FUNC_MASK (1 << 0) > +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, > + uint8_t mask); > + This would of course go back to a _slot version of the function as you've had in previous versions. > /* > * Disabling BAR mmaping can be slow, but toggling it around INTx can > * also be a huge overhead. We try to get the best of both worlds by > @@ -1877,6 +1881,185 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) > return 0; > } > > +static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) > +{ > + PCIBus *bus = vdev->pdev.bus; > + struct vfio_pci_hot_reset_info *info = NULL; > + struct vfio_pci_dependent_device *devices; > + VFIOGroup *group; > + int ret, i, devfn; > + > + ret = vfio_get_hot_reset_info(vdev, &info); > + if (ret) { > + error_setg(errp, "vfio: Cannot enable AER for device %s," > + " device does not support hot reset.", > + vdev->vbasedev.name); > + return; > + } > + > + /* List all affected devices by bus reset */ > + devices = &info->devices[0]; > + > + /* Verify that we have all the groups required */ > + for (i = 0; i < info->count; i++) { > + PCIHostDeviceAddress host; > + VFIOPCIDevice *tmp; > + VFIODevice *vbasedev_iter; > + bool found = false; > + > + host.domain = devices[i].segment; > + host.bus = devices[i].bus; > + host.slot = PCI_SLOT(devices[i].devfn); > + host.function = PCI_FUNC(devices[i].devfn); > + > + /* Skip the current device */ > + if (vfio_pci_host_match(&host, vdev->vbasedev.name, 0)) { > + continue; > + } > + > + /* Ensure we own the group of the affected device */ > + QLIST_FOREACH(group, &vfio_group_list, next) { > + if (group->groupid == devices[i].group_id) { > + break; > + } > + } > + > + if (!group) { > + error_setg(errp, "vfio: Cannot enable AER for device %s, " > + "depends on group %d which is not owned.", > + vdev->vbasedev.name, devices[i].group_id); > + goto out; > + } > + > + /* Ensure affected devices for reset on the same bus */ > + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { > + if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { > + continue; > + } > + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); > + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { > + /* > + * AER errors may be broadcast to all functions of a multi- > + * function endpoint. If any of those sibling functions are > + * also assigned, they need to have AER enabled or else an > + * error may continue to cause a vm_stop condition. IOW, > + * AER setup of this function would be pointless. > + */ > + if (vfio_pci_host_match(&host, vdev->vbasedev.name, > + HOST_CMP_FUNC_MASK) && > + !(tmp->features & VFIO_FEATURE_ENABLE_AER)) { > + error_setg(errp, "vfio: Cannot enable AER for device %s, on same slot" > + " the dependent device %s which does not enable AER.", > + vdev->vbasedev.name, tmp->vbasedev.name); > + goto out; > + } > + > + if (tmp->pdev.bus != bus) { > + error_setg(errp, "vfio: Cannot enable AER for device %s, " > + "the dependent device %s is not on the same bus", > + vdev->vbasedev.name, tmp->vbasedev.name); > + goto out; > + } > + found = true; > + break; > + } > + } > + > + /* Ensure all affected devices assigned to VM */ > + if (!found) { > + error_setg(errp, "vfio: Cannot enable AER for device %s, " > + "the dependent device %04x:%02x:%02x.%x " > + "is not assigned to VM.", > + vdev->vbasedev.name, host.domain, host.bus, > + host.slot, host.function); > + goto out; > + } > + } > + > + /* > + * To make things simply, functions must are combined > + * in the same way as on the host. no other irrelative > + * functions on the virtual same bus was required. > + */ Suggestion: The above code verified that all devices affected by a bus reset exist on the same bus in the VM. To further simplify, we also require that there are no additional devices beyond those existing on the VM bus. > + for (devfn = 0; devfn < 8; devfn++) { What about ARI? Don't we potentially need to walk up to 255 here if the parent bridge device has ARI enabled? Should we always walk up through 255 to prevent users from using non-zero slot numbers for other devices? > + VFIOPCIDevice *tmp; > + PCIDevice *dev; > + bool found = false; > + > + dev = pci_find_device(bus, pci_bus_num(bus), > + PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn)); > + > + if (!dev) { > + continue; > + } > + > + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { > + error_setg(errp, "vfio: Cannot enable AER for device %s, " > + "only support all dependent devices on the same bus, " > + "%s is not one of the dependent devices.", > + vdev->vbasedev.name, dev->name); "vfio: Cannot enable AER for device %s, device %s cannot be configured on the same virtual bus" Is dev->name sufficiently descriptive? Perhaps print the conflicting PCI address as well. > + goto out; > + } > + > + tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev); > + for (i = 0; i < info->count; i++) { > + PCIHostDeviceAddress host; > + > + host.domain = devices[i].segment; > + host.bus = devices[i].bus; > + host.slot = PCI_SLOT(devices[i].devfn); > + host.function = PCI_FUNC(devices[i].devfn); > + > + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { > + found = true; > + break; > + } > + } > + > + if (!found) { > + error_setg(errp, "vfio: Cannot enable AER for device %s, " > + "only support all dependent devices on the same bus, " > + "%s is not one of the dependent devices.", > + vdev->vbasedev.name, tmp->vbasedev.name); Update this comment similar to above too. > + goto out; > + } > + } > + > +out: > + g_free(info); > + return; > +} > + > +static void vfio_aer_check_host_bus_reset(Error **errp) > +{ > + VFIOGroup *group; > + VFIODevice *vbasedev; > + VFIOPCIDevice *vdev; > + Error *local_err = NULL; > + > + /* Check All vfio-pci devices if have bus reset capability */ > + QLIST_FOREACH(group, &vfio_group_list, next) { > + QLIST_FOREACH(vbasedev, &group->device_list, next) { > + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { > + continue; > + } > + vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > + if (vdev->features & VFIO_FEATURE_ENABLE_AER) { > + if (!pci_get_function_0(&vdev->pdev)) { > + continue; > + } Is this function_0 test really necessary? It seems like it's only relevant to the hotplug case, in which case it should be included in the next patch, but even there I don't see a path where we'll get here when function 0 is not populated. Is this perhaps leftover from previous versions? It actually seems like it's opening a gap where function 0 could be a non-vfio-pci device (or non-existent) and bypass our validation check here. > + vfio_check_hot_bus_reset(vdev, &local_err); > + if (local_err) { > + error_propagate(errp, local_err); > + return; > + } > + } > + } > + } > + > + return; > +} > + > static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver, > int pos, uint16_t size) > { > @@ -2060,7 +2243,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) > vfio_intx_enable(vdev); > } > > -#define HOST_CMP_FUNC_MASK (1 << 0) > static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, > uint8_t mask) > { > @@ -2587,6 +2769,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) > vdev->req_enabled = false; > } > > +static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused) > +{ > + Error *local_err = NULL; > + > + vfio_aer_check_host_bus_reset(&local_err); > + if (local_err) { > + fprintf(stderr, "%s\n", error_get_pretty(local_err)); > + error_free(local_err); > + exit(1); > + } > +} > + > +static Notifier machine_notifier = { > + .notify = vfio_pci_machine_done_notify, > +}; > + > static int vfio_initfn(PCIDevice *pdev) > { > VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); > @@ -2932,6 +3130,11 @@ static const TypeInfo vfio_pci_dev_info = { > static void register_vfio_pci_dev_type(void) > { > type_register_static(&vfio_pci_dev_info); > + /* > + * Register notifier when machine init is done, since we need > + * check the configration manner after all vfio device are inited. > + */ Suggestion: The AER configuration may depend on multiple devices, so we cannot validate consistency after each device is initialized. We can only depend on function initialization order (function 0 last) for hotplug devices, therefore a machine-init-done notifier is used to validate the configuration after all cold-plug devices are processed. > + qemu_add_machine_init_done_notifier(&machine_notifier); > } > > type_init(register_vfio_pci_dev_type) > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index 7b3924e..db7c6d5 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -15,6 +15,7 @@ > #include "qemu-common.h" > #include "exec/memory.h" > #include "hw/pci/pci.h" > +#include "hw/pci/pci_bus.h" > #include "hw/pci/pci_bridge.h" > #include "hw/vfio/vfio-common.h" > #include "qemu/event_notifier.h"
On 03/22/2016 05:40 AM, Alex Williamson wrote: > On Mon, 21 Mar 2016 18:08:42 +0800 > Cao jin <caoj.fnst@cn.fujitsu.com> wrote: > >> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com> >> >> when boot up a VM that assigning vfio devices with aer enabled, we >> must check the vfio device whether support host bus reset. because >> when one error occur. OS driver always recover the device by do a >> bus reset, in order to recover the vfio device, qemu must able to do >> a host bus reset to recover the device to default status. and for all >> affected devices by the bus reset. we must check them whether all >> are assigned to the VM and on the same virtual bus. meanwhile, for >> simply done, the devices which don't affected by the host bus reset >> are not allowed to assign to the same virtual bus. > Rewording/expansion suggestion: > > When assigning a vfio device with AER enabled, we must check whether > the device supports a host bus reset (ie. hot reset) as this may be > used by the guest OS in order to recover the device from an AER > error. QEMU must therefore have the ability to perform a physical > host bus reset using the existing vfio APIs in response to a virtual > bus reset in the VM. A physical bus reset affects all of the devices > on the host bus, therefore we place a few simplifying configuration > restriction on the VM: > > - All physical devices affected by a bus reset must be assigned to > the VM with AER enabled on each and be configured on the same > virtual bus in the VM. > > - No devices unaffected by the bus reset, be they physical, emulated, > or paravirtual may be configured on the same virtual bus as a > device supporting AER signaling through vfio. > > In other words users wishing to enable AER on a multifunction device > need to assign all functions of the device to the same virtual bus > and enable AER support for each device. The easiest way to > accomplish this is to identity map the physical functions to virtual > functions with multifunction enabled on the virtual device. nice description. thanks :) > >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> >> --- >> hw/vfio/pci.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- >> hw/vfio/pci.h | 1 + >> 2 files changed, 205 insertions(+), 1 deletion(-) >> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index 8842b7f..dce3b6d 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -38,6 +38,10 @@ >> static void vfio_disable_interrupts(VFIOPCIDevice *vdev); >> static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); >> >> +#define HOST_CMP_FUNC_MASK (1 << 0) >> +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, >> + uint8_t mask); >> + > This would of course go back to a _slot version of the function as > you've had in previous versions. > >> /* >> * Disabling BAR mmaping can be slow, but toggling it around INTx can >> * also be a huge overhead. We try to get the best of both worlds by >> @@ -1877,6 +1881,185 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) >> return 0; >> } >> >> +static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) >> +{ >> + PCIBus *bus = vdev->pdev.bus; >> + struct vfio_pci_hot_reset_info *info = NULL; >> + struct vfio_pci_dependent_device *devices; >> + VFIOGroup *group; >> + int ret, i, devfn; >> + >> + ret = vfio_get_hot_reset_info(vdev, &info); >> + if (ret) { >> + error_setg(errp, "vfio: Cannot enable AER for device %s," >> + " device does not support hot reset.", >> + vdev->vbasedev.name); >> + return; >> + } >> + >> + /* List all affected devices by bus reset */ >> + devices = &info->devices[0]; >> + >> + /* Verify that we have all the groups required */ >> + for (i = 0; i < info->count; i++) { >> + PCIHostDeviceAddress host; >> + VFIOPCIDevice *tmp; >> + VFIODevice *vbasedev_iter; >> + bool found = false; >> + >> + host.domain = devices[i].segment; >> + host.bus = devices[i].bus; >> + host.slot = PCI_SLOT(devices[i].devfn); >> + host.function = PCI_FUNC(devices[i].devfn); >> + >> + /* Skip the current device */ >> + if (vfio_pci_host_match(&host, vdev->vbasedev.name, 0)) { >> + continue; >> + } >> + >> + /* Ensure we own the group of the affected device */ >> + QLIST_FOREACH(group, &vfio_group_list, next) { >> + if (group->groupid == devices[i].group_id) { >> + break; >> + } >> + } >> + >> + if (!group) { >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " >> + "depends on group %d which is not owned.", >> + vdev->vbasedev.name, devices[i].group_id); >> + goto out; >> + } >> + >> + /* Ensure affected devices for reset on the same bus */ >> + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { >> + if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { >> + continue; >> + } >> + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); >> + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { >> + /* >> + * AER errors may be broadcast to all functions of a multi- >> + * function endpoint. If any of those sibling functions are >> + * also assigned, they need to have AER enabled or else an >> + * error may continue to cause a vm_stop condition. IOW, >> + * AER setup of this function would be pointless. >> + */ >> + if (vfio_pci_host_match(&host, vdev->vbasedev.name, >> + HOST_CMP_FUNC_MASK) && >> + !(tmp->features & VFIO_FEATURE_ENABLE_AER)) { >> + error_setg(errp, "vfio: Cannot enable AER for device %s, on same slot" >> + " the dependent device %s which does not enable AER.", >> + vdev->vbasedev.name, tmp->vbasedev.name); >> + goto out; >> + } >> + >> + if (tmp->pdev.bus != bus) { >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " >> + "the dependent device %s is not on the same bus", >> + vdev->vbasedev.name, tmp->vbasedev.name); >> + goto out; >> + } >> + found = true; >> + break; >> + } >> + } >> + >> + /* Ensure all affected devices assigned to VM */ >> + if (!found) { >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " >> + "the dependent device %04x:%02x:%02x.%x " >> + "is not assigned to VM.", >> + vdev->vbasedev.name, host.domain, host.bus, >> + host.slot, host.function); >> + goto out; >> + } >> + } >> + >> + /* >> + * To make things simply, functions must are combined >> + * in the same way as on the host. no other irrelative >> + * functions on the virtual same bus was required. >> + */ > Suggestion: > > The above code verified that all devices affected by a bus reset > exist on the same bus in the VM. To further simplify, we also > require that there are no additional devices beyond those existing on > the VM bus. > >> + for (devfn = 0; devfn < 8; devfn++) { > What about ARI? Don't we potentially need to walk up to 255 here if > the parent bridge device has ARI enabled? Should we always walk up > through 255 to prevent users from using non-zero slot numbers for other > devices? can we use the function pcie_cap_is_arifwd_enabled to check the bridge whether enable ARI?, if don't enable, we can simple scan the slot 0, otherwise, we should walk up to 255. >> + VFIOPCIDevice *tmp; >> + PCIDevice *dev; >> + bool found = false; >> + >> + dev = pci_find_device(bus, pci_bus_num(bus), >> + PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn)); >> + >> + if (!dev) { >> + continue; >> + } >> + >> + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " >> + "only support all dependent devices on the same bus, " >> + "%s is not one of the dependent devices.", >> + vdev->vbasedev.name, dev->name); > "vfio: Cannot enable AER for device %s, device %s cannot be configured > on the same virtual bus" > > Is dev->name sufficiently descriptive? Perhaps print the conflicting > PCI address as well. > >> + goto out; >> + } >> + >> + tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev); >> + for (i = 0; i < info->count; i++) { >> + PCIHostDeviceAddress host; >> + >> + host.domain = devices[i].segment; >> + host.bus = devices[i].bus; >> + host.slot = PCI_SLOT(devices[i].devfn); >> + host.function = PCI_FUNC(devices[i].devfn); >> + >> + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { >> + found = true; >> + break; >> + } >> + } >> + >> + if (!found) { >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " >> + "only support all dependent devices on the same bus, " >> + "%s is not one of the dependent devices.", >> + vdev->vbasedev.name, tmp->vbasedev.name); > Update this comment similar to above too. > >> + goto out; >> + } >> + } >> + >> +out: >> + g_free(info); >> + return; >> +} >> + >> +static void vfio_aer_check_host_bus_reset(Error **errp) >> +{ >> + VFIOGroup *group; >> + VFIODevice *vbasedev; >> + VFIOPCIDevice *vdev; >> + Error *local_err = NULL; >> + >> + /* Check All vfio-pci devices if have bus reset capability */ >> + QLIST_FOREACH(group, &vfio_group_list, next) { >> + QLIST_FOREACH(vbasedev, &group->device_list, next) { >> + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { >> + continue; >> + } >> + vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >> + if (vdev->features & VFIO_FEATURE_ENABLE_AER) { >> + if (!pci_get_function_0(&vdev->pdev)) { >> + continue; >> + } > Is this function_0 test really necessary? It seems like it's only > relevant to the hotplug case, in which case it should be included in > the next patch, but even there I don't see a path where we'll get here > when function 0 is not populated. Is this perhaps leftover from > previous versions? It actually seems like it's opening a gap where > function 0 could be a non-vfio-pci device (or non-existent) and bypass > our validation check here. yes, you are right, as for adding the function_0 test, I just thought that if we boot up VM without function_0 in command line , whether we should test the other not_function_0 vfio device with aer on the bus support hot reset. if we don't add the function 0 test, we probably can't boot up the VM. because the affected function 0 is not added yet. I never thought that this open a gap that the function 0 could be a non-vfio-pci device, thanks for pointing it out. for hotplug case, we also potential hotplug a not-vfio-pci device and bypass our check. so I think the check still need to place in device_realize like previous version? Thanks, Chen > >> + vfio_check_hot_bus_reset(vdev, &local_err); >> + if (local_err) { >> + error_propagate(errp, local_err); >> + return; >> + } >> + } >> + } >> + } >> + >> + return; >> +} >> + >> static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver, >> int pos, uint16_t size) >> { >> @@ -2060,7 +2243,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) >> vfio_intx_enable(vdev); >> } >> >> -#define HOST_CMP_FUNC_MASK (1 << 0) >> static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, >> uint8_t mask) >> { >> @@ -2587,6 +2769,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) >> vdev->req_enabled = false; >> } >> >> +static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused) >> +{ >> + Error *local_err = NULL; >> + >> + vfio_aer_check_host_bus_reset(&local_err); >> + if (local_err) { >> + fprintf(stderr, "%s\n", error_get_pretty(local_err)); >> + error_free(local_err); >> + exit(1); >> + } >> +} >> + >> +static Notifier machine_notifier = { >> + .notify = vfio_pci_machine_done_notify, >> +}; >> + >> static int vfio_initfn(PCIDevice *pdev) >> { >> VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); >> @@ -2932,6 +3130,11 @@ static const TypeInfo vfio_pci_dev_info = { >> static void register_vfio_pci_dev_type(void) >> { >> type_register_static(&vfio_pci_dev_info); >> + /* >> + * Register notifier when machine init is done, since we need >> + * check the configration manner after all vfio device are inited. >> + */ > Suggestion: > > The AER configuration may depend on multiple devices, so we cannot > validate consistency after each device is initialized. We can only > depend on function initialization order (function 0 last) for hotplug > devices, therefore a machine-init-done notifier is used to validate > the configuration after all cold-plug devices are processed. > >> + qemu_add_machine_init_done_notifier(&machine_notifier); >> } >> >> type_init(register_vfio_pci_dev_type) >> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h >> index 7b3924e..db7c6d5 100644 >> --- a/hw/vfio/pci.h >> +++ b/hw/vfio/pci.h >> @@ -15,6 +15,7 @@ >> #include "qemu-common.h" >> #include "exec/memory.h" >> #include "hw/pci/pci.h" >> +#include "hw/pci/pci_bus.h" >> #include "hw/pci/pci_bridge.h" >> #include "hw/vfio/vfio-common.h" >> #include "qemu/event_notifier.h" > > > . >
On Tue, 22 Mar 2016 18:02:25 +0800 Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote: > On 03/22/2016 05:40 AM, Alex Williamson wrote: > > On Mon, 21 Mar 2016 18:08:42 +0800 > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote: > > > >> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > >> > >> when boot up a VM that assigning vfio devices with aer enabled, we > >> must check the vfio device whether support host bus reset. because > >> when one error occur. OS driver always recover the device by do a > >> bus reset, in order to recover the vfio device, qemu must able to do > >> a host bus reset to recover the device to default status. and for all > >> affected devices by the bus reset. we must check them whether all > >> are assigned to the VM and on the same virtual bus. meanwhile, for > >> simply done, the devices which don't affected by the host bus reset > >> are not allowed to assign to the same virtual bus. > > Rewording/expansion suggestion: > > > > When assigning a vfio device with AER enabled, we must check whether > > the device supports a host bus reset (ie. hot reset) as this may be > > used by the guest OS in order to recover the device from an AER > > error. QEMU must therefore have the ability to perform a physical > > host bus reset using the existing vfio APIs in response to a virtual > > bus reset in the VM. A physical bus reset affects all of the devices > > on the host bus, therefore we place a few simplifying configuration > > restriction on the VM: > > > > - All physical devices affected by a bus reset must be assigned to > > the VM with AER enabled on each and be configured on the same > > virtual bus in the VM. > > > > - No devices unaffected by the bus reset, be they physical, emulated, > > or paravirtual may be configured on the same virtual bus as a > > device supporting AER signaling through vfio. > > > > In other words users wishing to enable AER on a multifunction device > > need to assign all functions of the device to the same virtual bus > > and enable AER support for each device. The easiest way to > > accomplish this is to identity map the physical functions to virtual > > functions with multifunction enabled on the virtual device. > > nice description. thanks :) > > > >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > >> --- > >> hw/vfio/pci.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > >> hw/vfio/pci.h | 1 + > >> 2 files changed, 205 insertions(+), 1 deletion(-) > >> > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > >> index 8842b7f..dce3b6d 100644 > >> --- a/hw/vfio/pci.c > >> +++ b/hw/vfio/pci.c > >> @@ -38,6 +38,10 @@ > >> static void vfio_disable_interrupts(VFIOPCIDevice *vdev); > >> static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); > >> > >> +#define HOST_CMP_FUNC_MASK (1 << 0) > >> +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, > >> + uint8_t mask); > >> + > > This would of course go back to a _slot version of the function as > > you've had in previous versions. > > > >> /* > >> * Disabling BAR mmaping can be slow, but toggling it around INTx can > >> * also be a huge overhead. We try to get the best of both worlds by > >> @@ -1877,6 +1881,185 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) > >> return 0; > >> } > >> > >> +static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) > >> +{ > >> + PCIBus *bus = vdev->pdev.bus; > >> + struct vfio_pci_hot_reset_info *info = NULL; > >> + struct vfio_pci_dependent_device *devices; > >> + VFIOGroup *group; > >> + int ret, i, devfn; > >> + > >> + ret = vfio_get_hot_reset_info(vdev, &info); > >> + if (ret) { > >> + error_setg(errp, "vfio: Cannot enable AER for device %s," > >> + " device does not support hot reset.", > >> + vdev->vbasedev.name); > >> + return; > >> + } > >> + > >> + /* List all affected devices by bus reset */ > >> + devices = &info->devices[0]; > >> + > >> + /* Verify that we have all the groups required */ > >> + for (i = 0; i < info->count; i++) { > >> + PCIHostDeviceAddress host; > >> + VFIOPCIDevice *tmp; > >> + VFIODevice *vbasedev_iter; > >> + bool found = false; > >> + > >> + host.domain = devices[i].segment; > >> + host.bus = devices[i].bus; > >> + host.slot = PCI_SLOT(devices[i].devfn); > >> + host.function = PCI_FUNC(devices[i].devfn); > >> + > >> + /* Skip the current device */ > >> + if (vfio_pci_host_match(&host, vdev->vbasedev.name, 0)) { > >> + continue; > >> + } > >> + > >> + /* Ensure we own the group of the affected device */ > >> + QLIST_FOREACH(group, &vfio_group_list, next) { > >> + if (group->groupid == devices[i].group_id) { > >> + break; > >> + } > >> + } > >> + > >> + if (!group) { > >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " > >> + "depends on group %d which is not owned.", > >> + vdev->vbasedev.name, devices[i].group_id); > >> + goto out; > >> + } > >> + > >> + /* Ensure affected devices for reset on the same bus */ > >> + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { > >> + if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { > >> + continue; > >> + } > >> + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); > >> + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { > >> + /* > >> + * AER errors may be broadcast to all functions of a multi- > >> + * function endpoint. If any of those sibling functions are > >> + * also assigned, they need to have AER enabled or else an > >> + * error may continue to cause a vm_stop condition. IOW, > >> + * AER setup of this function would be pointless. > >> + */ > >> + if (vfio_pci_host_match(&host, vdev->vbasedev.name, > >> + HOST_CMP_FUNC_MASK) && > >> + !(tmp->features & VFIO_FEATURE_ENABLE_AER)) { > >> + error_setg(errp, "vfio: Cannot enable AER for device %s, on same slot" > >> + " the dependent device %s which does not enable AER.", > >> + vdev->vbasedev.name, tmp->vbasedev.name); > >> + goto out; > >> + } > >> + > >> + if (tmp->pdev.bus != bus) { > >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " > >> + "the dependent device %s is not on the same bus", > >> + vdev->vbasedev.name, tmp->vbasedev.name); > >> + goto out; > >> + } > >> + found = true; > >> + break; > >> + } > >> + } > >> + > >> + /* Ensure all affected devices assigned to VM */ > >> + if (!found) { > >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " > >> + "the dependent device %04x:%02x:%02x.%x " > >> + "is not assigned to VM.", > >> + vdev->vbasedev.name, host.domain, host.bus, > >> + host.slot, host.function); > >> + goto out; > >> + } > >> + } > >> + > >> + /* > >> + * To make things simply, functions must are combined > >> + * in the same way as on the host. no other irrelative > >> + * functions on the virtual same bus was required. > >> + */ > > Suggestion: > > > > The above code verified that all devices affected by a bus reset > > exist on the same bus in the VM. To further simplify, we also > > require that there are no additional devices beyond those existing on > > the VM bus. > > > >> + for (devfn = 0; devfn < 8; devfn++) { > > What about ARI? Don't we potentially need to walk up to 255 here if > > the parent bridge device has ARI enabled? Should we always walk up > > through 255 to prevent users from using non-zero slot numbers for other > > devices? > can we use the function pcie_cap_is_arifwd_enabled to check the bridge > whether enable ARI?, if don't enable, we can simple scan the slot 0, > otherwise, > we should walk up to 255. That seems reasonable. > >> + VFIOPCIDevice *tmp; > >> + PCIDevice *dev; > >> + bool found = false; > >> + > >> + dev = pci_find_device(bus, pci_bus_num(bus), > >> + PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn)); > >> + > >> + if (!dev) { > >> + continue; > >> + } > >> + > >> + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { > >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " > >> + "only support all dependent devices on the same bus, " > >> + "%s is not one of the dependent devices.", > >> + vdev->vbasedev.name, dev->name); > > "vfio: Cannot enable AER for device %s, device %s cannot be configured > > on the same virtual bus" > > > > Is dev->name sufficiently descriptive? Perhaps print the conflicting > > PCI address as well. > > > >> + goto out; > >> + } > >> + > >> + tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev); > >> + for (i = 0; i < info->count; i++) { > >> + PCIHostDeviceAddress host; > >> + > >> + host.domain = devices[i].segment; > >> + host.bus = devices[i].bus; > >> + host.slot = PCI_SLOT(devices[i].devfn); > >> + host.function = PCI_FUNC(devices[i].devfn); > >> + > >> + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { > >> + found = true; > >> + break; > >> + } > >> + } > >> + > >> + if (!found) { > >> + error_setg(errp, "vfio: Cannot enable AER for device %s, " > >> + "only support all dependent devices on the same bus, " > >> + "%s is not one of the dependent devices.", > >> + vdev->vbasedev.name, tmp->vbasedev.name); > > Update this comment similar to above too. > > > >> + goto out; > >> + } > >> + } > >> + > >> +out: > >> + g_free(info); > >> + return; > >> +} > >> + > >> +static void vfio_aer_check_host_bus_reset(Error **errp) > >> +{ > >> + VFIOGroup *group; > >> + VFIODevice *vbasedev; > >> + VFIOPCIDevice *vdev; > >> + Error *local_err = NULL; > >> + > >> + /* Check All vfio-pci devices if have bus reset capability */ > >> + QLIST_FOREACH(group, &vfio_group_list, next) { > >> + QLIST_FOREACH(vbasedev, &group->device_list, next) { > >> + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { > >> + continue; > >> + } > >> + vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > >> + if (vdev->features & VFIO_FEATURE_ENABLE_AER) { > >> + if (!pci_get_function_0(&vdev->pdev)) { > >> + continue; > >> + } > > Is this function_0 test really necessary? It seems like it's only > > relevant to the hotplug case, in which case it should be included in > > the next patch, but even there I don't see a path where we'll get here > > when function 0 is not populated. Is this perhaps leftover from > > previous versions? It actually seems like it's opening a gap where > > function 0 could be a non-vfio-pci device (or non-existent) and bypass > > our validation check here. > yes, you are right, as for adding the function_0 test, I just thought that > if we boot up VM without function_0 in command line , whether we should > test the other not_function_0 vfio device with aer on the bus support > hot reset. > if we don't add the function 0 test, we probably can't boot up the VM. > because the affected function 0 is not added yet. I never thought that > this open a gap that the function 0 could be a non-vfio-pci device, thanks > for pointing it out. for hotplug case, we also potential hotplug a > not-vfio-pci device > and bypass our check. so I think the check still need to place in > device_realize like > previous version? Yes, I was excited that you had removed all the pci-core dependencies, but it seems like that's one that we cannot get around. We cannot validate the config as each function is added due to dependencies between functions and we have no guarantee that the user will make function zero be a vfio-pci device. It seems like we really do need that slot closure notification from the core. Thanks, Alex > >> + vfio_check_hot_bus_reset(vdev, &local_err); > >> + if (local_err) { > >> + error_propagate(errp, local_err); > >> + return; > >> + } > >> + } > >> + } > >> + } > >> + > >> + return; > >> +} > >> + > >> static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver, > >> int pos, uint16_t size) > >> { > >> @@ -2060,7 +2243,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) > >> vfio_intx_enable(vdev); > >> } > >> > >> -#define HOST_CMP_FUNC_MASK (1 << 0) > >> static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, > >> uint8_t mask) > >> { > >> @@ -2587,6 +2769,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) > >> vdev->req_enabled = false; > >> } > >> > >> +static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused) > >> +{ > >> + Error *local_err = NULL; > >> + > >> + vfio_aer_check_host_bus_reset(&local_err); > >> + if (local_err) { > >> + fprintf(stderr, "%s\n", error_get_pretty(local_err)); > >> + error_free(local_err); > >> + exit(1); > >> + } > >> +} > >> + > >> +static Notifier machine_notifier = { > >> + .notify = vfio_pci_machine_done_notify, > >> +}; > >> + > >> static int vfio_initfn(PCIDevice *pdev) > >> { > >> VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); > >> @@ -2932,6 +3130,11 @@ static const TypeInfo vfio_pci_dev_info = { > >> static void register_vfio_pci_dev_type(void) > >> { > >> type_register_static(&vfio_pci_dev_info); > >> + /* > >> + * Register notifier when machine init is done, since we need > >> + * check the configration manner after all vfio device are inited. > >> + */ > > Suggestion: > > > > The AER configuration may depend on multiple devices, so we cannot > > validate consistency after each device is initialized. We can only > > depend on function initialization order (function 0 last) for hotplug > > devices, therefore a machine-init-done notifier is used to validate > > the configuration after all cold-plug devices are processed. > > > >> + qemu_add_machine_init_done_notifier(&machine_notifier); > >> } > >> > >> type_init(register_vfio_pci_dev_type) > >> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > >> index 7b3924e..db7c6d5 100644 > >> --- a/hw/vfio/pci.h > >> +++ b/hw/vfio/pci.h > >> @@ -15,6 +15,7 @@ > >> #include "qemu-common.h" > >> #include "exec/memory.h" > >> #include "hw/pci/pci.h" > >> +#include "hw/pci/pci_bus.h" > >> #include "hw/pci/pci_bridge.h" > >> #include "hw/vfio/vfio-common.h" > >> #include "qemu/event_notifier.h" > > > > > > . > > > > >
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 8842b7f..dce3b6d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -38,6 +38,10 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +#define HOST_CMP_FUNC_MASK (1 << 0) +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, + uint8_t mask); + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. We try to get the best of both worlds by @@ -1877,6 +1881,185 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) +{ + PCIBus *bus = vdev->pdev.bus; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + VFIOGroup *group; + int ret, i, devfn; + + ret = vfio_get_hot_reset_info(vdev, &info); + if (ret) { + error_setg(errp, "vfio: Cannot enable AER for device %s," + " device does not support hot reset.", + vdev->vbasedev.name); + return; + } + + /* List all affected devices by bus reset */ + devices = &info->devices[0]; + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + bool found = false; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + /* Skip the current device */ + if (vfio_pci_host_match(&host, vdev->vbasedev.name, 0)) { + continue; + } + + /* Ensure we own the group of the affected device */ + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + error_setg(errp, "vfio: Cannot enable AER for device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + goto out; + } + + /* Ensure affected devices for reset on the same bus */ + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { + /* + * AER errors may be broadcast to all functions of a multi- + * function endpoint. If any of those sibling functions are + * also assigned, they need to have AER enabled or else an + * error may continue to cause a vm_stop condition. IOW, + * AER setup of this function would be pointless. + */ + if (vfio_pci_host_match(&host, vdev->vbasedev.name, + HOST_CMP_FUNC_MASK) && + !(tmp->features & VFIO_FEATURE_ENABLE_AER)) { + error_setg(errp, "vfio: Cannot enable AER for device %s, on same slot" + " the dependent device %s which does not enable AER.", + vdev->vbasedev.name, tmp->vbasedev.name); + goto out; + } + + if (tmp->pdev.bus != bus) { + error_setg(errp, "vfio: Cannot enable AER for device %s, " + "the dependent device %s is not on the same bus", + vdev->vbasedev.name, tmp->vbasedev.name); + goto out; + } + found = true; + break; + } + } + + /* Ensure all affected devices assigned to VM */ + if (!found) { + error_setg(errp, "vfio: Cannot enable AER for device %s, " + "the dependent device %04x:%02x:%02x.%x " + "is not assigned to VM.", + vdev->vbasedev.name, host.domain, host.bus, + host.slot, host.function); + goto out; + } + } + + /* + * To make things simply, functions must are combined + * in the same way as on the host. no other irrelative + * functions on the virtual same bus was required. + */ + for (devfn = 0; devfn < 8; devfn++) { + VFIOPCIDevice *tmp; + PCIDevice *dev; + bool found = false; + + dev = pci_find_device(bus, pci_bus_num(bus), + PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn)); + + if (!dev) { + continue; + } + + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { + error_setg(errp, "vfio: Cannot enable AER for device %s, " + "only support all dependent devices on the same bus, " + "%s is not one of the dependent devices.", + vdev->vbasedev.name, dev->name); + goto out; + } + + tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev); + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) { + found = true; + break; + } + } + + if (!found) { + error_setg(errp, "vfio: Cannot enable AER for device %s, " + "only support all dependent devices on the same bus, " + "%s is not one of the dependent devices.", + vdev->vbasedev.name, tmp->vbasedev.name); + goto out; + } + } + +out: + g_free(info); + return; +} + +static void vfio_aer_check_host_bus_reset(Error **errp) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + VFIOPCIDevice *vdev; + Error *local_err = NULL; + + /* Check All vfio-pci devices if have bus reset capability */ + QLIST_FOREACH(group, &vfio_group_list, next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + if (vdev->features & VFIO_FEATURE_ENABLE_AER) { + if (!pci_get_function_0(&vdev->pdev)) { + continue; + } + vfio_check_hot_bus_reset(vdev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + } + } + + return; +} + static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver, int pos, uint16_t size) { @@ -2060,7 +2243,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_intx_enable(vdev); } -#define HOST_CMP_FUNC_MASK (1 << 0) static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name, uint8_t mask) { @@ -2587,6 +2769,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) vdev->req_enabled = false; } +static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused) +{ + Error *local_err = NULL; + + vfio_aer_check_host_bus_reset(&local_err); + if (local_err) { + fprintf(stderr, "%s\n", error_get_pretty(local_err)); + error_free(local_err); + exit(1); + } +} + +static Notifier machine_notifier = { + .notify = vfio_pci_machine_done_notify, +}; + static int vfio_initfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); @@ -2932,6 +3130,11 @@ static const TypeInfo vfio_pci_dev_info = { static void register_vfio_pci_dev_type(void) { type_register_static(&vfio_pci_dev_info); + /* + * Register notifier when machine init is done, since we need + * check the configration manner after all vfio device are inited. + */ + qemu_add_machine_init_done_notifier(&machine_notifier); } type_init(register_vfio_pci_dev_type) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 7b3924e..db7c6d5 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -15,6 +15,7 @@ #include "qemu-common.h" #include "exec/memory.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" #include "hw/pci/pci_bridge.h" #include "hw/vfio/vfio-common.h" #include "qemu/event_notifier.h"