diff mbox

[v3,3/3] vfio-pci: process non fatal error of AER

Message ID 1490260163-6157-4-git-send-email-caoj.fnst@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Cao jin March 23, 2017, 9:09 a.m. UTC
Make use of the non-fatal error eventfd that the kernel module provides
to process AER non-fatal errors. Fatal errors still take the
legacy path, which results in a VM stop.

Register the handler and wait for notification. On notification, construct
an AER message and pass it to the root port. The root port will trigger an
interrupt to signal the guest, and the guest driver will then do the recovery.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c              | 202 +++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/pci.h              |   2 +
 linux-headers/linux/vfio.h |   2 +
 3 files changed, 206 insertions(+)

Comments

Alex Williamson March 24, 2017, 10:12 p.m. UTC | #1
On Thu, 23 Mar 2017 17:09:23 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> Make use of the non fatal error eventfd that the kernel module provide
> to process the AER non fatal error. Fatal error still goes into the
> legacy way which results in VM stop.
> 
> Register the handler, wait for notification. Construct aer message and
> pass it to root port on notification. Root port will trigger an interrupt
> to signal guest, then guest driver will do the recovery.

Can we guarantee this is the better solution in all cases or could
there be guests without AER support where the VM stop is the better
solution?

> 
> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c              | 202 +++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/pci.h              |   2 +
>  linux-headers/linux/vfio.h |   2 +
>  3 files changed, 206 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 3d0d005..c6786d5 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2432,6 +2432,200 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>      vfio_put_base_device(&vdev->vbasedev);
>  }
>  
> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIEAERMsg msg = {
> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
> +        .source_id = pci_requester_id(dev),
> +    };
> +
> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
> +        return;
> +    }
> +
> +    /* Populate the aer msg and send it to root port */
> +    if (dev->exp.aer_cap) {

Why would we have registered this notifier otherwise?

> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +        uint32_t uncor_status;
> +        bool isfatal;
> +
> +        uncor_status = vfio_pci_read_config(dev,
> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> +        if (!uncor_status) {
> +            return;
> +        }
> +
> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +        if (isfatal) {
> +            goto stop;
> +        }

Huh?  How can we get a non-fatal error notice for a fatal error?  (And
why are we saving this to a variable rather than testing it within the
'if' condition?)

> +
> +        error_report("%s sending non fatal event to root port. uncor status = "
> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
> +        pcie_aer_msg(dev, &msg);
> +        return;
> +    }
> +
> +stop:
> +    /* Terminate the guest in case of fatal error */
> +    error_report("%s: Device detected a fatal error. VM stopped",
> +	    vdev->vbasedev.name);
> +    vm_stop(RUN_STATE_INTERNAL_ERROR);

Shouldn't we use the existing error index if we can't make use of
correctable errors?

> +}
> +
> +/*
> + * Register non fatal error notifier for devices supporting error recovery.
> + * If we encounter a failure in this function, we report an error
> + * and continue after disabling error recovery support for the device.
> + */
> +static void vfio_register_non_fatal_err_notifier(VFIOPCIDevice *vdev)
> +{
> +    int ret;
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +
> +    if (event_notifier_init(&vdev->non_fatal_err_notifier, 0)) {
> +        error_report("vfio: Unable to init event notifier for non-fatal error detection");
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&vdev->non_fatal_err_notifier);
> +    qemu_set_fd_handler(*pfd, vfio_non_fatal_err_notifier_handler, NULL, vdev);
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to set up non-fatal error notification: %m");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
> +        event_notifier_cleanup(&vdev->non_fatal_err_notifier);
> +    }
> +    g_free(irq_set);
> +}
> +
> +static void vfio_unregister_non_fatal_err_notifier(VFIOPCIDevice *vdev)
> +{
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret;
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +    *pfd = -1;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to de-assign error fd: %m");
> +    }
> +    g_free(irq_set);
> +    qemu_set_fd_handler(event_notifier_get_fd(&vdev->non_fatal_err_notifier),
> +                        NULL, NULL, vdev);
> +    event_notifier_cleanup(&vdev->non_fatal_err_notifier);
> +}
> +
> +static void vfio_passive_reset_notifier_handler(void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +
> +    if (!event_notifier_test_and_clear(&vdev->passive_reset_notifier)) {
> +        return;
> +    }
> +
> +    error_report("%s: Device lost state due to host device reset. VM stopped",
> +	    vdev->vbasedev.name);
> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
> +}
> +
> +/*
> + * Register passive reset notifier, in case of certain function of a
> + * multifunction device is passthroughed,  while other functions are still
> + * controlled by device driver.
> + */
> +static void vfio_register_passive_reset_notifier(VFIOPCIDevice *vdev)
> +{
> +    int ret;
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +
> +    if (event_notifier_init(&vdev->passive_reset_notifier, 0)) {
> +        error_report("vfio: Unable to init event notifier for passive reset");
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&vdev->passive_reset_notifier);
> +    qemu_set_fd_handler(*pfd, vfio_passive_reset_notifier_handler, NULL, vdev);
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to set up passive reset notification: %m");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
> +        event_notifier_cleanup(&vdev->passive_reset_notifier);
> +    }
> +    g_free(irq_set);
> +}
> +
> +static void vfio_unregister_passive_reset_notifier(VFIOPCIDevice *vdev)
> +{
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret;
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +    *pfd = -1;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to de-assign error fd: %m");
> +    }
> +    g_free(irq_set);
> +    qemu_set_fd_handler(event_notifier_get_fd(&vdev->passive_reset_notifier),
> +                        NULL, NULL, vdev);
> +    event_notifier_cleanup(&vdev->passive_reset_notifier);
> +}
> +
>  static void vfio_err_notifier_handler(void *opaque)
>  {
>      VFIOPCIDevice *vdev = opaque;
> @@ -2860,6 +3054,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>          }
>      }
>  
> +    vfio_register_passive_reset_notifier(vdev);
> +    vfio_register_non_fatal_err_notifier(vdev);

I think it's wrong that we configure these unconditionally.  Why do we
care about these unless we're configuring the guest to receive AER
events?

>      vfio_register_err_notifier(vdev);
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
> @@ -2900,6 +3096,12 @@ static void vfio_exitfn(PCIDevice *pdev)
>  
>      vfio_unregister_req_notifier(vdev);
>      vfio_unregister_err_notifier(vdev);
> +    if (event_notifier_get_fd(&vdev->non_fatal_err_notifier)) {
> +        vfio_unregister_non_fatal_err_notifier(vdev);
> +    }
> +    if (event_notifier_get_fd(&vdev->passive_reset_notifier)) {
> +        vfio_unregister_passive_reset_notifier(vdev);
> +    }

Do these tests in the cleanup function.

>      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
>      vfio_disable_interrupts(vdev);
>      if (vdev->intx.mmap_timer) {
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 34e8b04..b35c617 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -119,6 +119,8 @@ typedef struct VFIOPCIDevice {
>      void *igd_opregion;
>      PCIHostDeviceAddress host;
>      EventNotifier err_notifier;
> +    EventNotifier non_fatal_err_notifier;
> +    EventNotifier passive_reset_notifier;
>      EventNotifier req_notifier;
>      int (*resetfn)(struct VFIOPCIDevice *);
>      uint32_t vendor_id;
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 759b850..726ddbe 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -433,6 +433,8 @@ enum {
>  	VFIO_PCI_MSIX_IRQ_INDEX,
>  	VFIO_PCI_ERR_IRQ_INDEX,
>  	VFIO_PCI_REQ_IRQ_INDEX,
> +	VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX,
> +	VFIO_PCI_PASSIVE_RESET_IRQ_INDEX,
>  	VFIO_PCI_NUM_IRQS
>  };
>
Cao jin March 28, 2017, 1:49 p.m. UTC | #2
On 03/25/2017 06:12 AM, Alex Williamson wrote:
> On Thu, 23 Mar 2017 17:09:23 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> Make use of the non fatal error eventfd that the kernel module provide
>> to process the AER non fatal error. Fatal error still goes into the
>> legacy way which results in VM stop.
>>
>> Register the handler, wait for notification. Construct aer message and
>> pass it to root port on notification. Root port will trigger an interrupt
>> to signal guest, then guest driver will do the recovery.
> 
> Can we guarantee this is the better solution in all cases or could
> there be guests without AER support where the VM stop is the better
> solution?
> 

Currently, we only have VM stop on errors, which looks the same as a
sudden power-down to me.  With this solution, we have about a 50%
chance (the non-fatal cases) of reducing the sudden power-down risk.

What if a guest doesn't support AER?  It looks the same as a host
without AER support. For now I can only speculate about the worst case:
a guest crash. Would that be quite different from a sudden power-down?

>>
>> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
>> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
>> ---
>>  hw/vfio/pci.c              | 202 +++++++++++++++++++++++++++++++++++++++++++++
>>  hw/vfio/pci.h              |   2 +
>>  linux-headers/linux/vfio.h |   2 +
>>  3 files changed, 206 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 3d0d005..c6786d5 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2432,6 +2432,200 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>      vfio_put_base_device(&vdev->vbasedev);
>>  }
>>  
>> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
>> +{
>> +    VFIOPCIDevice *vdev = opaque;
>> +    PCIDevice *dev = &vdev->pdev;
>> +    PCIEAERMsg msg = {
>> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
>> +        .source_id = pci_requester_id(dev),
>> +    };
>> +
>> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
>> +        return;
>> +    }
>> +
>> +    /* Populate the aer msg and send it to root port */
>> +    if (dev->exp.aer_cap) {
> 
> Why would we have registered this notifier otherwise?
> 
>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>> +        uint32_t uncor_status;
>> +        bool isfatal;
>> +
>> +        uncor_status = vfio_pci_read_config(dev,
>> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>> +        if (!uncor_status) {
>> +            return;
>> +        }
>> +
>> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
>> +        if (isfatal) {
>> +            goto stop;
>> +        }
> 
> Huh?  How can we get a non-fatal error notice for a fatal error?  (and
> why are we saving this to a variable rather than testing it within the
> 'if' condition?
>

Both of these are for corner cases I am unsure about.
Is it possible that reading the register shows a fatal error?
Saving it into a variable is just personal taste: it looks neater.

>> +
>> +        error_report("%s sending non fatal event to root port. uncor status = "
>> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
>> +        pcie_aer_msg(dev, &msg);
>> +        return;
>> +    }
>> +
>> +stop:
>> +    /* Terminate the guest in case of fatal error */
>> +    error_report("%s: Device detected a fatal error. VM stopped",
>> +	    vdev->vbasedev.name);
>> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
> 
> Shouldn't we use the existing error index if we can't make use of
> correctable errors?
> 

Why? If reading the register shows it is actually a fatal error, is that
the same as the fatal error handler being notified?  What would we use the
existing error index for?


>> @@ -2860,6 +3054,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>>          }
>>      }
>>  
>> +    vfio_register_passive_reset_notifier(vdev);
>> +    vfio_register_non_fatal_err_notifier(vdev);
> 
> I think it's wrong that we configure these unconditionally.  Why do we
> care about these unless we're configuring the guest to receive AER
> events?
> 

But do we have ways to know whether the guest has AER support? For now,
I don't think so.

If the guest doesn't have AER support, then the worst case, a guest crash,
is no worse than a sudden power-down.
Alex Williamson March 28, 2017, 4:12 p.m. UTC | #3
On Tue, 28 Mar 2017 21:49:17 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> On 03/25/2017 06:12 AM, Alex Williamson wrote:
> > On Thu, 23 Mar 2017 17:09:23 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> >> Make use of the non fatal error eventfd that the kernel module provide
> >> to process the AER non fatal error. Fatal error still goes into the
> >> legacy way which results in VM stop.
> >>
> >> Register the handler, wait for notification. Construct aer message and
> >> pass it to root port on notification. Root port will trigger an interrupt
> >> to signal guest, then guest driver will do the recovery.  
> > 
> > Can we guarantee this is the better solution in all cases or could
> > there be guests without AER support where the VM stop is the better
> > solution?
> >   
> 
> Currently, we only have VM stop on errors, that looks the same as a
> sudden power down to me.  With this solution, we have about
> 50%(non-fatal) chance to reduce the sudden power-down risk.

If half of all faults are expected to be non-fatal, then you must have
some real examples of devices triggering non-fatal errors which can be
corrected in the guest driver that you can share to justify why it's a
good thing to enable this behavior.

> What if a guest doesn't support AER?  It looks the same as a host
> without AER support. Now I only can speculate the worst condition: guest
> crash, would that be quite different from a sudden power-down?

Yes, it's very different.  In one case we contain the fault by stopping
the guest, in the other case we allow the guest to continue operating
with a known fault in the device which may allow the fault to propagate
and perhaps go unnoticed.  We have established with the current
behavior that QEMU will prevent further propagation of a fault by
halting the VM.  To change QEMU's behavior here risks that a VM relying
on that behavior no longer has that protection.  So it seems we either
need to detect whether the VM is handling AER or we need to require the
VM administrator to opt-in to this new feature.  Real hardware has
these same issues and I believe there are handshakes that can be done
through ACPI to allow the guest to take over error handling from the
system.

> >> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
> >> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> >> ---
> >>  hw/vfio/pci.c              | 202 +++++++++++++++++++++++++++++++++++++++++++++
> >>  hw/vfio/pci.h              |   2 +
> >>  linux-headers/linux/vfio.h |   2 +
> >>  3 files changed, 206 insertions(+)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index 3d0d005..c6786d5 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -2432,6 +2432,200 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
> >>      vfio_put_base_device(&vdev->vbasedev);
> >>  }
> >>  
> >> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
> >> +{
> >> +    VFIOPCIDevice *vdev = opaque;
> >> +    PCIDevice *dev = &vdev->pdev;
> >> +    PCIEAERMsg msg = {
> >> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
> >> +        .source_id = pci_requester_id(dev),
> >> +    };
> >> +
> >> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
> >> +        return;
> >> +    }
> >> +
> >> +    /* Populate the aer msg and send it to root port */
> >> +    if (dev->exp.aer_cap) {  
> > 
> > Why would we have registered this notifier otherwise?
> >   
> >> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> >> +        uint32_t uncor_status;
> >> +        bool isfatal;
> >> +
> >> +        uncor_status = vfio_pci_read_config(dev,
> >> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> >> +        if (!uncor_status) {
> >> +            return;
> >> +        }
> >> +
> >> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> >> +        if (isfatal) {
> >> +            goto stop;
> >> +        }  
> > 
> > Huh?  How can we get a non-fatal error notice for a fatal error?  (and
> > why are we saving this to a variable rather than testing it within the
> > 'if' condition?
> >  
> 
> Both of these are for the unsure corner cases.
> Is it possible that register reading shows a fatal error?
> Saving it into a variable just is personal taste: more neat.

Why are there unsure corner cases?  Shouldn't the kernel have done this
check if there was any doubt whether the error was fatal or not?
Signaling the user with a non-fatal trigger for a fatal error certainly
doesn't make me have much confidence in this code.

> >> +
> >> +        error_report("%s sending non fatal event to root port. uncor status = "
> >> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
> >> +        pcie_aer_msg(dev, &msg);
> >> +        return;
> >> +    }
> >> +
> >> +stop:
> >> +    /* Terminate the guest in case of fatal error */
> >> +    error_report("%s: Device detected a fatal error. VM stopped",
> >> +	    vdev->vbasedev.name);
> >> +    vm_stop(RUN_STATE_INTERNAL_ERROR);  
> > 
> > Shouldn't we use the existing error index if we can't make use of
> > correctable errors?
> >   
> 
> Why? If register reading shows it is actually a fatal error, is it the
> same as fatal error handler is notified?  what we use the existing error
> index for?

See below.

> >> @@ -2860,6 +3054,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> >>          }
> >>      }
> >>  
> >> +    vfio_register_passive_reset_notifier(vdev);
> >> +    vfio_register_non_fatal_err_notifier(vdev);  
> > 
> > I think it's wrong that we configure these unconditionally.  Why do we
> > care about these unless we're configuring the guest to receive AER
> > events?
> >   
> 
> But do we have ways to know whether the guest has AER support? For now,
> I don't think so.

By unconditionally here, I'm referring to not even testing whether the
device is in a VM topology where it can receive AER events.  If we
cannot signal the device, we don't need these additional triggers and
therefore we don't need the aer_cap test in the non-fatal error
handler, we can use the existing error index instead.  Enabling these
triggers at the point where the guest takes over error handling from
the system would be even better.
 
> If guest don't have AER support, for the worst condition: guest crash,
> it is not worse than a sudden power-down.

Worst case is that a non-fatal error introduces a data corruption which
was previously noted via a VM stop (even if asynchronous notification
allowed some propagation) and now potentially goes unnoticed.  That's a
very big difference.  Thanks,

Alex
Michael S. Tsirkin March 28, 2017, 11:59 p.m. UTC | #4
On Tue, Mar 28, 2017 at 10:12:25AM -0600, Alex Williamson wrote:
> On Tue, 28 Mar 2017 21:49:17 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
> > On 03/25/2017 06:12 AM, Alex Williamson wrote:
> > > On Thu, 23 Mar 2017 17:09:23 +0800
> > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > >   
> > >> Make use of the non fatal error eventfd that the kernel module provide
> > >> to process the AER non fatal error. Fatal error still goes into the
> > >> legacy way which results in VM stop.
> > >>
> > >> Register the handler, wait for notification. Construct aer message and
> > >> pass it to root port on notification. Root port will trigger an interrupt
> > >> to signal guest, then guest driver will do the recovery.  
> > > 
> > > Can we guarantee this is the better solution in all cases or could
> > > there be guests without AER support where the VM stop is the better
> > > solution?
> > >   
> > 
> > Currently, we only have VM stop on errors, that looks the same as a
> > sudden power down to me.  With this solution, we have about
> > 50%(non-fatal) chance to reduce the sudden power-down risk.
> 
> If half of all faults are expected to be non-fatal, then you must have
> some real examples of devices triggering non-fatal errors which can be
> corrected in the guest driver that you can share to justify why it's a
> good thing to enable this behavior.
> 
> > What if a guest doesn't support AER?  It looks the same as a host
> > without AER support. Now I only can speculate the worst condition: guest
> > crash, would that be quite different from a sudden power-down?
> 
> Yes, it's very different.  In one case we contain the fault by stopping
> the guest, in the other case we allow the guest to continue operating
> with a known fault in the device which may allow the fault to propagate
> and perhaps go unnoticed.  We have established with the current
> behavior that QEMU will prevent further propagation of a fault by
> halting the VM.  To change QEMU's behavior here risks that a VM relying
> on that behavior no longer has that protection.  So it seems we either
> need to detect whether the VM is handling AER or we need to require the
> VM administrator to opt-in to this new feature.

An opt-in flag sounds very reasonable. It can also specify whether
to log the errors. We have a similar flag for disk errors.

>  Real hardware has
> these same issues and I believe there are handshakes that can be done
> through ACPI to allow the guest to take over error handling from the
> system.

No, that's only for error reporting IIUC. Driver needs to be
aware of a chance for errors to trigger and be able to
handle them.

So yes, some guests might have benefitted from VM stop
on AER but
1. the stop happens asynchronously so if guest can't handle
   errors there's a chance it is already crashed by the time we
   try to do vm stop
2. it's more of a chance by-product - we never promised
   guests that VMs would be more robust than bare metal



> > >> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
> > >> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> > >> ---
> > >>  hw/vfio/pci.c              | 202 +++++++++++++++++++++++++++++++++++++++++++++
> > >>  hw/vfio/pci.h              |   2 +
> > >>  linux-headers/linux/vfio.h |   2 +
> > >>  3 files changed, 206 insertions(+)
> > >>
> > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > >> index 3d0d005..c6786d5 100644
> > >> --- a/hw/vfio/pci.c
> > >> +++ b/hw/vfio/pci.c
> > >> @@ -2432,6 +2432,200 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
> > >>      vfio_put_base_device(&vdev->vbasedev);
> > >>  }
> > >>  
> > >> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
> > >> +{
> > >> +    VFIOPCIDevice *vdev = opaque;
> > >> +    PCIDevice *dev = &vdev->pdev;
> > >> +    PCIEAERMsg msg = {
> > >> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
> > >> +        .source_id = pci_requester_id(dev),
> > >> +    };
> > >> +
> > >> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
> > >> +        return;
> > >> +    }
> > >> +
> > >> +    /* Populate the aer msg and send it to root port */
> > >> +    if (dev->exp.aer_cap) {  
> > > 
> > > Why would we have registered this notifier otherwise?
> > >   
> > >> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> > >> +        uint32_t uncor_status;
> > >> +        bool isfatal;
> > >> +
> > >> +        uncor_status = vfio_pci_read_config(dev,
> > >> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> > >> +        if (!uncor_status) {
> > >> +            return;
> > >> +        }
> > >> +
> > >> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> > >> +        if (isfatal) {
> > >> +            goto stop;
> > >> +        }  
> > > 
> > > Huh?  How can we get a non-fatal error notice for a fatal error?  (and
> > > why are we saving this to a variable rather than testing it within the
> > > 'if' condition?
> > >  
> > 
> > Both of these are for the unsure corner cases.
> > Is it possible that register reading shows a fatal error?
> > Saving it into a variable just is personal taste: more neat.
> 
> Why are there unsure corner cases?  Shouldn't the kernel have done this
> check if there was any doubt whether the error was fatal or not?
> Signaling the user with a non-fatal trigger for a fatal error certainly
> doesn't make me have much confidence in this code.
> 
> > >> +
> > >> +        error_report("%s sending non fatal event to root port. uncor status = "
> > >> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
> > >> +        pcie_aer_msg(dev, &msg);
> > >> +        return;
> > >> +    }
> > >> +
> > >> +stop:
> > >> +    /* Terminate the guest in case of fatal error */
> > >> +    error_report("%s: Device detected a fatal error. VM stopped",
> > >> +	    vdev->vbasedev.name);
> > >> +    vm_stop(RUN_STATE_INTERNAL_ERROR);  
> > > 
> > > Shouldn't we use the existing error index if we can't make use of
> > > correctable errors?
> > >   
> > 
> > Why? If register reading shows it is actually a fatal error, is it the
> > same as fatal error handler is notified?  what we use the existing error
> > index for?
> 
> See below.
> 
> > >> @@ -2860,6 +3054,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> > >>          }
> > >>      }
> > >>  
> > >> +    vfio_register_passive_reset_notifier(vdev);
> > >> +    vfio_register_non_fatal_err_notifier(vdev);  
> > > 
> > > I think it's wrong that we configure these unconditionally.  Why do we
> > > care about these unless we're configuring the guest to receive AER
> > > events?
> > >   
> > 
> > But do we have ways to know whether the guest has AER support? For now,
> > I don't think so.
> 
> By unconditionally here, I'm referring to not even testing whether the
> device is in a VM topology where it can receive AER events.  If we
> cannot signal the device, we don't need these additional triggers and
> therefore we don't need the aer_cap test in the non-fatal error
> handler, we can use the existing error index instead.  Enabling these
> triggers at the point where the guest takes over error handling from
> the system would be even better.
>  
> > If guest don't have AER support, for the worst condition: guest crash,
> > it is not worse than a sudden power-down.
> 
> Worst case is that a non-fatal error introduces a data corruption which
> was previously noted via a VM stop (even if asynchronous notification
> allowed some propagation) and now potentially goes unnoticed.  That's a
> very big difference.  Thanks,
> 
> Alex
Alex Williamson March 29, 2017, 2:55 a.m. UTC | #5
On Wed, 29 Mar 2017 02:59:34 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Mar 28, 2017 at 10:12:25AM -0600, Alex Williamson wrote:
> > On Tue, 28 Mar 2017 21:49:17 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> > > On 03/25/2017 06:12 AM, Alex Williamson wrote:  
> > > > On Thu, 23 Mar 2017 17:09:23 +0800
> > > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > > >     
> > > >> Make use of the non fatal error eventfd that the kernel module provide
> > > >> to process the AER non fatal error. Fatal error still goes into the
> > > >> legacy way which results in VM stop.
> > > >>
> > > >> Register the handler, wait for notification. Construct aer message and
> > > >> pass it to root port on notification. Root port will trigger an interrupt
> > > >> to signal guest, then guest driver will do the recovery.    
> > > > 
> > > > Can we guarantee this is the better solution in all cases or could
> > > > there be guests without AER support where the VM stop is the better
> > > > solution?
> > > >     
> > > 
> > > Currently, we only have VM stop on errors, that looks the same as a
> > > sudden power down to me.  With this solution, we have about
> > > 50%(non-fatal) chance to reduce the sudden power-down risk.  
> > 
> > If half of all faults are expected to be non-fatal, then you must have
> > some real examples of devices triggering non-fatal errors which can be
> > corrected in the guest driver that you can share to justify why it's a
> > good thing to enable this behavior.
> >   
> > > What if a guest doesn't support AER?  It looks the same as a host
> > > without AER support. Now I only can speculate the worst condition: guest
> > > crash, would that be quite different from a sudden power-down?  
> > 
> > Yes, it's very different.  In one case we contain the fault by stopping
> > the guest, in the other case we allow the guest to continue operating
> > with a known fault in the device which may allow the fault to propagate
> > and perhaps go unnoticed.  We have established with the current
> > behavior that QEMU will prevent further propagation of a fault by
> > halting the VM.  To change QEMU's behavior here risks that a VM relying
> > on that behavior no longer has that protection.  So it seems we either
> > need to detect whether the VM is handling AER or we need to require the
> > VM administrator to opt-in to this new feature.  
> 
> An opt-in flag sounds very reasonable. It can also specify whether
> to log the errors. We have a similar flag for disk errors.

An opt-in works, but is rather burdensome to the user.
 
> >  Real hardware has
> > these same issues and I believe there are handshakes that can be done
> > through ACPI to allow the guest to take over error handling from the
> > system.  
> 
> No, that's only for error reporting IIUC. Driver needs to be
> aware of a chance for errors to trigger and be able to
> handle them.

See drivers/acpi/pci_root.c:negotiate_os_control(), it seems that the
OSPM uses an _OSC to tell ACPI via OSC_PCI_EXPRESS_AER_CONTROL.  Would
that not be a reasonable mechanism for the guest to indicate AER
support?

> So yes, some guests might have benefitted from VM stop
> on AER but
> 1. the stop happens asynchronously so if guest can't handle
>    errors there's a chance it is already crashed by the time we
>    try to do vm stop

I fully concede that it's asynchronous, bad data can propagate and a
guest crash is one potential outcome.  That's fine, a guest crash
indicates a problem.  A VM stop also indicates a problem.  Potential
lack of a crash or VM stop is the worrisome case.

> 2. it's more of a chance by-product - we never promised
>    guests that VMs would be more robust than bare metal

Does that make it not a regression if we change the behavior?  I
wouldn't exactly call it a chance by-product, perhaps it wasn't the
primary motivation, but it was considered.  Thanks,

Alex

> > > >> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
> > > >> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> > > >> ---
> > > >>  hw/vfio/pci.c              | 202 +++++++++++++++++++++++++++++++++++++++++++++
> > > >>  hw/vfio/pci.h              |   2 +
> > > >>  linux-headers/linux/vfio.h |   2 +
> > > >>  3 files changed, 206 insertions(+)
> > > >>
> > > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > >> index 3d0d005..c6786d5 100644
> > > >> --- a/hw/vfio/pci.c
> > > >> +++ b/hw/vfio/pci.c
> > > >> @@ -2432,6 +2432,200 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
> > > >>      vfio_put_base_device(&vdev->vbasedev);
> > > >>  }
> > > >>  
> > > >> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
> > > >> +{
> > > >> +    VFIOPCIDevice *vdev = opaque;
> > > >> +    PCIDevice *dev = &vdev->pdev;
> > > >> +    PCIEAERMsg msg = {
> > > >> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
> > > >> +        .source_id = pci_requester_id(dev),
> > > >> +    };
> > > >> +
> > > >> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
> > > >> +        return;
> > > >> +    }
> > > >> +
> > > >> +    /* Populate the aer msg and send it to root port */
> > > >> +    if (dev->exp.aer_cap) {    
> > > > 
> > > > Why would we have registered this notifier otherwise?
> > > >     
> > > >> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> > > >> +        uint32_t uncor_status;
> > > >> +        bool isfatal;
> > > >> +
> > > >> +        uncor_status = vfio_pci_read_config(dev,
> > > >> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> > > >> +        if (!uncor_status) {
> > > >> +            return;
> > > >> +        }
> > > >> +
> > > >> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> > > >> +        if (isfatal) {
> > > >> +            goto stop;
> > > >> +        }    
> > > > 
> > > > Huh?  How can we get a non-fatal error notice for a fatal error?  (and
> > > > why are we saving this to a variable rather than testing it within the
> > > > 'if' condition?
> > > >    
> > > 
> > > Both of these are for the unsure corner cases.
> > > Is it possible that register reading shows a fatal error?
> > > Saving it into a variable just is personal taste: more neat.  
> > 
> > Why are there unsure corner cases?  Shouldn't the kernel have done this
> > check if there was any doubt whether the error was fatal or not?
> > Signaling the user with a non-fatal trigger for a fatal error certainly
> > doesn't make me have much confidence in this code.
> >   
> > > >> +
> > > >> +        error_report("%s sending non fatal event to root port. uncor status = "
> > > >> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
> > > >> +        pcie_aer_msg(dev, &msg);
> > > >> +        return;
> > > >> +    }
> > > >> +
> > > >> +stop:
> > > >> +    /* Terminate the guest in case of fatal error */
> > > >> +    error_report("%s: Device detected a fatal error. VM stopped",
> > > >> +	    vdev->vbasedev.name);
> > > >> +    vm_stop(RUN_STATE_INTERNAL_ERROR);    
> > > > 
> > > > Shouldn't we use the existing error index if we can't make use of
> > > > correctable errors?
> > > >     
> > > 
> > > Why? If register reading shows it is actually a fatal error, is it the
> > > same as fatal error handler is notified?  what we use the existing error
> > > index for?  
> > 
> > See below.
> >   
> > > >> @@ -2860,6 +3054,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> > > >>          }
> > > >>      }
> > > >>  
> > > >> +    vfio_register_passive_reset_notifier(vdev);
> > > >> +    vfio_register_non_fatal_err_notifier(vdev);    
> > > > 
> > > > I think it's wrong that we configure these unconditionally.  Why do we
> > > > care about these unless we're configuring the guest to receive AER
> > > > events?
> > > >     
> > > 
> > > But do we have ways to know whether the guest has AER support? For now,
> > > I don't think so.  
> > 
> > By unconditionally here, I'm referring to not even testing whether the
> > device is in a VM topology where it can receive AER events.  If we
> > cannot signal the device, we don't need these additional triggers and
> > therefore we don't need the aer_cap test in the non-fatal error
> > handler, we can use the existing error index instead.  Enabling these
> > triggers at the point where the guest takes over error handling from
> > the system would be even better.
> >    
> > > If guest don't have AER support, for the worst condition: guest crash,
> > > it is not worse than a sudden power-down.  
> > 
> > Worst case is that a non-fatal error introduces a data corruption which
> > was previously noted via a VM stop (even if asynchronous notification
> > allowed some propagation) and now potentially goes unnoticed.  That's a
> > very big difference.  Thanks,
> > 
> > Alex
Michael S. Tsirkin April 25, 2017, 8:32 p.m. UTC | #6
On Tue, Mar 28, 2017 at 08:55:04PM -0600, Alex Williamson wrote:
> On Wed, 29 Mar 2017 02:59:34 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Tue, Mar 28, 2017 at 10:12:25AM -0600, Alex Williamson wrote:
> > > On Tue, 28 Mar 2017 21:49:17 +0800
> > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > >   
> > > > On 03/25/2017 06:12 AM, Alex Williamson wrote:  
> > > > > On Thu, 23 Mar 2017 17:09:23 +0800
> > > > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > > > >     
> > > > >> Make use of the non fatal error eventfd that the kernel module provide
> > > > >> to process the AER non fatal error. Fatal error still goes into the
> > > > >> legacy way which results in VM stop.
> > > > >>
> > > > >> Register the handler, wait for notification. Construct aer message and
> > > > >> pass it to root port on notification. Root port will trigger an interrupt
> > > > >> to signal guest, then guest driver will do the recovery.    
> > > > > 
> > > > > Can we guarantee this is the better solution in all cases or could
> > > > > there be guests without AER support where the VM stop is the better
> > > > > solution?
> > > > >     
> > > > 
> > > > Currently, we only have VM stop on errors, that looks the same as a
> > > > sudden power down to me.  With this solution, we have about
> > > > 50%(non-fatal) chance to reduce the sudden power-down risk.  
> > > 
> > > If half of all faults are expected to be non-fatal, then you must have
> > > some real examples of devices triggering non-fatal errors which can be
> > > corrected in the guest driver that you can share to justify why it's a
> > > good thing to enable this behavior.
> > >   
> > > > What if a guest doesn't support AER?  It looks the same as a host
> > > > without AER support. Now I only can speculate the worst condition: guest
> > > > crash, would that be quite different from a sudden power-down?  
> > > 
> > > Yes, it's very different.  In one case we contain the fault by stopping
> > > the guest, in the other case we allow the guest to continue operating
> > > with a known fault in the device which may allow the fault to propagate
> > > and perhaps go unnoticed.  We have established with the current
> > > behavior that QEMU will prevent further propagation of a fault by
> > > halting the VM.  To change QEMU's behavior here risks that a VM relying
> > > on that behavior no longer has that protection.  So it seems we either
> > > need to detect whether the VM is handling AER or we need to require the
> > > VM administrator to opt-in to this new feature.  
> > 
> > An opt-in flag sounds very reasonable. It can also specify whether
> > to log the errors. We have a similar flag for disk errors.
> 
> An opt-in works, but is rather burdensome to the user.
>  
> > >  Real hardware has
> > > these same issues and I believe there are handshakes that can be done
> > > through ACPI to allow the guest to take over error handling from the
> > > system.  
> > 
> > No, that's only for error reporting IIUC. Driver needs to be
> > aware of a chance for errors to trigger and be able to
> > handle them.
> 
> See drivers/acpi/pci_root.c:negotiate_os_control(), it seems that the
> OSPM uses an _OSC to tell ACPI via OSC_PCI_EXPRESS_AER_CONTROL.  Would
> that not be a reasonable mechanism for the guest to indicate AER
> support?

I'm not sure - it seems to be designed for firmware that can drive
AER natively. E.g. if we ever set FIRMWARE_FIRST then linux
will not set this bit.

It's also global so doesn't really indicate a given driver
supports AER.

Still - would this remove some or all of your concern?

Could you outline a set of requirements that can be satisfied
to make you consider the feature for inclusion?

I tried by writing up
	vfio/pci: guest error recovery proposal
back in December and there didn't seem to be any objections, so I am
quite surprised to see patches implementing that proposal more or less
verbatim getting rejected.

Could you write up a proposal of your own? It shouldn't take
long but I don't believe progress can be made otherwise.

Please note that at least in Linux most driver developers
test using software error injection. Documentation/PCI/pcieaer-howto.txt
actually says:

	4. Software error injection

	Debugging PCIe AER error recovery code is quite difficult because it
	is hard to trigger real hardware errors. Software based error
	injection can be used to fake various kinds of PCIe errors.

It might be that, even though we'd prefer real testing,
we just have to be satisfied with software injection.


> > So yes, some guests might have benefitted from VM stop
> > on AER but
> > 1. the stop happens asynchronously so if guest can't handle
> >    errors there's a chance it is already crashed by the time we
> >    try to do vm stop
> 
> I fully concede that it's asynchronous, bad data can propagate and a
> guest crash is one potential outcome.  That's fine, a guest crash
> indicates a problem.  A VM stop also indicates a problem.  Potential
> lack of a crash or VM stop is the worrisome case.
> 
> > 2. it's more of a chance by-product - we never promised
> >    guests that VMs would be more robust than bare metal
> 
> Does that make it not a regression if we change the behavior?  I
> wouldn't exactly call it a chance by-product, perhaps it wasn't the
> primary motivation, but it was considered.  Thanks,
> 
> Alex
> > > > >> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
> > > > >> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> > > > >> ---
> > > > >>  hw/vfio/pci.c              | 202 +++++++++++++++++++++++++++++++++++++++++++++
> > > > >>  hw/vfio/pci.h              |   2 +
> > > > >>  linux-headers/linux/vfio.h |   2 +
> > > > >>  3 files changed, 206 insertions(+)
> > > > >>
> > > > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > > >> index 3d0d005..c6786d5 100644
> > > > >> --- a/hw/vfio/pci.c
> > > > >> +++ b/hw/vfio/pci.c
> > > > >> @@ -2432,6 +2432,200 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
> > > > >>      vfio_put_base_device(&vdev->vbasedev);
> > > > >>  }
> > > > >>  
> > > > >> +static void vfio_non_fatal_err_notifier_handler(void *opaque)
> > > > >> +{
> > > > >> +    VFIOPCIDevice *vdev = opaque;
> > > > >> +    PCIDevice *dev = &vdev->pdev;
> > > > >> +    PCIEAERMsg msg = {
> > > > >> +        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
> > > > >> +        .source_id = pci_requester_id(dev),
> > > > >> +    };
> > > > >> +
> > > > >> +    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
> > > > >> +        return;
> > > > >> +    }
> > > > >> +
> > > > >> +    /* Populate the aer msg and send it to root port */
> > > > >> +    if (dev->exp.aer_cap) {    
> > > > > 
> > > > > Why would we have registered this notifier otherwise?
> > > > >     
> > > > >> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> > > > >> +        uint32_t uncor_status;
> > > > >> +        bool isfatal;
> > > > >> +
> > > > >> +        uncor_status = vfio_pci_read_config(dev,
> > > > >> +                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> > > > >> +        if (!uncor_status) {
> > > > >> +            return;
> > > > >> +        }
> > > > >> +
> > > > >> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> > > > >> +        if (isfatal) {
> > > > >> +            goto stop;
> > > > >> +        }    
> > > > > 
> > > > > Huh?  How can we get a non-fatal error notice for a fatal error?  (and
> > > > > why are we saving this to a variable rather than testing it within the
> > > > > 'if' condition?
> > > > >    
> > > > 
> > > > Both of these are for the unsure corner cases.
> > > > Is it possible that register reading shows a fatal error?
> > > > Saving it into a variable just is personal taste: more neat.  
> > > 
> > > Why are there unsure corner cases?  Shouldn't the kernel have done this
> > > check if there was any doubt whether the error was fatal or not?
> > > Signaling the user with a non-fatal trigger for a fatal error certainly
> > > doesn't make me have much confidence in this code.
> > >   
> > > > >> +
> > > > >> +        error_report("%s sending non fatal event to root port. uncor status = "
> > > > >> +                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
> > > > >> +        pcie_aer_msg(dev, &msg);
> > > > >> +        return;
> > > > >> +    }
> > > > >> +
> > > > >> +stop:
> > > > >> +    /* Terminate the guest in case of fatal error */
> > > > >> +    error_report("%s: Device detected a fatal error. VM stopped",
> > > > >> +	    vdev->vbasedev.name);
> > > > >> +    vm_stop(RUN_STATE_INTERNAL_ERROR);    
> > > > > 
> > > > > Shouldn't we use the existing error index if we can't make use of
> > > > > correctable errors?
> > > > >     
> > > > 
> > > > Why? If register reading shows it is actually a fatal error, is it the
> > > > same as fatal error handler is notified?  what we use the existing error
> > > > index for?  
> > > 
> > > See below.
> > >   
> > > > >> @@ -2860,6 +3054,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> > > > >>          }
> > > > >>      }
> > > > >>  
> > > > >> +    vfio_register_passive_reset_notifier(vdev);
> > > > >> +    vfio_register_non_fatal_err_notifier(vdev);    
> > > > > 
> > > > > I think it's wrong that we configure these unconditionally.  Why do we
> > > > > care about these unless we're configuring the guest to receive AER
> > > > > events?
> > > > >     
> > > > 
> > > > But do we have ways to know whether the guest has AER support? For now,
> > > > I don't think so.  
> > > 
> > > By unconditionally here, I'm referring to not even testing whether the
> > > device is in a VM topology where it can receive AER events.  If we
> > > cannot signal the device, we don't need these additional triggers and
> > > therefore we don't need the aer_cap test in the non-fatal error
> > > handler, we can use the existing error index instead.  Enabling these
> > > triggers at the point where the guest takes over error handling from
> > > the system would be even better.
> > >    
> > > > If guest don't have AER support, for the worst condition: guest crash,
> > > > it is not worse than a sudden power-down.  
> > > 
> > > Worst case is that a non-fatal error introduces a data corruption which
> > > was previously noted via a VM stop (even if asynchronous notification
> > > allowed some propagation) and now potentially goes unnoticed.  That's a
> > > very big difference.  Thanks,
> > > 
> > > Alex  
>
Alex Williamson April 26, 2017, 12:06 a.m. UTC | #7
On Tue, 25 Apr 2017 23:32:52 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Mar 28, 2017 at 08:55:04PM -0600, Alex Williamson wrote:
> > On Wed, 29 Mar 2017 02:59:34 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Tue, Mar 28, 2017 at 10:12:25AM -0600, Alex Williamson wrote:  
> > > > On Tue, 28 Mar 2017 21:49:17 +0800
> > > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > > >     
> > > > > On 03/25/2017 06:12 AM, Alex Williamson wrote:    
> > > > > > On Thu, 23 Mar 2017 17:09:23 +0800
> > > > > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > > > > >       
> > > > > >> Make use of the non fatal error eventfd that the kernel module provide
> > > > > >> to process the AER non fatal error. Fatal error still goes into the
> > > > > >> legacy way which results in VM stop.
> > > > > >>
> > > > > >> Register the handler, wait for notification. Construct aer message and
> > > > > >> pass it to root port on notification. Root port will trigger an interrupt
> > > > > >> to signal guest, then guest driver will do the recovery.      
> > > > > > 
> > > > > > Can we guarantee this is the better solution in all cases or could
> > > > > > there be guests without AER support where the VM stop is the better
> > > > > > solution?
> > > > > >       
> > > > > 
> > > > > Currently, we only have VM stop on errors, that looks the same as a
> > > > > sudden power down to me.  With this solution, we have about
> > > > > 50%(non-fatal) chance to reduce the sudden power-down risk.    
> > > > 
> > > > If half of all faults are expected to be non-fatal, then you must have
> > > > some real examples of devices triggering non-fatal errors which can be
> > > > corrected in the guest driver that you can share to justify why it's a
> > > > good thing to enable this behavior.
> > > >     
> > > > > What if a guest doesn't support AER?  It looks the same as a host
> > > > > without AER support. Now I only can speculate the worst condition: guest
> > > > > crash, would that be quite different from a sudden power-down?    
> > > > 
> > > > Yes, it's very different.  In one case we contain the fault by stopping
> > > > the guest, in the other case we allow the guest to continue operating
> > > > with a known fault in the device which may allow the fault to propagate
> > > > and perhaps go unnoticed.  We have established with the current
> > > > behavior that QEMU will prevent further propagation of a fault by
> > > > halting the VM.  To change QEMU's behavior here risks that a VM relying
> > > > on that behavior no longer has that protection.  So it seems we either
> > > > need to detect whether the VM is handling AER or we need to require the
> > > > VM administrator to opt-in to this new feature.    
> > > 
> > > An opt-in flag sounds very reasonable. It can also specify whether
> > > to log the errors. We have a similar flag for disk errors.  
> > 
> > An opt-in works, but is rather burdensome to the user.
> >    
> > > >  Real hardware has
> > > > these same issues and I believe there are handshakes that can be done
> > > > through ACPI to allow the guest to take over error handling from the
> > > > system.    
> > > 
> > > No, that's only for error reporting IIUC. Driver needs to be
> > > aware of a chance for errors to trigger and be able to
> > > handle them.  
> > 
> > See drivers/acpi/pci_root.c:negotiate_os_control(), it seems that the
> > OSPM uses an _OSC to tell ACPI via OSC_PCI_EXPRESS_AER_CONTROL.  Would
> > that not be a reasonable mechanism for the guest to indicate AER
> > support?  
> 
> I'm not sure - it seems to be designed for firmware that can drive
> AER natively. E.g. if we ever set FIRMWARE_FIRST then linux
> will not set this bit.
> 
> It's also global so doesn't really indicate a given driver
> supports AER.
> 
> Still - would this remove some or all of your concern?

Certainly not all, I have rather deep concerns about where we're going
here.

> Could you outline a set of requirements that can be satisfied
> to make you consider the feature for inclusion?

Like any enhancement, show me that it's useful, show me that we've done
due diligence in researching the problem and solution, show me that
we're not painting ourselves into a corner by only addressing a subset
of the problem, show me that it's been thoroughly tested and reviewed.
Currently, I have little confidence in any of this.  We seem to just be
tossing AER spitballs at the wall hoping one of them sticks.
 
> I tried by writing up
> 	vfio/pci: guest error recovery proposal
> back in December and there didn't seem to be any objections, so I am
> quite surprised to see patches implementing that proposal more or less
> verbatim getting rejected.

Are the concerns from the patch review not valid?  I think this is
indicative of the problems we've had throughout this 2+ year
development process, a suggestion is made and it's implemented without
a thorough analysis of the surrounding issues (or even testing), patches
are sent out, issues are found, further suggestions are made, and a new
revision comes out with the same lack of insight.  Rinse, repeat.  It's
effectively writing patches by proxy.
 
> Could you write up a proposal of your own? It shouldn't take
> long but I don't believe progress can be made otherwise.

Why do you make it sound like this should be an easy task?  Clearly the
problem is hard and a cursory proposal only feeds into the process I
mention above.  If someone wants to make a legitimate attempt at
improving this space, I feel like they really need to own the problem
themselves, invest in the research to figure out all the nuances.

> Please note that at least in Linux most driver developers
> test using software error injection. Documentation/PCI/pcieaer-howto.txt
> actually says:
> 
> 	4. Software error injection
> 
> 	Debugging PCIe AER error recovery code is quite difficult because it
> 	is hard to trigger real hardware errors. Software based error
> 	injection can be used to fake various kinds of PCIe errors.
> 
> It might be that, even though we'd prefer real testing,
> we just have to be satisfied with software injection.

Of course artificial AER injection is the way to go for testing.  I
don't expect someone to stick a wet finger into a system to try to
generate transient hardware failures, but I do expect more testing than
we've seen in the past.

Really, to make progress in this space I think we need to first
determine the actual problem we're trying to solve.  Are non-fatal
errors really a significant problem to tackle on their own or are we
just trying to appease the patch submitter by helping to get anything
AER related upstream?  I don't think that really helps anyone long
term.  So what if we return to full AER support?  We were stuck on bus
resets and how to coordinate host recovery resets vs guest resets. It's
clearly a hard problem and your approach was to avoid it by handling a
sub-class of errors for which bus resets are perhaps unnecessary.  What
if, instead of a sub-class of errors, we pick a sub-class of devices for
which there's no such thing as a bus reset?  Aren't SR-IOV VFs really
designed for this use case?  I think with RAS features we're really
targeting the enterprise use cases, where VFs are (or should be) the
predominant class of device.  Would this work?  I don't know, and I
really don't want this to be just another idea that gets caught in the
cyclic rut we're stuck in so far.  If someone wants to pursue this I'd
expect to see some research into how errors are propagated from PF to
VF and certainly testing on relevant hardware.  Thanks,

Alex
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3d0d005..c6786d5 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2432,6 +2432,200 @@  static void vfio_put_device(VFIOPCIDevice *vdev)
     vfio_put_base_device(&vdev->vbasedev);
 }
 
+static void vfio_non_fatal_err_notifier_handler(void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    PCIEAERMsg msg = {
+        .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN,
+        .source_id = pci_requester_id(dev),
+    };
+
+    if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) {
+        return;
+    }
+
+    /* Populate the aer msg and send it to root port */
+    if (dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                            dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+        if (!uncor_status) {
+            return;
+        }
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+        if (isfatal) {
+            goto stop;
+        }
+
+        error_report("%s sending non fatal event to root port. uncor status = "
+                     "0x%"PRIx32, vdev->vbasedev.name, uncor_status);
+        pcie_aer_msg(dev, &msg);
+        return;
+    }
+
+stop:
+    /* Terminate the guest in case of fatal error */
+    error_report("%s: Device detected a fatal error. VM stopped",
+	    vdev->vbasedev.name);
+    vm_stop(RUN_STATE_INTERNAL_ERROR);
+}
+
+/*
+ * Register non-fatal error notifier for devices supporting error recovery.
+ * If we encounter a failure in this function, we report an error
+ * and continue after disabling error recovery support for the device.
+ */
+static void vfio_register_non_fatal_err_notifier(VFIOPCIDevice *vdev)
+{
+    int ret;
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+
+    if (event_notifier_init(&vdev->non_fatal_err_notifier, 0)) {
+        error_report("vfio: Unable to init event notifier for non-fatal error detection");
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = event_notifier_get_fd(&vdev->non_fatal_err_notifier);
+    qemu_set_fd_handler(*pfd, vfio_non_fatal_err_notifier_handler, NULL, vdev);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to set up non-fatal error notification: %m");
+        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->non_fatal_err_notifier);
+    }
+    g_free(irq_set);
+}
+
+static void vfio_unregister_non_fatal_err_notifier(VFIOPCIDevice *vdev)
+{
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret;
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = -1;
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to de-assign error fd: %m");
+    }
+    g_free(irq_set);
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->non_fatal_err_notifier),
+                        NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->non_fatal_err_notifier);
+}
+
+static void vfio_passive_reset_notifier_handler(void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->passive_reset_notifier)) {
+        return;
+    }
+
+    error_report("%s: Device lost state due to host device reset. VM stopped",
+	    vdev->vbasedev.name);
+    vm_stop(RUN_STATE_INTERNAL_ERROR);
+}
+
+/*
+ * Register the passive reset notifier, for the case where one function of a
+ * multifunction device is passed through while the other functions are still
+ * controlled by a device driver.
+ */
+static void vfio_register_passive_reset_notifier(VFIOPCIDevice *vdev)
+{
+    int ret;
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+
+    if (event_notifier_init(&vdev->passive_reset_notifier, 0)) {
+        error_report("vfio: Unable to init event notifier for passive reset");
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = event_notifier_get_fd(&vdev->passive_reset_notifier);
+    qemu_set_fd_handler(*pfd, vfio_passive_reset_notifier_handler, NULL, vdev);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to set up passive reset notification: %m");
+        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->passive_reset_notifier);
+    }
+    g_free(irq_set);
+}
+
+static void vfio_unregister_passive_reset_notifier(VFIOPCIDevice *vdev)
+{
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret;
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = -1;
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to de-assign error fd: %m");
+    }
+    g_free(irq_set);
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->passive_reset_notifier),
+                        NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->passive_reset_notifier);
+}
+
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
@@ -2860,6 +3054,8 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
         }
     }
 
+    vfio_register_passive_reset_notifier(vdev);
+    vfio_register_non_fatal_err_notifier(vdev);
     vfio_register_err_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
@@ -2900,6 +3096,12 @@  static void vfio_exitfn(PCIDevice *pdev)
 
     vfio_unregister_req_notifier(vdev);
     vfio_unregister_err_notifier(vdev);
+    if (event_notifier_get_fd(&vdev->non_fatal_err_notifier)) {
+        vfio_unregister_non_fatal_err_notifier(vdev);
+    }
+    if (event_notifier_get_fd(&vdev->passive_reset_notifier)) {
+        vfio_unregister_passive_reset_notifier(vdev);
+    }
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
     vfio_disable_interrupts(vdev);
     if (vdev->intx.mmap_timer) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 34e8b04..b35c617 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -119,6 +119,8 @@  typedef struct VFIOPCIDevice {
     void *igd_opregion;
     PCIHostDeviceAddress host;
     EventNotifier err_notifier;
+    EventNotifier non_fatal_err_notifier;
+    EventNotifier passive_reset_notifier;
     EventNotifier req_notifier;
     int (*resetfn)(struct VFIOPCIDevice *);
     uint32_t vendor_id;
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 759b850..726ddbe 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -433,6 +433,8 @@  enum {
 	VFIO_PCI_MSIX_IRQ_INDEX,
 	VFIO_PCI_ERR_IRQ_INDEX,
 	VFIO_PCI_REQ_IRQ_INDEX,
+	VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX,
+	VFIO_PCI_PASSIVE_RESET_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };