diff mbox

vfio/pci: Support error recovery

Message ID 1480246457-10368-1-git-send-email-caoj.fnst@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Cao jin Nov. 27, 2016, 11:34 a.m. UTC
It is user space driver's or device-specific driver's(in guest) responsbility
to do a serious recovery when error happened. Link-reset is one part of
recovery, when pci device is assigned to VM via vfio, link-reset will do
twice in host & guest separately, which will cause many trouble for a
successful recovery, so, disable the vfio-pci's link-reset in aer driver
in host, this is a keypoint for guest to do error recovery successfully.

CC: alex.williamson@redhat.com
CC: mst@redhat.com
Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
---
This is actually a RFC version(has debug lines left), and has minor changes in
aer driver, so I think maybe it is better not to CC pci guys in this round.
Later will do.

 drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
 drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 3 files changed, 74 insertions(+), 3 deletions(-)

Comments

Michael S. Tsirkin Nov. 28, 2016, 3 a.m. UTC | #1
On Sun, Nov 27, 2016 at 07:34:17PM +0800, Cao jin wrote:
> It is user space driver's or device-specific driver's(in guest) responsbility
> to do a serious recovery when error happened. Link-reset is one part of
> recovery, when pci device is assigned to VM via vfio, link-reset will do
> twice in host & guest separately, which will cause many trouble for a
> successful recovery, so, disable the vfio-pci's link-reset in aer driver
> in host, this is a keypoint for guest to do error recovery successfully.
> 
> CC: alex.williamson@redhat.com
> CC: mst@redhat.com
> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> ---
> This is actually a RFC version(has debug lines left), and has minor changes in
> aer driver, so I think maybe it is better not to CC pci guys in this round.
> Later will do.
> 
>  drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
>  drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
>  drivers/vfio/pci/vfio_pci_private.h |  2 ++
>  3 files changed, 74 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
> index 521e39c..289fb8e 100644
> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> @@ -496,7 +496,17 @@ static void do_recovery(struct pci_dev *dev, int severity)
>  			"error_detected",
>  			report_error_detected);
>  
> -	if (severity == AER_FATAL) {
> +	/* vfio-pci as a general meta driver, it actually couldn't do any real
> +	 * recovery for device. It is user space driver, or device-specific
> +	 * driver in guest who should take care of the serious error recovery,
> +	 * link reset actually is one part of whole recovery. Doing reset_link
> +	 * in aer driver of host kernel for vfio-pci devices will cause many
> +	 * trouble for user space driver or guest's device-specific driver,
> +	 * for example: the serious recovery often need to read register in
> +	 * config space, but if register reading happens during link-resetting,
> +	 * it is quite possible to return invalid value like all F's, which
> +	 * will result in unpredictable error. */

Fix multi-comment style please.

> +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {

You really want some flag in the device, or something similar.
Also, how do we know driver is not going away at this point?


>  		result = reset_link(dev);
>  		if (result != PCI_ERS_RESULT_RECOVERED)
>  			goto failed;
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 712a849..aefd751 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -535,6 +535,15 @@ static long vfio_pci_ioctl(void *device_data,
>  	struct vfio_pci_device *vdev = device_data;
>  	unsigned long minsz;
>  
> +	if (vdev->aer_recovering && (cmd == VFIO_DEVICE_SET_IRQS ||
> +	    cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
> +		int ret;
> +		ret = wait_for_completion_interruptible(
> +			&vdev->aer_error_completion);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	if (cmd == VFIO_DEVICE_GET_INFO) {
>  		struct vfio_device_info info;
>  
> @@ -1117,6 +1126,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	vdev->irq_type = VFIO_PCI_NUM_IRQS;
>  	mutex_init(&vdev->igate);
>  	spin_lock_init(&vdev->irqlock);
> +	init_completion(&vdev->aer_error_completion);
>  
>  	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
>  	if (ret) {
> @@ -1176,6 +1186,9 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  {
>  	struct vfio_pci_device *vdev;
>  	struct vfio_device *device;
> +	u32 uncor_status = 0;
> +	unsigned int aer_cap_offset = 0;
> +	int ret;
>  
>  	device = vfio_device_get_from_dev(&pdev->dev);
>  	if (device == NULL)
> @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> +	/* get device's uncorrectable error status as soon as possible,
> +	 * and signal it to user space. The later we read it, the possibility
> +	 * the register value is mangled grows. */
> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
> +        if (ret)
> +                return PCI_ERS_RESULT_DISCONNECT;
> +
> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
>  	mutex_lock(&vdev->igate);
> +    
> +	vdev->aer_recovering = true;
> +	reinit_completion(&vdev->aer_error_completion);
> +
> +	/* suspend config space access from user space,
> +	 * when vfio-pci's error recovery process is on */

what about access to memory etc? Do you need to suspend this as well?

> +	pci_cfg_access_trylock(vdev->pdev);

If you trylock, you need to handle failure.

>  
> -	if (vdev->err_trigger)
> -		eventfd_signal(vdev->err_trigger, 1);
> +	if (vdev->err_trigger && uncor_status) {
> +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
> +		/* signal uncorrectable error status to user space */
> +		eventfd_signal(vdev->err_trigger, uncor_status);
> +        }
>  
>  	mutex_unlock(&vdev->igate);
>  
> @@ -1199,8 +1232,34 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  	return PCI_ERS_RESULT_CAN_RECOVER;
>  }
>  
> +static void vfio_pci_aer_resume(struct pci_dev *pdev)
> +{
> +	struct vfio_pci_device *vdev;
> +	struct vfio_device *device;
> +
> +	device = vfio_device_get_from_dev(&pdev->dev);
> +	if (device == NULL)
> +		return;
> +
> +	vdev = vfio_device_data(device);
> +	if (vdev == NULL) {
> +		vfio_device_put(device);
> +		return;
> +	}
> +
> +	/* vfio-pci's error recovery is done, time to
> +	 * resume pci config space's accesses */
> +	pci_cfg_access_unlock(vdev->pdev);

pci_cfg_access blocks accesses through sysfs.
I would not bother, focus on accesses through vfio.


> +
> +	vdev->aer_recovering = false;

don't you need locks when changing aer_recovering?

> +	complete_all(&vdev->aer_error_completion);
> +
> +	vfio_device_put(device);
> +}
> +
>  static const struct pci_error_handlers vfio_err_handlers = {
>  	.error_detected = vfio_pci_aer_err_detected,
> +	.resume         = vfio_pci_aer_resume,
>  };
>  
>  static struct pci_driver vfio_pci_driver = {
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index 8a7d546..ebf1041 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -83,6 +83,8 @@ struct vfio_pci_device {
>  	bool			bardirty;
>  	bool			has_vga;
>  	bool			needs_reset;
> +	bool			aer_recovering;
> +	struct completion	aer_error_completion;
>  	struct pci_saved_state	*pci_saved_state;
>  	int			refcnt;
>  	struct eventfd_ctx	*err_trigger;
> -- 
> 1.8.3.1
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Nov. 28, 2016, 9:32 a.m. UTC | #2
On 11/28/2016 11:00 AM, Michael S. Tsirkin wrote:
> On Sun, Nov 27, 2016 at 07:34:17PM +0800, Cao jin wrote:
>> It is user space driver's or device-specific driver's(in guest) responsbility
>> to do a serious recovery when error happened. Link-reset is one part of
>> recovery, when pci device is assigned to VM via vfio, link-reset will do
>> twice in host & guest separately, which will cause many trouble for a
>> successful recovery, so, disable the vfio-pci's link-reset in aer driver
>> in host, this is a keypoint for guest to do error recovery successfully.
>>
>> CC: alex.williamson@redhat.com
>> CC: mst@redhat.com
>> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
>> ---
>> This is actually a RFC version(has debug lines left), and has minor changes in
>> aer driver, so I think maybe it is better not to CC pci guys in this round.
>> Later will do.
>>
>>   drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
>>   drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
>>   drivers/vfio/pci/vfio_pci_private.h |  2 ++
>>   3 files changed, 74 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
>> index 521e39c..289fb8e 100644
>> --- a/drivers/pci/pcie/aer/aerdrv_core.c
>> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
>> @@ -496,7 +496,17 @@ static void do_recovery(struct pci_dev *dev, int severity)
>>   			"error_detected",
>>   			report_error_detected);
>>
>> -	if (severity == AER_FATAL) {
>> +	/* vfio-pci as a general meta driver, it actually couldn't do any real
>> +	 * recovery for device. It is user space driver, or device-specific
>> +	 * driver in guest who should take care of the serious error recovery,
>> +	 * link reset actually is one part of whole recovery. Doing reset_link
>> +	 * in aer driver of host kernel for vfio-pci devices will cause many
>> +	 * trouble for user space driver or guest's device-specific driver,
>> +	 * for example: the serious recovery often need to read register in
>> +	 * config space, but if register reading happens during link-resetting,
>> +	 * it is quite possible to return invalid value like all F's, which
>> +	 * will result in unpredictable error. */
>
> Fix multi-comment style please.
>
>> +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
>
> You really want some flag in the device, or something similar.
> Also, how do we know driver is not going away at this point?
>

I didn't think of this condition, and I don't quite follow how would 
driver go away?(device has error happened, then is removed?)

>>   		result = reset_link(dev);
>>   		if (result != PCI_ERS_RESULT_RECOVERED)
>>   			goto failed;

>> @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>>   		return PCI_ERS_RESULT_DISCONNECT;
>>   	}
>>
>> +	/* get device's uncorrectable error status as soon as possible,
>> +	 * and signal it to user space. The later we read it, the possibility
>> +	 * the register value is mangled grows. */
>> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
>> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset +
>> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
>> +        if (ret)
>> +                return PCI_ERS_RESULT_DISCONNECT;
>> +
>> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
>>   	mutex_lock(&vdev->igate);
>> +
>> +	vdev->aer_recovering = true;
>> +	reinit_completion(&vdev->aer_error_completion);
>> +
>> +	/* suspend config space access from user space,
>> +	 * when vfio-pci's error recovery process is on */
>
> what about access to memory etc? Do you need to suspend this as well?
>

Yes, this question came into my mind a little bit, but I didn't see some 
existing APIs like pci_cfg_access_xxx which can help to do this.(I am 
still not familiar with kernel)

>> +	pci_cfg_access_trylock(vdev->pdev);
>
> If you trylock, you need to handle failure.

try lock returns 0 if access is already locked, 1 otherwise. Is it 
necessary to check its return value?
Michael S. Tsirkin Nov. 30, 2016, 1:46 a.m. UTC | #3
On Mon, Nov 28, 2016 at 05:32:15PM +0800, Cao jin wrote:
> 
> 
> On 11/28/2016 11:00 AM, Michael S. Tsirkin wrote:
> > On Sun, Nov 27, 2016 at 07:34:17PM +0800, Cao jin wrote:
> > > It is user space driver's or device-specific driver's(in guest) responsbility
> > > to do a serious recovery when error happened. Link-reset is one part of
> > > recovery, when pci device is assigned to VM via vfio, link-reset will do
> > > twice in host & guest separately, which will cause many trouble for a
> > > successful recovery, so, disable the vfio-pci's link-reset in aer driver
> > > in host, this is a keypoint for guest to do error recovery successfully.
> > > 
> > > CC: alex.williamson@redhat.com
> > > CC: mst@redhat.com
> > > Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> > > ---
> > > This is actually a RFC version(has debug lines left), and has minor changes in
> > > aer driver, so I think maybe it is better not to CC pci guys in this round.
> > > Later will do.
> > > 
> > >   drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
> > >   drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
> > >   drivers/vfio/pci/vfio_pci_private.h |  2 ++
> > >   3 files changed, 74 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
> > > index 521e39c..289fb8e 100644
> > > --- a/drivers/pci/pcie/aer/aerdrv_core.c
> > > +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> > > @@ -496,7 +496,17 @@ static void do_recovery(struct pci_dev *dev, int severity)
> > >   			"error_detected",
> > >   			report_error_detected);
> > > 
> > > -	if (severity == AER_FATAL) {
> > > +	/* vfio-pci as a general meta driver, it actually couldn't do any real
> > > +	 * recovery for device. It is user space driver, or device-specific
> > > +	 * driver in guest who should take care of the serious error recovery,
> > > +	 * link reset actually is one part of whole recovery. Doing reset_link
> > > +	 * in aer driver of host kernel for vfio-pci devices will cause many
> > > +	 * trouble for user space driver or guest's device-specific driver,
> > > +	 * for example: the serious recovery often need to read register in
> > > +	 * config space, but if register reading happens during link-resetting,
> > > +	 * it is quite possible to return invalid value like all F's, which
> > > +	 * will result in unpredictable error. */
> > 
> > Fix multi-comment style please.
> > 
> > > +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
> > 
> > You really want some flag in the device, or something similar.
> > Also, how do we know driver is not going away at this point?
> > 
> 
> I didn't think of this condition, and I don't quite follow how would driver
> go away?(device has error happened, then is removed?)

Yes - hotplug request detected. Does something prevent this?

> > >   		result = reset_link(dev);
> > >   		if (result != PCI_ERS_RESULT_RECOVERED)
> > >   			goto failed;
> 
> > > @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
> > >   		return PCI_ERS_RESULT_DISCONNECT;
> > >   	}
> > > 
> > > +	/* get device's uncorrectable error status as soon as possible,
> > > +	 * and signal it to user space. The later we read it, the possibility
> > > +	 * the register value is mangled grows. */
> > > +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
> > > +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset +
> > > +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
> > > +        if (ret)
> > > +                return PCI_ERS_RESULT_DISCONNECT;
> > > +
> > > +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
> > >   	mutex_lock(&vdev->igate);
> > > +
> > > +	vdev->aer_recovering = true;
> > > +	reinit_completion(&vdev->aer_error_completion);
> > > +
> > > +	/* suspend config space access from user space,
> > > +	 * when vfio-pci's error recovery process is on */
> > 
> > what about access to memory etc? Do you need to suspend this as well?
> > 
> 
> Yes, this question came into my mind a little bit, but I didn't see some
> existing APIs like pci_cfg_access_xxx which can help to do this.(I am still
> not familiar with kernel)

This isn't easy to do at all.


> > > +	pci_cfg_access_trylock(vdev->pdev);
> > 
> > If you trylock, you need to handle failure.
> 
> try lock returns 0 if access is already locked, 1 otherwise. Is it necessary
> to check its return value?

Locked by whom? You blissfully access as if it's locked by you.

> 
> -- 
> Sincerely,
> Cao jin
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Dec. 1, 2016, 4:04 a.m. UTC | #4
On Sun, 27 Nov 2016 19:34:17 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> It is user space driver's or device-specific driver's(in guest) responsbility
> to do a serious recovery when error happened. Link-reset is one part of
> recovery, when pci device is assigned to VM via vfio, link-reset will do
> twice in host & guest separately, which will cause many trouble for a
> successful recovery, so, disable the vfio-pci's link-reset in aer driver
> in host, this is a keypoint for guest to do error recovery successfully.
> 
> CC: alex.williamson@redhat.com
> CC: mst@redhat.com
> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> ---
> This is actually a RFC version(has debug lines left), and has minor changes in
> aer driver, so I think maybe it is better not to CC pci guys in this round.
> Later will do.
> 
>  drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
>  drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
>  drivers/vfio/pci/vfio_pci_private.h |  2 ++
>  3 files changed, 74 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
> index 521e39c..289fb8e 100644
> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> @@ -496,7 +496,17 @@ static void do_recovery(struct pci_dev *dev, int severity)
>  			"error_detected",
>  			report_error_detected);
>  
> -	if (severity == AER_FATAL) {
> +	/* vfio-pci as a general meta driver, it actually couldn't do any real
> +	 * recovery for device. It is user space driver, or device-specific
> +	 * driver in guest who should take care of the serious error recovery,
> +	 * link reset actually is one part of whole recovery. Doing reset_link
> +	 * in aer driver of host kernel for vfio-pci devices will cause many
> +	 * trouble for user space driver or guest's device-specific driver,
> +	 * for example: the serious recovery often need to read register in
> +	 * config space, but if register reading happens during link-resetting,
> +	 * it is quite possible to return invalid value like all F's, which
> +	 * will result in unpredictable error. */
> +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
>  		result = reset_link(dev);
>  		if (result != PCI_ERS_RESULT_RECOVERED)
>  			goto failed;

This is not acceptable.  If we want to make a path through AER recovery
that does not reset the link, there should be a way for the driver to
request it.  Testing the driver name is a horrible hack.  The other
question is what guarantees does vfio make that the device does get
reset?  If an AER fault occurs and the user doesn't do a reset, what
happens when that device is released and a host driver tries to make
use of it?  The user makes no commitment to do a reset and there are
only limited configurations where we even allow the user to perform a
reset.

> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 712a849..aefd751 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -535,6 +535,15 @@ static long vfio_pci_ioctl(void *device_data,
>  	struct vfio_pci_device *vdev = device_data;
>  	unsigned long minsz;
>  
> +	if (vdev->aer_recovering && (cmd == VFIO_DEVICE_SET_IRQS ||
> +	    cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
> +		int ret;
> +		ret = wait_for_completion_interruptible(
> +			&vdev->aer_error_completion);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	if (cmd == VFIO_DEVICE_GET_INFO) {
>  		struct vfio_device_info info;
>  
> @@ -1117,6 +1126,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	vdev->irq_type = VFIO_PCI_NUM_IRQS;
>  	mutex_init(&vdev->igate);
>  	spin_lock_init(&vdev->irqlock);
> +	init_completion(&vdev->aer_error_completion);
>  
>  	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
>  	if (ret) {
> @@ -1176,6 +1186,9 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  {
>  	struct vfio_pci_device *vdev;
>  	struct vfio_device *device;
> +	u32 uncor_status = 0;
> +	unsigned int aer_cap_offset = 0;

Unnecessary initialization.

> +	int ret;
>  
>  	device = vfio_device_get_from_dev(&pdev->dev);
>  	if (device == NULL)
> @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> +	/* get device's uncorrectable error status as soon as possible,
> +	 * and signal it to user space. The later we read it, the possibility
> +	 * the register value is mangled grows. */

Bad comment style (throughout).

> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
> +        if (ret)
> +                return PCI_ERS_RESULT_DISCONNECT;
> +
> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
>  	mutex_lock(&vdev->igate);
> +    
> +	vdev->aer_recovering = true;
> +	reinit_completion(&vdev->aer_error_completion);
> +
> +	/* suspend config space access from user space,
> +	 * when vfio-pci's error recovery process is on */
> +	pci_cfg_access_trylock(vdev->pdev);
>  
> -	if (vdev->err_trigger)
> -		eventfd_signal(vdev->err_trigger, 1);
> +	if (vdev->err_trigger && uncor_status) {
> +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
> +		/* signal uncorrectable error status to user space */
> +		eventfd_signal(vdev->err_trigger, uncor_status);
> +        }

} else... what?  By bypassing the AER link reset we've assumed
responsibility for resetting the device.  Even if we signal the user,
what guarantee do we have that the device is recovered?

>  
>  	mutex_unlock(&vdev->igate);
>  
> @@ -1199,8 +1232,34 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  	return PCI_ERS_RESULT_CAN_RECOVER;
>  }
>  
> +static void vfio_pci_aer_resume(struct pci_dev *pdev)
> +{
> +	struct vfio_pci_device *vdev;
> +	struct vfio_device *device;
> +
> +	device = vfio_device_get_from_dev(&pdev->dev);
> +	if (device == NULL)
> +		return;
> +
> +	vdev = vfio_device_data(device);
> +	if (vdev == NULL) {
> +		vfio_device_put(device);
> +		return;
> +	}
> +
> +	/* vfio-pci's error recovery is done, time to
> +	 * resume pci config space's accesses */
> +	pci_cfg_access_unlock(vdev->pdev);
> +
> +	vdev->aer_recovering = false;
> +	complete_all(&vdev->aer_error_completion);
> +
> +	vfio_device_put(device);
> +}
> +
>  static const struct pci_error_handlers vfio_err_handlers = {
>  	.error_detected = vfio_pci_aer_err_detected,
> +	.resume         = vfio_pci_aer_resume,
>  };
>  
>  static struct pci_driver vfio_pci_driver = {
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index 8a7d546..ebf1041 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -83,6 +83,8 @@ struct vfio_pci_device {
>  	bool			bardirty;
>  	bool			has_vga;
>  	bool			needs_reset;
> +	bool			aer_recovering;
> +	struct completion	aer_error_completion;
>  	struct pci_saved_state	*pci_saved_state;
>  	int			refcnt;
>  	struct eventfd_ctx	*err_trigger;

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 1, 2016, 4:51 a.m. UTC | #5
On Wed, Nov 30, 2016 at 09:04:13PM -0700, Alex Williamson wrote:
> On Sun, 27 Nov 2016 19:34:17 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
> > It is user space driver's or device-specific driver's(in guest) responsbility
> > to do a serious recovery when error happened. Link-reset is one part of
> > recovery, when pci device is assigned to VM via vfio, link-reset will do
> > twice in host & guest separately, which will cause many trouble for a
> > successful recovery, so, disable the vfio-pci's link-reset in aer driver
> > in host, this is a keypoint for guest to do error recovery successfully.
> > 
> > CC: alex.williamson@redhat.com
> > CC: mst@redhat.com
> > Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> > ---
> > This is actually a RFC version(has debug lines left), and has minor changes in
> > aer driver, so I think maybe it is better not to CC pci guys in this round.
> > Later will do.
> > 
> >  drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
> >  drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
> >  drivers/vfio/pci/vfio_pci_private.h |  2 ++
> >  3 files changed, 74 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
> > index 521e39c..289fb8e 100644
> > --- a/drivers/pci/pcie/aer/aerdrv_core.c
> > +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> > @@ -496,7 +496,17 @@ static void do_recovery(struct pci_dev *dev, int severity)
> >  			"error_detected",
> >  			report_error_detected);
> >  
> > -	if (severity == AER_FATAL) {
> > +	/* vfio-pci as a general meta driver, it actually couldn't do any real
> > +	 * recovery for device. It is user space driver, or device-specific
> > +	 * driver in guest who should take care of the serious error recovery,
> > +	 * link reset actually is one part of whole recovery. Doing reset_link
> > +	 * in aer driver of host kernel for vfio-pci devices will cause many
> > +	 * trouble for user space driver or guest's device-specific driver,
> > +	 * for example: the serious recovery often need to read register in
> > +	 * config space, but if register reading happens during link-resetting,
> > +	 * it is quite possible to return invalid value like all F's, which
> > +	 * will result in unpredictable error. */
> > +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
> >  		result = reset_link(dev);
> >  		if (result != PCI_ERS_RESULT_RECOVERED)
> >  			goto failed;
> 
> This is not acceptable.  If we want to make a path through AER recovery
> that does not reset the link, there should be a way for the driver to
> request it.  Testing the driver name is a horrible hack.

Right, just set a flag or something.

> The other
> question is what guarantees does vfio make that the device does get
> reset?  If an AER fault occurs and the user doesn't do a reset, what
> happens when that device is released and a host driver tries to make
> use of it?  The user makes no commitment to do a reset and there are
> only limited configurations where we even allow the user to perform a
> reset.

Let's try to work out what can be done a bit more constructively.
Doesn't vfio already reset device on release?
How about detecting there was an unhandled AER event
and doing link reset at that point?



> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index 712a849..aefd751 100644
> > --- a/drivers/vfio/pci/vfio_pci.c
> > +++ b/drivers/vfio/pci/vfio_pci.c
> > @@ -535,6 +535,15 @@ static long vfio_pci_ioctl(void *device_data,
> >  	struct vfio_pci_device *vdev = device_data;
> >  	unsigned long minsz;
> >  
> > +	if (vdev->aer_recovering && (cmd == VFIO_DEVICE_SET_IRQS ||
> > +	    cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
> > +		int ret;
> > +		ret = wait_for_completion_interruptible(
> > +			&vdev->aer_error_completion);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> >  	if (cmd == VFIO_DEVICE_GET_INFO) {
> >  		struct vfio_device_info info;
> >  
> > @@ -1117,6 +1126,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	vdev->irq_type = VFIO_PCI_NUM_IRQS;
> >  	mutex_init(&vdev->igate);
> >  	spin_lock_init(&vdev->irqlock);
> > +	init_completion(&vdev->aer_error_completion);
> >  
> >  	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
> >  	if (ret) {
> > @@ -1176,6 +1186,9 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
> >  {
> >  	struct vfio_pci_device *vdev;
> >  	struct vfio_device *device;
> > +	u32 uncor_status = 0;
> > +	unsigned int aer_cap_offset = 0;
> 
> Unnecessary initialization.
> 
> > +	int ret;
> >  
> >  	device = vfio_device_get_from_dev(&pdev->dev);
> >  	if (device == NULL)
> > @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
> >  		return PCI_ERS_RESULT_DISCONNECT;
> >  	}
> >  
> > +	/* get device's uncorrectable error status as soon as possible,
> > +	 * and signal it to user space. The later we read it, the possibility
> > +	 * the register value is mangled grows. */
> 
> Bad comment style (throughout).
> 
> > +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
> > +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
> > +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
> > +        if (ret)
> > +                return PCI_ERS_RESULT_DISCONNECT;
> > +
> > +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
> >  	mutex_lock(&vdev->igate);
> > +    
> > +	vdev->aer_recovering = true;
> > +	reinit_completion(&vdev->aer_error_completion);
> > +
> > +	/* suspend config space access from user space,
> > +	 * when vfio-pci's error recovery process is on */
> > +	pci_cfg_access_trylock(vdev->pdev);
> >  
> > -	if (vdev->err_trigger)
> > -		eventfd_signal(vdev->err_trigger, 1);
> > +	if (vdev->err_trigger && uncor_status) {
> > +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
> > +		/* signal uncorrectable error status to user space */
> > +		eventfd_signal(vdev->err_trigger, uncor_status);

What does this mean that we trigger with uncor_status as opposed to 1?

> > +        }
> 
> } else... what?  By bypassing the AER link reset we've assumed
> responsibility for resetting the device.  Even if we signal the user,
> what guarantee do we have that the device is recovered?

So set flag and recover ?

> >  
> >  	mutex_unlock(&vdev->igate);
> >  
> > @@ -1199,8 +1232,34 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
> >  	return PCI_ERS_RESULT_CAN_RECOVER;
> >  }
> >  
> > +static void vfio_pci_aer_resume(struct pci_dev *pdev)
> > +{
> > +	struct vfio_pci_device *vdev;
> > +	struct vfio_device *device;
> > +
> > +	device = vfio_device_get_from_dev(&pdev->dev);
> > +	if (device == NULL)
> > +		return;
> > +
> > +	vdev = vfio_device_data(device);
> > +	if (vdev == NULL) {
> > +		vfio_device_put(device);
> > +		return;
> > +	}
> > +
> > +	/* vfio-pci's error recovery is done, time to
> > +	 * resume pci config space's accesses */
> > +	pci_cfg_access_unlock(vdev->pdev);
> > +
> > +	vdev->aer_recovering = false;
> > +	complete_all(&vdev->aer_error_completion);
> > +
> > +	vfio_device_put(device);
> > +}
> > +
> >  static const struct pci_error_handlers vfio_err_handlers = {
> >  	.error_detected = vfio_pci_aer_err_detected,
> > +	.resume         = vfio_pci_aer_resume,
> >  };
> >  
> >  static struct pci_driver vfio_pci_driver = {
> > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> > index 8a7d546..ebf1041 100644
> > --- a/drivers/vfio/pci/vfio_pci_private.h
> > +++ b/drivers/vfio/pci/vfio_pci_private.h
> > @@ -83,6 +83,8 @@ struct vfio_pci_device {
> >  	bool			bardirty;
> >  	bool			has_vga;
> >  	bool			needs_reset;
> > +	bool			aer_recovering;
> > +	struct completion	aer_error_completion;
> >  	struct pci_saved_state	*pci_saved_state;
> >  	int			refcnt;
> >  	struct eventfd_ctx	*err_trigger;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 1, 2016, 1:38 p.m. UTC | #6
On 11/30/2016 09:46 AM, Michael S. Tsirkin wrote:
> On Mon, Nov 28, 2016 at 05:32:15PM +0800, Cao jin wrote:
>>
>>

>>>
>>>> +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
>>>
>>> You really want some flag in the device, or something similar.
>>> Also, how do we know driver is not going away at this point?
>>>
>>
>> I didn't think of this condition, and I don't quite follow how would driver
>> go away?(device has error happened, then is removed?)
> 
> Yes - hotplug request detected. Does something prevent this?
> 

I wasn't realized there would be possible to have hotplug happened
during error recovery. But, it seems is the aer driver's thing to
consider it?

>>>>   		result = reset_link(dev);
>>>>   		if (result != PCI_ERS_RESULT_RECOVERED)
>>>>   			goto failed;
>>
>>>> @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>>>>   		return PCI_ERS_RESULT_DISCONNECT;
>>>>   	}
>>>>
>>>> +	/* get device's uncorrectable error status as soon as possible,
>>>> +	 * and signal it to user space. The later we read it, the possibility
>>>> +	 * the register value is mangled grows. */
>>>> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
>>>> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset +
>>>> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
>>>> +        if (ret)
>>>> +                return PCI_ERS_RESULT_DISCONNECT;
>>>> +
>>>> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
>>>>   	mutex_lock(&vdev->igate);
>>>> +
>>>> +	vdev->aer_recovering = true;
>>>> +	reinit_completion(&vdev->aer_error_completion);
>>>> +
>>>> +	/* suspend config space access from user space,
>>>> +	 * when vfio-pci's error recovery process is on */
>>>
>>> what about access to memory etc? Do you need to suspend this as well?
>>>
>>
>> Yes, this question came into my mind a little bit, but I didn't see some
>> existing APIs like pci_cfg_access_xxx which can help to do this.(I am still
>> not familiar with kernel)
> 
> This isn't easy to do at all.
> 

I guess we can use completion in vfio_pci_rw to block all kinds of
access during vfio's error recovery. But from another perspective, now
that we have disabled reset_link in host, I think the access to device
could be performed normally. To me, the benefit of using pci_cfg_access
is: we won't bother to do "wait 3s to do guest's link reset"

> 
>>>> +	pci_cfg_access_trylock(vdev->pdev);
>>>
>>> If you trylock, you need to handle failure.
>>
>> try lock returns 0 if access is already locked, 1 otherwise. Is it necessary
>> to check its return value?
> 
> Locked by whom? You blissfully access as if it's locked by you.
> 
>>
>> -- 
>> Sincerely,
>> Cao jin
>>
> 
> 
> .
>
Cao jin Dec. 1, 2016, 1:40 p.m. UTC | #7
On 12/01/2016 12:04 PM, Alex Williamson wrote:
> On Sun, 27 Nov 2016 19:34:17 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> It is user space driver's or device-specific driver's(in guest) responsbility
>> to do a serious recovery when error happened. Link-reset is one part of
>> recovery, when pci device is assigned to VM via vfio, link-reset will do
>> twice in host & guest separately, which will cause many trouble for a
>> successful recovery, so, disable the vfio-pci's link-reset in aer driver
>> in host, this is a keypoint for guest to do error recovery successfully.
>>
>> CC: alex.williamson@redhat.com
>> CC: mst@redhat.com
>> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
>> ---
>> This is actually a RFC version(has debug lines left), and has minor changes in
>> aer driver, so I think maybe it is better not to CC pci guys in this round.
>> Later will do.
>>
>>  drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
>>  drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
>>  drivers/vfio/pci/vfio_pci_private.h |  2 ++
>>  3 files changed, 74 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
>> index 521e39c..289fb8e 100644
>> --- a/drivers/pci/pcie/aer/aerdrv_core.c
>> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
>> @@ -496,7 +496,17 @@ static void do_recovery(struct pci_dev *dev, int severity)
>>  			"error_detected",
>>  			report_error_detected);
>>  
>> -	if (severity == AER_FATAL) {
>> +	/* vfio-pci as a general meta driver, it actually couldn't do any real
>> +	 * recovery for device. It is user space driver, or device-specific
>> +	 * driver in guest who should take care of the serious error recovery,
>> +	 * link reset actually is one part of whole recovery. Doing reset_link
>> +	 * in aer driver of host kernel for vfio-pci devices will cause many
>> +	 * trouble for user space driver or guest's device-specific driver,
>> +	 * for example: the serious recovery often need to read register in
>> +	 * config space, but if register reading happens during link-resetting,
>> +	 * it is quite possible to return invalid value like all F's, which
>> +	 * will result in unpredictable error. */
>> +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
>>  		result = reset_link(dev);
>>  		if (result != PCI_ERS_RESULT_RECOVERED)
>>  			goto failed;
> 
> This is not acceptable.  If we want to make a path through AER recovery
> that does not reset the link, there should be a way for the driver to
> request it.  Testing the driver name is a horrible hack.  The other
> question is what guarantees does vfio make that the device does get
> reset?  

I am not sure how vfio guarantee that...When device is assigned to VM,
we have that guarantees(aer driver in guest driver will do that), so I
think it is a well-behaved user space driver's responsibility to do link
reset?  And I think if there is a user space driver, it is surely its
responsibility to consider how to do serious error recovery, like I said
before, vfio, as a general meta driver, it surely don't know how each
device does its specific recovery, except bus/slot reset

> If an AER fault occurs and the user doesn't do a reset, what
> happens when that device is released and a host driver tries to make
> use of it?  The user makes no commitment to do a reset and there are
> only limited configurations where we even allow the user to perform a
> reset.
> 

Limited? Do you mean the things __pci_dev_reset() can do?

...
> 
>> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
>> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
>> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
>> +        if (ret)
>> +                return PCI_ERS_RESULT_DISCONNECT;
>> +
>> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
>>  	mutex_lock(&vdev->igate);
>> +    
>> +	vdev->aer_recovering = true;
>> +	reinit_completion(&vdev->aer_error_completion);
>> +
>> +	/* suspend config space access from user space,
>> +	 * when vfio-pci's error recovery process is on */
>> +	pci_cfg_access_trylock(vdev->pdev);
>>  
>> -	if (vdev->err_trigger)
>> -		eventfd_signal(vdev->err_trigger, 1);
>> +	if (vdev->err_trigger && uncor_status) {
>> +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
>> +		/* signal uncorrectable error status to user space */
>> +		eventfd_signal(vdev->err_trigger, uncor_status);
>> +        }
> 
> } else... what?  By bypassing the AER link reset we've assumed
> responsibility for resetting the device.  Even if we signal the user,
> what guarantee do we have that the device is recovered?
> 

else...consider it as a fake error notification and ignore?

I am not sure I understand your thoughts totally, but it seems my
previous comments apply, that is: it is well-behaved user space driver's
responsibility to do a serious recovery.

In my understanding, user space driver has 2 category: one is VM(has
guest OS running inside), the other is ordinary user space program.

When device is assigned to a VM, (qemu + guest OS) will do fully steps
to do recovery(roughly is what struct pci_error_handlers has). So,
equally, if it is a ordinary user space program acting as the driver,
the responsibility belongs to it.


Sincerely,
Cao jin
>>  
>>  	mutex_unlock(&vdev->igate);
>>  
>> @@ -1199,8 +1232,34 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>>  	return PCI_ERS_RESULT_CAN_RECOVER;
>>  }
>>  
>> +static void vfio_pci_aer_resume(struct pci_dev *pdev)
>> +{
>> +	struct vfio_pci_device *vdev;
>> +	struct vfio_device *device;
>> +
>> +	device = vfio_device_get_from_dev(&pdev->dev);
>> +	if (device == NULL)
>> +		return;
>> +
>> +	vdev = vfio_device_data(device);
>> +	if (vdev == NULL) {
>> +		vfio_device_put(device);
>> +		return;
>> +	}
>> +
>> +	/* vfio-pci's error recovery is done, time to
>> +	 * resume pci config space's accesses */
>> +	pci_cfg_access_unlock(vdev->pdev);
>> +
>> +	vdev->aer_recovering = false;
>> +	complete_all(&vdev->aer_error_completion);
>> +
>> +	vfio_device_put(device);
>> +}
>> +
>>  static const struct pci_error_handlers vfio_err_handlers = {
>>  	.error_detected = vfio_pci_aer_err_detected,
>> +	.resume         = vfio_pci_aer_resume,
>>  };
>>  
>>  static struct pci_driver vfio_pci_driver = {
>> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
>> index 8a7d546..ebf1041 100644
>> --- a/drivers/vfio/pci/vfio_pci_private.h
>> +++ b/drivers/vfio/pci/vfio_pci_private.h
>> @@ -83,6 +83,8 @@ struct vfio_pci_device {
>>  	bool			bardirty;
>>  	bool			has_vga;
>>  	bool			needs_reset;
>> +	bool			aer_recovering;
>> +	struct completion	aer_error_completion;
>>  	struct pci_saved_state	*pci_saved_state;
>>  	int			refcnt;
>>  	struct eventfd_ctx	*err_trigger;
> 
> 
> 
> .
> 




--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 1, 2016, 1:40 p.m. UTC | #8
On 12/01/2016 12:51 PM, Michael S. Tsirkin wrote:
> On Wed, Nov 30, 2016 at 09:04:13PM -0700, Alex Williamson wrote:
>> On Sun, 27 Nov 2016 19:34:17 +0800
>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>

>>> @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>>>  		return PCI_ERS_RESULT_DISCONNECT;
>>>  	}
>>>  
>>> +	/* get device's uncorrectable error status as soon as possible,
>>> +	 * and signal it to user space. The later we read it, the possibility
>>> +	 * the register value is mangled grows. */
>>
>> Bad comment style (throughout).
>>
>>> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
>>> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
>>> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
>>> +        if (ret)
>>> +                return PCI_ERS_RESULT_DISCONNECT;
>>> +
>>> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
>>>  	mutex_lock(&vdev->igate);
>>> +    
>>> +	vdev->aer_recovering = true;
>>> +	reinit_completion(&vdev->aer_error_completion);
>>> +
>>> +	/* suspend config space access from user space,
>>> +	 * when vfio-pci's error recovery process is on */
>>> +	pci_cfg_access_trylock(vdev->pdev);
>>>  
>>> -	if (vdev->err_trigger)
>>> -		eventfd_signal(vdev->err_trigger, 1);
>>> +	if (vdev->err_trigger && uncor_status) {
>>> +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
>>> +		/* signal uncorrectable error status to user space */
>>> +		eventfd_signal(vdev->err_trigger, uncor_status);
> 
> What does this mean that we trigger with uncor_status as opposed to 1?
> 

I guess you missed the changelog in qemu patchset's cover letter, see if
it helps(copy from cover letter):

5. Change what eventfd signals(check vfio driver patch). Before,
vfio-pci driver add 1 to the counter, which doesn't have meaning, just
for notification. Now, vfio-pci's error detected handler read the
uncorrectable error status and signal it to qemu.

Why? When testing previous version(host aer driver still reset link),
found that there is quite possibility that register reading returns the
invalid value 0xFFFFFFFF, which results in all(2 in my environment)
qemu's vfio-pci function send AER to root port while I only inject error
into one function. This is unreasonable, and I think it is the result of
reading during device reset. Previous patch does considered to find the
real function who has error happened, but won't work under the situation
I found. So move the register reading as early as possible would be the
nature thoughts, and it is moved into vfio-pci driver. Although now
reset_link is disabled in aer driver, get the uncor error status as
early as possible still make sense, for example: if user space driver
does device reset once receiving the error notification, and then read
register.
Alex Williamson Dec. 1, 2016, 2:55 p.m. UTC | #9
On Thu, 1 Dec 2016 21:40:00 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> On 12/01/2016 12:04 PM, Alex Williamson wrote:
> > On Sun, 27 Nov 2016 19:34:17 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> >> It is user space driver's or device-specific driver's(in guest) responsbility
> >> to do a serious recovery when error happened. Link-reset is one part of
> >> recovery, when pci device is assigned to VM via vfio, link-reset will do
> >> twice in host & guest separately, which will cause many trouble for a
> >> successful recovery, so, disable the vfio-pci's link-reset in aer driver
> >> in host, this is a keypoint for guest to do error recovery successfully.
> >>
> >> CC: alex.williamson@redhat.com
> >> CC: mst@redhat.com
> >> Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
> >> ---
> >> This is actually a RFC version(has debug lines left), and has minor changes in
> >> aer driver, so I think maybe it is better not to CC pci guys in this round.
> >> Later will do.
> >>
> >>  drivers/pci/pcie/aer/aerdrv_core.c  | 12 ++++++-
> >>  drivers/vfio/pci/vfio_pci.c         | 63 +++++++++++++++++++++++++++++++++++--
> >>  drivers/vfio/pci/vfio_pci_private.h |  2 ++
> >>  3 files changed, 74 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
> >> index 521e39c..289fb8e 100644
> >> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> >> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> >> @@ -496,7 +496,17 @@ static void do_recovery(struct pci_dev *dev, int severity)
> >>  			"error_detected",
> >>  			report_error_detected);
> >>  
> >> -	if (severity == AER_FATAL) {
> >> +	/* vfio-pci as a general meta driver, it actually couldn't do any real
> >> +	 * recovery for device. It is user space driver, or device-specific
> >> +	 * driver in guest who should take care of the serious error recovery,
> >> +	 * link reset actually is one part of whole recovery. Doing reset_link
> >> +	 * in aer driver of host kernel for vfio-pci devices will cause many
> >> +	 * trouble for user space driver or guest's device-specific driver,
> >> +	 * for example: the serious recovery often need to read register in
> >> +	 * config space, but if register reading happens during link-resetting,
> >> +	 * it is quite possible to return invalid value like all F's, which
> >> +	 * will result in unpredictable error. */
> >> +	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
> >>  		result = reset_link(dev);
> >>  		if (result != PCI_ERS_RESULT_RECOVERED)
> >>  			goto failed;  
> > 
> > This is not acceptable.  If we want to make a path through AER recovery
> > that does not reset the link, there should be a way for the driver to
> > request it.  Testing the driver name is a horrible hack.  The other
> > question is what guarantees does vfio make that the device does get
> > reset?    
> 
> I am not sure how vfio guarantee that...When device is assigned to VM,
> we have that guarantees(aer driver in guest driver will do that), so I
> think it is a well-behaved user space driver's responsibility to do link
> reset?  And I think if there is a user space driver, it is surely its
> responsibility to consider how to do serious error recovery, like I said
> before, vfio, as a general meta driver, it surely don't know how each
> device does its specific recovery, except bus/slot reset

We can't assume the user it a VM and we certainly cannot assume that
the user is well behaved.  By bypassing the link reset here, you've
taken responsibility for assuring that at least that basic level of
recovery is performed.  We cannot assume the user will do it though.

> > If an AER fault occurs and the user doesn't do a reset, what
> > happens when that device is released and a host driver tries to make
> > use of it?  The user makes no commitment to do a reset and there are
> > only limited configurations where we even allow the user to perform a
> > reset.
> >   
> 
> Limited? Do you mean the things __pci_dev_reset() can do?

I mean that there are significant device and guest configuration
restrictions in order to support AER.  For instance, all the functions
of the slot need to appear in a PCI-e topology in the guest with all
the functions in the right place such that a guest bus reset translates
into a host bus reset.  The physical functions cannot be split between
guests even if IOMMU isolation would otherwise allow it.  The user
needs to explicitly enable AER support for the devices.  A VM need to
be specifically configured for AER support in order to set any sort of
expectations of a guest directed bus reset, let alone a guarantee that
it will happen.  So all the existing VMs, where functions are split
between guests, or the topology isn't exactly right, or AER isn't
enabled see a regression from the above change as the device is no
longer reset.

> ...
> >   
> >> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
> >> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
> >> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
> >> +        if (ret)
> >> +                return PCI_ERS_RESULT_DISCONNECT;
> >> +
> >> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
> >>  	mutex_lock(&vdev->igate);
> >> +    
> >> +	vdev->aer_recovering = true;
> >> +	reinit_completion(&vdev->aer_error_completion);
> >> +
> >> +	/* suspend config space access from user space,
> >> +	 * when vfio-pci's error recovery process is on */
> >> +	pci_cfg_access_trylock(vdev->pdev);
> >>  
> >> -	if (vdev->err_trigger)
> >> -		eventfd_signal(vdev->err_trigger, 1);
> >> +	if (vdev->err_trigger && uncor_status) {
> >> +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
> >> +		/* signal uncorrectable error status to user space */
> >> +		eventfd_signal(vdev->err_trigger, uncor_status);
> >> +        }  
> > 
> > } else... what?  By bypassing the AER link reset we've assumed
> > responsibility for resetting the device.  Even if we signal the user,
> > what guarantee do we have that the device is recovered?
> >   
> 
> else...consider it as a fake error notification and ignore?

Nope.  Registering the error eventfd is optional, you've just
introduced a regression where the device was previously reset by the
AER core code and now it's in some unknown state because it generated
an error and we told the core not to reset it.

> I am not sure I understand your thoughts totally, but it seems my
> previous comments apply, that is: it is well-behaved user space driver's
> responsibility to do a serious recovery.

We can *never* assume a well behaved userspace.  That's an invalid
starting point.

> In my understanding, user space driver has 2 category: one is VM(has
> guest OS running inside), the other is ordinary user space program.
> 
> When device is assigned to a VM, (qemu + guest OS) will do fully steps
> to do recovery(roughly is what struct pci_error_handlers has). So,
> equally, if it is a ordinary user space program acting as the driver,
> the responsibility belongs to it.

This is simply not valid.  If vfio is going to assume responsibility
for resetting the device, it can't simply push that responsibility to
userspace.  There are only very limited user configurations where a
guest bus reset translating directly to a host bus reset are even
valid.  Also keep in mind that we must support old userspace on newer
kernels.  There currently does not exist AER recovery on existing
userspace VMs.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 4, 2016, 12:16 p.m. UTC | #10
On 12/01/2016 10:55 PM, Alex Williamson wrote:
> On Thu, 1 Dec 2016 21:40:00 +0800

>>> If an AER fault occurs and the user doesn't do a reset, what
>>> happens when that device is released and a host driver tries to make
>>> use of it?  The user makes no commitment to do a reset and there are
>>> only limited configurations where we even allow the user to perform a
>>> reset.
>>>   
>>
>> Limited? Do you mean the things __pci_dev_reset() can do?
> 
> I mean that there are significant device and guest configuration
> restrictions in order to support AER.  For instance, all the functions
> of the slot need to appear in a PCI-e topology in the guest with all
> the functions in the right place such that a guest bus reset translates
> into a host bus reset.  The physical functions cannot be split between
> guests even if IOMMU isolation would otherwise allow it.  The user
> needs to explicitly enable AER support for the devices.  A VM need to
> be specifically configured for AER support in order to set any sort of
> expectations of a guest directed bus reset, let alone a guarantee that
> it will happen.  So all the existing VMs, where functions are split
> between guests, or the topology isn't exactly right, or AER isn't
> enabled see a regression from the above change as the device is no
> longer reset.
> 

I am not clear why set these restrictions in the current design. I take
a glance at older versions of qemu's patchset, their thoughts is:
translate a guest bus reset into a host bus reset(Which is
unreasonable[*] to me). And I guess, that's the *cause* of these
restrictions?  Is there any other stories behind these restrictions?

[*] In physical world, set bridge's secondary bus reset would send
hot-reset TLP to all functions below, trigger every device's reset
separately. Emulated device should behave the same, means just using
each device's DeviceClass->reset method.
Alex Williamson Dec. 4, 2016, 3:30 p.m. UTC | #11
On Sun, 4 Dec 2016 20:16:42 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> On 12/01/2016 10:55 PM, Alex Williamson wrote:
> > On Thu, 1 Dec 2016 21:40:00 +0800  
> 
> >>> If an AER fault occurs and the user doesn't do a reset, what
> >>> happens when that device is released and a host driver tries to make
> >>> use of it?  The user makes no commitment to do a reset and there are
> >>> only limited configurations where we even allow the user to perform a
> >>> reset.
> >>>     
> >>
> >> Limited? Do you mean the things __pci_dev_reset() can do?  
> > 
> > I mean that there are significant device and guest configuration
> > restrictions in order to support AER.  For instance, all the functions
> > of the slot need to appear in a PCI-e topology in the guest with all
> > the functions in the right place such that a guest bus reset translates
> > into a host bus reset.  The physical functions cannot be split between
> > guests even if IOMMU isolation would otherwise allow it.  The user
> > needs to explicitly enable AER support for the devices.  A VM need to
> > be specifically configured for AER support in order to set any sort of
> > expectations of a guest directed bus reset, let alone a guarantee that
> > it will happen.  So all the existing VMs, where functions are split
> > between guests, or the topology isn't exactly right, or AER isn't
> > enabled see a regression from the above change as the device is no
> > longer reset.
> >   
> 
> I am not clear why set these restrictions in the current design. I take
> a glance at older versions of qemu's patchset, their thoughts is:
> translate a guest bus reset into a host bus reset(Which is
> unreasonable[*] to me). And I guess, that's the *cause* of these
> restrictions?  Is there any other stories behind these restrictions?
> 
> [*] In physical world, set bridge's secondary bus reset would send
> hot-reset TLP to all functions below, trigger every device's reset
> separately. Emulated device should behave the same, means just using
> each device's DeviceClass->reset method.

Are you trying to say that an FLR is equivalent to a link reset?
Please go read the previous discussions, especially if you're sending
patches you don't believe in.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 5, 2016, 5:52 a.m. UTC | #12
On 12/04/2016 11:30 PM, Alex Williamson wrote:
> On Sun, 4 Dec 2016 20:16:42 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> On 12/01/2016 10:55 PM, Alex Williamson wrote:
>>> On Thu, 1 Dec 2016 21:40:00 +0800  
>>
>>>>> If an AER fault occurs and the user doesn't do a reset, what
>>>>> happens when that device is released and a host driver tries to make
>>>>> use of it?  The user makes no commitment to do a reset and there are
>>>>> only limited configurations where we even allow the user to perform a
>>>>> reset.
>>>>>     
>>>>
>>>> Limited? Do you mean the things __pci_dev_reset() can do?  
>>>
>>> I mean that there are significant device and guest configuration
>>> restrictions in order to support AER.  For instance, all the functions
>>> of the slot need to appear in a PCI-e topology in the guest with all
>>> the functions in the right place such that a guest bus reset translates
>>> into a host bus reset.  The physical functions cannot be split between
>>> guests even if IOMMU isolation would otherwise allow it.  The user
>>> needs to explicitly enable AER support for the devices.  A VM need to
>>> be specifically configured for AER support in order to set any sort of
>>> expectations of a guest directed bus reset, let alone a guarantee that
>>> it will happen.  So all the existing VMs, where functions are split
>>> between guests, or the topology isn't exactly right, or AER isn't
>>> enabled see a regression from the above change as the device is no
>>> longer reset.
>>>   
>>
>> I am not clear why set these restrictions in the current design. I take
>> a glance at older versions of qemu's patchset, their thoughts is:
>> translate a guest bus reset into a host bus reset(Which is
>> unreasonable[*] to me). And I guess, that's the *cause* of these
>> restrictions?  Is there any other stories behind these restrictions?
>>
>> [*] In physical world, set bridge's secondary bus reset would send
>> hot-reset TLP to all functions below, trigger every device's reset
>> separately. Emulated device should behave the same, means just using
>> each device's DeviceClass->reset method.
> 
> Are you trying to say that an FLR is equivalent to a link reset?

No.  Look at old versions patchset, there is one names "vote the
function 0 to do host bus reset when aer occurred"[1], that is what I
called "translate guest link reset to host link reset", and what I think
unreasonable(and I think it also does it wrongly).  So in v10 version of
mine, I dropped it.

[1]https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg02987.html

If "translate guest link reset to host link reset" is right, I can
understand these restrictions[2][3].

[2]. All physical functions in a single card must be assigned to the VM
     with AER enabled on each and configured on the same virtual bus.
[3]. Don't place other devices under the virtual bus in [2], no matter
     physical, emulated, or paravirtual, even if other device
     supporting AER signaling

Certain device's FLR calls its DeviceClass->reset method; link reset
calls DeviceClass->reset of each device which on the bus. So, apparently
they have difference.  But if there is only 1 vfio-pci device under the
virtual pci bus,  I think FLR can be equivalent to a link reset, right?

> Please go read the previous discussions, especially if you're sending
> patches you don't believe in.  Thanks,
> 

I does not read ALL version's discussion thoroughly, but these
restrictions exist for a long time, so I guess it is a result of
previous discussions.  If it is not, I am thinking of the possibility of
dropping these restrictions[2][3], and drop the "aer" property,
automatically enable this functionality or not according to device's
capability.
Alex Williamson Dec. 5, 2016, 4:17 p.m. UTC | #13
On Mon, 5 Dec 2016 13:52:03 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> On 12/04/2016 11:30 PM, Alex Williamson wrote:
> > On Sun, 4 Dec 2016 20:16:42 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> >> On 12/01/2016 10:55 PM, Alex Williamson wrote:  
> >>> On Thu, 1 Dec 2016 21:40:00 +0800    
> >>  
> >>>>> If an AER fault occurs and the user doesn't do a reset, what
> >>>>> happens when that device is released and a host driver tries to make
> >>>>> use of it?  The user makes no commitment to do a reset and there are
> >>>>> only limited configurations where we even allow the user to perform a
> >>>>> reset.
> >>>>>       
> >>>>
> >>>> Limited? Do you mean the things __pci_dev_reset() can do?    
> >>>
> >>> I mean that there are significant device and guest configuration
> >>> restrictions in order to support AER.  For instance, all the functions
> >>> of the slot need to appear in a PCI-e topology in the guest with all
> >>> the functions in the right place such that a guest bus reset translates
> >>> into a host bus reset.  The physical functions cannot be split between
> >>> guests even if IOMMU isolation would otherwise allow it.  The user
> >>> needs to explicitly enable AER support for the devices.  A VM need to
> >>> be specifically configured for AER support in order to set any sort of
> >>> expectations of a guest directed bus reset, let alone a guarantee that
> >>> it will happen.  So all the existing VMs, where functions are split
> >>> between guests, or the topology isn't exactly right, or AER isn't
> >>> enabled see a regression from the above change as the device is no
> >>> longer reset.
> >>>     
> >>
> >> I am not clear why set these restrictions in the current design. I take
> >> a glance at older versions of qemu's patchset, their thoughts is:
> >> translate a guest bus reset into a host bus reset(Which is
> >> unreasonable[*] to me). And I guess, that's the *cause* of these
> >> restrictions?  Is there any other stories behind these restrictions?
> >>
> >> [*] In physical world, set bridge's secondary bus reset would send
> >> hot-reset TLP to all functions below, trigger every device's reset
> >> separately. Emulated device should behave the same, means just using
> >> each device's DeviceClass->reset method.  
> > 
> > Are you trying to say that an FLR is equivalent to a link reset?  
> 
> No.  Look at old versions patchset, there is one names "vote the
> function 0 to do host bus reset when aer occurred"[1], that is what I
> called "translate guest link reset to host link reset", and what I think
> unreasonable(and I think it also does it wrongly).  So in v10 version of
> mine, I dropped it.
> 
> [1]https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg02987.html
> 
> If "translate guest link reset to host link reset" is right, I can
> understand these restrictions[2][3].
> 
> [2]. All physical functions in a single card must be assigned to the VM
>      with AER enabled on each and configured on the same virtual bus.
> [3]. Don't place other devices under the virtual bus in [2], no matter
>      physical, emulated, or paravirtual, even if other device
>      supporting AER signaling
> 
> Certain device's FLR calls its DeviceClass->reset method; link reset
> calls DeviceClass->reset of each device which on the bus. So, apparently
> they have difference.  But if there is only 1 vfio-pci device under the
> virtual pci bus,  I think FLR can be equivalent to a link reset, right?

No.  An FLR resets the device while a secondary bus reset does a reset
of the link and the device.  AER errors are sometimes issues with the
link, not the device.  If we were to perform only an FLR, we're not
performing the same reset as would be done on bare metal.
 
> > Please go read the previous discussions, especially if you're sending
> > patches you don't believe in.  Thanks,
> >   
> 
> I does not read ALL version's discussion thoroughly, but these
> restrictions exist for a long time, so I guess it is a result of
> previous discussions.  If it is not, I am thinking of the possibility of
> dropping these restrictions[2][3], and drop the "aer" property,
> automatically enable this functionality or not according to device's
> capability.

If you're going to take the lead for these AER patches, I would
certainly suggest that understanding the reasoning behind the bus reset
behavior is a central aspect to this series.  This effort has dragged
out for nearly two years and I apologize, but I don't really have a lot
of patience for rehashing some of these issues if you're not going to
read the previous discussions or consult with your colleagues to
understand how we got to this point.  If you want to challenge some of
the design points, that's great, it could use some new eyes, but please
understand how we got here first.  If we could replace a bus reset with
an FLR that would make things far more simple, but I don't see that as
a valid option.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 6, 2016, 3:46 a.m. UTC | #14
On Thu, Dec 01, 2016 at 09:40:23PM +0800, Cao jin wrote:
> 
> 
> On 12/01/2016 12:51 PM, Michael S. Tsirkin wrote:
> > On Wed, Nov 30, 2016 at 09:04:13PM -0700, Alex Williamson wrote:
> >> On Sun, 27 Nov 2016 19:34:17 +0800
> >> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >>
> 
> >>> @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
> >>>  		return PCI_ERS_RESULT_DISCONNECT;
> >>>  	}
> >>>  
> >>> +	/* get device's uncorrectable error status as soon as possible,
> >>> +	 * and signal it to user space. The later we read it, the possibility
> >>> +	 * the register value is mangled grows. */
> >>
> >> Bad comment style (throughout).
> >>
> >>> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
> >>> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
> >>> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
> >>> +        if (ret)
> >>> +                return PCI_ERS_RESULT_DISCONNECT;
> >>> +
> >>> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
> >>>  	mutex_lock(&vdev->igate);
> >>> +    
> >>> +	vdev->aer_recovering = true;
> >>> +	reinit_completion(&vdev->aer_error_completion);
> >>> +
> >>> +	/* suspend config space access from user space,
> >>> +	 * when vfio-pci's error recovery process is on */
> >>> +	pci_cfg_access_trylock(vdev->pdev);
> >>>  
> >>> -	if (vdev->err_trigger)
> >>> -		eventfd_signal(vdev->err_trigger, 1);
> >>> +	if (vdev->err_trigger && uncor_status) {
> >>> +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
> >>> +		/* signal uncorrectable error status to user space */
> >>> +		eventfd_signal(vdev->err_trigger, uncor_status);
> > 
> > What does this mean that we trigger with uncor_status as opposed to 1?
> > 
> 
> I guess you missed the changelog in qemu patchset's cover letter, see if
> it helps(copy from cover letter):
> 
> 5. Change what eventfd signals(check vfio driver patch). Before,
> vfio-pci driver add 1 to the counter, which doesn't have meaning, just
> for notification. Now, vfio-pci's error detected handler read the
> uncorrectable error status and signal it to qemu.

I don't think you can use an eventfd to send values like this.

eventfd does a sum of these values, so sending e.g.
value 2 will look the same as sending value 1 twice.



> Why? When testing previous version(host aer driver still reset link),
> found that there is quite possibility that register reading returns the
> invalid value 0xFFFFFFFF, which results in all(2 in my environment)
> qemu's vfio-pci function send AER to root port while I only inject error
> into one function. This is unreasonable, and I think it is the result of
> reading during device reset.
>
> Previous patch does considered to find the
> real function who has error happened, but won't work under the situation
> I found. So move the register reading as early as possible would be the
> nature thoughts, and it is moved into vfio-pci driver. Although now
> reset_link is disabled in aer driver, get the uncor error status as
> early as possible still make sense, for example: if user space driver
> does device reset once receiving the error notification, and then read
> register.

I had trouble understanding the above. Let me ask you this:
should we try to avoid triggering uncorrectable errors?
Aren't any such errors likely to crash guests?

> -- 
> Sincerely,
> Cao jin
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 6, 2016, 3:55 a.m. UTC | #15
On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:
> If you're going to take the lead for these AER patches, I would
> certainly suggest that understanding the reasoning behind the bus reset
> behavior is a central aspect to this series.  This effort has dragged
> out for nearly two years and I apologize, but I don't really have a lot
> of patience for rehashing some of these issues if you're not going to
> read the previous discussions or consult with your colleagues to
> understand how we got to this point.  If you want to challenge some of
> the design points, that's great, it could use some new eyes, but please
> understand how we got here first.

Well I'm guessing Cao jin here isn't the only one not
willing to plough through all historical versions of the patchset
just to figure out the motivation for some code.

Including a summary of a high level architecture couldn't hurt.

Any chance of writing such?  Alternatively, we can try to build it as
part of this thread.  Shouldn't be hard as it seems somewhat
straight-forward on the surface:

- detect link error on the host, don't reset link as we would normally do
- report link error to guest
- detect link reset request from guest
- reset link on host

Since link reset will reset all devices behind it, for this to work we
need same set of devices behind the link in host and guest.  Enforcing
this would be nice to have.

- as link now might end up in bad state, reset
  it when device is unassigned

Any details I missed?
Alex Williamson Dec. 6, 2016, 4:59 a.m. UTC | #16
On Tue, 6 Dec 2016 05:55:28 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:
> > If you're going to take the lead for these AER patches, I would
> > certainly suggest that understanding the reasoning behind the bus reset
> > behavior is a central aspect to this series.  This effort has dragged
> > out for nearly two years and I apologize, but I don't really have a lot
> > of patience for rehashing some of these issues if you're not going to
> > read the previous discussions or consult with your colleagues to
> > understand how we got to this point.  If you want to challenge some of
> > the design points, that's great, it could use some new eyes, but please
> > understand how we got here first.  
> 
> Well I'm guessing Cao jin here isn't the only one not
> willing to plough through all historical versions of the patchset
> just to figure out the motivation for some code.
> 
> Including a summary of a high level architecture couldn't hurt.
> 
> Any chance of writing such?  Alternatively, we can try to build it as
> part of this thread.  Shouldn't be hard as it seems somewhat
> straight-forward on the surface:
> 
> - detect link error on the host, don't reset link as we would normally do

This is actually a new approach that I'm not sure I agree with.  By
skipping the host directed link reset, vfio is taking responsibility
for doing this, but then we just assume the user will do it.  I have
issues with this.

The previous approach was to use the error detected notifier to block
access to the device, allowing the host to perform the link reset.  A
subsequent notification in the AER process released the user access
which allowed the user AER process to proceed.  This did result in both
a host directed and a guest directed link reset, but other than
coordinating the blocking of the user process during host reset, that
hasn't been brought up as an issue previously.

> - report link error to guest
> - detect link reset request from guest
> - reset link on host
> 
> Since link reset will reset all devices behind it, for this to work we
> need same set of devices behind the link in host and guest.  Enforcing
> this would be nice to have.

This is a pretty significant complication and I think it's a
requirement.  This is the heart of why we have an AER vfio-pci device
option and why we require that QEMU should fail to initialize the
device if AER is enabled in an incompatible configuration.  If a user
cannot be sure that AER is enabled on a device, it's pointless to
bother implementing it, though honestly I question the value of it in
the VM altogether given configuration requirements (ie. are users
going to accept the reason that all the ports of a device need to be
assigned to a single guest for guest-based AER recovery when they were
previously able to assign each port to a separate VM?).
 
> - as link now might end up in bad state, reset
>   it when device is unassigned

This is also a new aspect for the approach here, previously we allowed
the host directed link reset so we could assume the device was
sufficiently recovered.  In the proposal here, the AER core skips any
devices bound to vfio-pci, but vfio can't guarantee that we can do a
bus reset on them.  PCI bus isolation is not accounted for in DMA
isolation, which is the basis for IOMMU groups.  A bus can host
multiple IOMMU groups, each of which may have a different user.  Only
in a very specific configuration can vfio do a bus reset.
 
> Any details I missed?

AIUI, the critical feature is that the guest needs to be able to reset
the device link, all the other design elements play out from the
logical expansion of that feature.  It means that a guest bus reset
needs to translate to a host bus reset, which means that all of the
affected host devices need to be accounted for and those that are
assigned to the guest need to be affected in the guest in the same
way.  QEMU must enforce this configuration or else a user cannot know
the result of a AER fault, ie. will it cause a VMSTOP condition or is
the fault forwarded to the guest.  The required configuration
restrictions are quite involved, therefore we can't simply require this
of all configurations, so a vfio-pci device option is added.  The
coordination of the host directed reset with the guest directed reset
was only a complications discovered within the last few revisions of
the series.  As noted above, the previous solution to this was to
attempt to block access to the device while the host reset proceeds.

Clearly it's a little disconcerting if we throw all of that away and
simply assume that an FLR is sufficient to reset the device when it
seems like link issues might be a nontrivial source of AER faults.  If
FLR is sufficient, then why does the core AER handling code in the
kernel not simply do this?  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 6, 2016, 6:11 a.m. UTC | #17
On 12/06/2016 12:17 AM, Alex Williamson wrote:
> On Mon, 5 Dec 2016 13:52:03 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> On 12/04/2016 11:30 PM, Alex Williamson wrote:
>>> On Sun, 4 Dec 2016 20:16:42 +0800
>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>   
>>>> On 12/01/2016 10:55 PM, Alex Williamson wrote:  
>>>>> On Thu, 1 Dec 2016 21:40:00 +0800    
>>>>  
>>>>>>> If an AER fault occurs and the user doesn't do a reset, what
>>>>>>> happens when that device is released and a host driver tries to make
>>>>>>> use of it?  The user makes no commitment to do a reset and there are
>>>>>>> only limited configurations where we even allow the user to perform a
>>>>>>> reset.
>>>>>>>       
>>>>>>
>>>>>> Limited? Do you mean the things __pci_dev_reset() can do?    
>>>>>
>>>>> I mean that there are significant device and guest configuration
>>>>> restrictions in order to support AER.  For instance, all the functions
>>>>> of the slot need to appear in a PCI-e topology in the guest with all
>>>>> the functions in the right place such that a guest bus reset translates
>>>>> into a host bus reset.  The physical functions cannot be split between
>>>>> guests even if IOMMU isolation would otherwise allow it.  The user
>>>>> needs to explicitly enable AER support for the devices.  A VM need to
>>>>> be specifically configured for AER support in order to set any sort of
>>>>> expectations of a guest directed bus reset, let alone a guarantee that
>>>>> it will happen.  So all the existing VMs, where functions are split
>>>>> between guests, or the topology isn't exactly right, or AER isn't
>>>>> enabled see a regression from the above change as the device is no
>>>>> longer reset.
>>>>>     
>>>>
>>>> I am not clear why set these restrictions in the current design. I take
>>>> a glance at older versions of qemu's patchset, their thoughts is:
>>>> translate a guest bus reset into a host bus reset(Which is
>>>> unreasonable[*] to me). And I guess, that's the *cause* of these
>>>> restrictions?  Is there any other stories behind these restrictions?
>>>>
>>>> [*] In physical world, set bridge's secondary bus reset would send
>>>> hot-reset TLP to all functions below, trigger every device's reset
>>>> separately. Emulated device should behave the same, means just using
>>>> each device's DeviceClass->reset method.  
>>>
>>> Are you trying to say that an FLR is equivalent to a link reset?  
>>
>> No.  Look at old versions patchset, there is one names "vote the
>> function 0 to do host bus reset when aer occurred"[1], that is what I
>> called "translate guest link reset to host link reset", and what I think
>> unreasonable(and I think it also does it wrongly).  So in v10 version of
>> mine, I dropped it.
>>
>> [1]https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg02987.html
>>
>> If "translate guest link reset to host link reset" is right, I can
>> understand these restrictions[2][3].
>>
>> [2]. All physical functions in a single card must be assigned to the VM
>>      with AER enabled on each and configured on the same virtual bus.
>> [3]. Don't place other devices under the virtual bus in [2], no matter
>>      physical, emulated, or paravirtual, even if other device
>>      supporting AER signaling
>>
>> Certain device's FLR calls its DeviceClass->reset method; link reset
>> calls DeviceClass->reset of each device which on the bus. So, apparently
>> they have difference.  But if there is only 1 vfio-pci device under the
>> virtual pci bus,  I think FLR can be equivalent to a link reset, right?
> 
> No.  An FLR resets the device while a secondary bus reset does a reset
> of the link and the device.  AER errors are sometimes issues with the
> link, not the device.  If we were to perform only an FLR, we're not
> performing the same reset as would be done on bare metal.
>  

Thanks for you explanation, it does helps, except the last sentence, I
think I understand it now: fatal error implies there may be link issue
exists(pci express spec: 6.2.2.2.1), so, should do link reset for fatal
error(that is what and why aer core does). And so, in patch[1] above,
qemu does a link reset when seeing secondary bus reset bit of virtual
bus got set. is it right?
Cao jin Dec. 6, 2016, 6:47 a.m. UTC | #18
On 12/06/2016 11:46 AM, Michael S. Tsirkin wrote:
> On Thu, Dec 01, 2016 at 09:40:23PM +0800, Cao jin wrote:
>>
>>
>> On 12/01/2016 12:51 PM, Michael S. Tsirkin wrote:
>>> On Wed, Nov 30, 2016 at 09:04:13PM -0700, Alex Williamson wrote:
>>>> On Sun, 27 Nov 2016 19:34:17 +0800
>>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>>
>>
>>>>> @@ -1187,10 +1200,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
>>>>>  		return PCI_ERS_RESULT_DISCONNECT;
>>>>>  	}
>>>>>  
>>>>> +	/* get device's uncorrectable error status as soon as possible,
>>>>> +	 * and signal it to user space. The later we read it, the possibility
>>>>> +	 * the register value is mangled grows. */
>>>>
>>>> Bad comment style (throughout).
>>>>
>>>>> +	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
>>>>> +	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
>>>>> +                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
>>>>> +        if (ret)
>>>>> +                return PCI_ERS_RESULT_DISCONNECT;
>>>>> +
>>>>> +	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
>>>>>  	mutex_lock(&vdev->igate);
>>>>> +    
>>>>> +	vdev->aer_recovering = true;
>>>>> +	reinit_completion(&vdev->aer_error_completion);
>>>>> +
>>>>> +	/* suspend config space access from user space,
>>>>> +	 * when vfio-pci's error recovery process is on */
>>>>> +	pci_cfg_access_trylock(vdev->pdev);
>>>>>  
>>>>> -	if (vdev->err_trigger)
>>>>> -		eventfd_signal(vdev->err_trigger, 1);
>>>>> +	if (vdev->err_trigger && uncor_status) {
>>>>> +		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
>>>>> +		/* signal uncorrectable error status to user space */
>>>>> +		eventfd_signal(vdev->err_trigger, uncor_status);
>>>
>>> What does this mean that we trigger with uncor_status as opposed to 1?
>>>
>>
>> I guess you missed the changelog in qemu patchset's cover letter, see if
>> it helps(copy from cover letter):
>>
>> 5. Change what eventfd signals(check vfio driver patch). Before,
>> vfio-pci driver add 1 to the counter, which doesn't have meaning, just
>> for notification. Now, vfio-pci's error detected handler read the
>> uncorrectable error status and signal it to qemu.
> 
> I don't think you can use an eventfd to send values like this.
> 
> eventfd does a sum of these values, so sending e.g.
> value 2 will look the same as sending value 1 twice.
> 

Yes, eventfd has a uint_64 counter, and eventfd_signal does a sum.

manpage of "eventfd" says:

The semantics of read(2) depend on whether the eventfd counter currently
has a nonzero value and whether the EFD_SEMAPHORE flag was specified
when creating the eventfd file descriptor:

  *  If EFD_SEMAPHORE was not specified and the eventfd counter has a
nonzero value, then a read(2) returns 8 bytes containing that value, and
the counter's value is reset to zero.

......

  *  If the eventfd counter is zero at the time of the call to  read(2),
 then the call either blocks until the counter becomes  nonzero (at
which time, the read(2) proceeds as described above) or fails with the
error EAGAIN if the file descriptor has been made nonblocking.

And in actual tests, the debug line I add in vfio_err_notifier_handler
shows[*]: qemu read this value right every time, the same as in vfio-pci
driver.

quick reference:
[*]https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg04831.html

> 
>> Why? When testing previous version(host aer driver still reset link),
>> found that there is quite possibility that register reading returns the
>> invalid value 0xFFFFFFFF, which results in all(2 in my environment)
>> qemu's vfio-pci function send AER to root port while I only inject error
>> into one function. This is unreasonable, and I think it is the result of
>> reading during device reset.
>>
>> Previous patch does considered to find the
>> real function who has error happened, but won't work under the situation
>> I found. So move the register reading as early as possible would be the
>> nature thoughts, and it is moved into vfio-pci driver. Although now
>> reset_link is disabled in aer driver, get the uncor error status as
>> early as possible still make sense, for example: if user space driver
>> does device reset once receiving the error notification, and then read
>> register.
> 
> I had trouble understanding the above. Let me ask you this:
> should we try to avoid triggering uncorrectable errors?
> Aren't any such errors likely to crash guests?
> 

Sorry I don't quite understand why your questions are raised, it seems
just describing the current situation without my AER patch: when assign
device via vfio, we don't handle uncorrectable error, just
vm_stop()(this is what you mean "crash guests", right?), this is the
contents of current vfio_err_notifier_handler().

Without our AER patches, the answer to your questions are YES. But the
purpose of our patch is to fix your questions. Not sure my answer helps
you, but feel free to ask if still any questions.
Cao jin Dec. 6, 2016, 10:46 a.m. UTC | #19
On 12/06/2016 12:59 PM, Alex Williamson wrote:
> On Tue, 6 Dec 2016 05:55:28 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
>> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:
>>> If you're going to take the lead for these AER patches, I would
>>> certainly suggest that understanding the reasoning behind the bus reset
>>> behavior is a central aspect to this series.  This effort has dragged
>>> out for nearly two years and I apologize, but I don't really have a lot
>>> of patience for rehashing some of these issues if you're not going to
>>> read the previous discussions or consult with your colleagues to
>>> understand how we got to this point.  If you want to challenge some of
>>> the design points, that's great, it could use some new eyes, but please
>>> understand how we got here first.  
>>
>> Well I'm guessing Cao jin here isn't the only one not
>> willing to plough through all historical versions of the patchset
>> just to figure out the motivation for some code.
>>
>> Including a summary of a high level architecture couldn't hurt.
>>
>> Any chance of writing such?  Alternatively, we can try to build it as
>> part of this thread.  Shouldn't be hard as it seems somewhat
>> straight-forward on the surface:
>>
>> - detect link error on the host, don't reset link as we would normally do
> 
> This is actually a new approach that I'm not sure I agree with.  By
> skipping the host directed link reset, vfio is taking responsibility
> for doing this, but then we just assume the user will do it.  I have
> issues with this.
> 
> The previous approach was to use the error detected notifier to block
> access to the device, allowing the host to perform the link reset.  A
> subsequent notification in the AER process released the user access
> which allowed the user AER process to proceed.  This did result in both
> a host directed and a guest directed link reset, but other than
> coordinating the blocking of the user process during host reset, that
> hasn't been brought up as an issue previously.
> 

Tests on previous versions didn't bring up issues as I find, I think
that is because we didn't test it completely. As I know, before August
of this year, we didn't have cable connected to NIC, let alone
connecting NIC to gateway.

Even if I fixed the guest oops issue in igb driver that Alex found in
v9, v9 still cannot work in my test. And in my test, disable link
reset(in host) in aer core for vfio-pci is the most significant step to
get my test passed.

>> - report link error to guest
>> - detect link reset request from guest
>> - reset link on host
>>
>> Since link reset will reset all devices behind it, for this to work we
>> need same set of devices behind the link in host and guest.  Enforcing
>> this would be nice to have.
> 
> This is a pretty significant complication and I think it's a
> requirement.  This is the heart of why we have an AER vfio-pci device
> option and why we require that QEMU should fail to initialize the
> device if AER is enabled in an incompatible configuration.  If a user
> cannot be sure that AER is enabled on a device, it's pointless to
> bother implementing it, though honestly I question the value of it in
> the VM altogether given configuration requirements (ie. are users
> going to accept the reason that all the ports of a device need to be
> assigned to a single guest for guest-based AER recovery when they were
> previously able to assign each port to a separate VM?).
>  
>> - as link now might end up in bad state, reset
>>   it when device is unassigned
> 
> This is also a new aspect for the approach here, previously we allowed
> the host directed link reset so we could assume the device was
> sufficiently recovered.  In the proposal here, the AER core skips any
> devices bound to vfio-pci, but vfio can't guarantee that we can do a
> bus reset on them.  PCI bus isolation is not accounted for in DMA
> isolation, which is the basis for IOMMU groups.  A bus can host
> multiple IOMMU groups, each of which may have a different user.  Only
> in a very specific configuration can vfio do a bus reset.
>  
>> Any details I missed?
> 
> AIUI, the critical feature is that the guest needs to be able to reset
> the device link, all the other design elements play out from the
> logical expansion of that feature.  It means that a guest bus reset
> needs to translate to a host bus reset, which means that all of the
> affected host devices need to be accounted for and those that are
> assigned to the guest need to be affected in the guest in the same
> way.  QEMU must enforce this configuration or else a user cannot know
> the result of a AER fault, ie. will it cause a VMSTOP condition or is
> the fault forwarded to the guest.  The required configuration
> restrictions are quite involved, therefore we can't simply require this
> of all configurations, so a vfio-pci device option is added.  The
> coordination of the host directed reset with the guest directed reset
> was only a complications discovered within the last few revisions of
> the series.  As noted above, the previous solution to this was to
> attempt to block access to the device while the host reset proceeds.
> 
> Clearly it's a little disconcerting if we throw all of that away and
> simply assume that an FLR is sufficient to reset the device when it
> seems like link issues might be a nontrivial source of AER faults.  If
> FLR is sufficient, then why does the core AER handling code in the
> kernel not simply do this?  Thanks,
> 

I agree with the points here.  Now I understand why  translate a guest
link reset to host link reset[*], and FLR shouldn't be equivalent to
link reset, but the PITY is, we can't trigger a real fatal error to see
if a FLR is sufficient to reset the device.

[*]https://lists.gnu.org/archive/html/qemu-devel/2016-07/msg04100.html
Alex Williamson Dec. 6, 2016, 3:25 p.m. UTC | #20
On Tue, 6 Dec 2016 14:11:03 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> On 12/06/2016 12:17 AM, Alex Williamson wrote:
> > On Mon, 5 Dec 2016 13:52:03 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> >> On 12/04/2016 11:30 PM, Alex Williamson wrote:  
> >>> On Sun, 4 Dec 2016 20:16:42 +0800
> >>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >>>     
> >>>> On 12/01/2016 10:55 PM, Alex Williamson wrote:    
> >>>>> On Thu, 1 Dec 2016 21:40:00 +0800      
> >>>>    
> >>>>>>> If an AER fault occurs and the user doesn't do a reset, what
> >>>>>>> happens when that device is released and a host driver tries to make
> >>>>>>> use of it?  The user makes no commitment to do a reset and there are
> >>>>>>> only limited configurations where we even allow the user to perform a
> >>>>>>> reset.
> >>>>>>>         
> >>>>>>
> >>>>>> Limited? Do you mean the things __pci_dev_reset() can do?      
> >>>>>
> >>>>> I mean that there are significant device and guest configuration
> >>>>> restrictions in order to support AER.  For instance, all the functions
> >>>>> of the slot need to appear in a PCI-e topology in the guest with all
> >>>>> the functions in the right place such that a guest bus reset translates
> >>>>> into a host bus reset.  The physical functions cannot be split between
> >>>>> guests even if IOMMU isolation would otherwise allow it.  The user
> >>>>> needs to explicitly enable AER support for the devices.  A VM need to
> >>>>> be specifically configured for AER support in order to set any sort of
> >>>>> expectations of a guest directed bus reset, let alone a guarantee that
> >>>>> it will happen.  So all the existing VMs, where functions are split
> >>>>> between guests, or the topology isn't exactly right, or AER isn't
> >>>>> enabled see a regression from the above change as the device is no
> >>>>> longer reset.
> >>>>>       
> >>>>
> >>>> I am not clear why set these restrictions in the current design. I take
> >>>> a glance at older versions of qemu's patchset, their thoughts is:
> >>>> translate a guest bus reset into a host bus reset(Which is
> >>>> unreasonable[*] to me). And I guess, that's the *cause* of these
> >>>> restrictions?  Is there any other stories behind these restrictions?
> >>>>
> >>>> [*] In physical world, set bridge's secondary bus reset would send
> >>>> hot-reset TLP to all functions below, trigger every device's reset
> >>>> separately. Emulated device should behave the same, means just using
> >>>> each device's DeviceClass->reset method.    
> >>>
> >>> Are you trying to say that an FLR is equivalent to a link reset?    
> >>
> >> No.  Look at old versions patchset, there is one names "vote the
> >> function 0 to do host bus reset when aer occurred"[1], that is what I
> >> called "translate guest link reset to host link reset", and what I think
> >> unreasonable(and I think it also does it wrongly).  So in v10 version of
> >> mine, I dropped it.
> >>
> >> [1]https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg02987.html
> >>
> >> If "translate guest link reset to host link reset" is right, I can
> >> understand these restrictions[2][3].
> >>
> >> [2]. All physical functions in a single card must be assigned to the VM
> >>      with AER enabled on each and configured on the same virtual bus.
> >> [3]. Don't place other devices under the virtual bus in [2], no matter
> >>      physical, emulated, or paravirtual, even if other device
> >>      supporting AER signaling
> >>
> >> Certain device's FLR calls its DeviceClass->reset method; link reset
> >> calls DeviceClass->reset of each device which on the bus. So, apparently
> >> they have difference.  But if there is only 1 vfio-pci device under the
> >> virtual pci bus,  I think FLR can be equivalent to a link reset, right?  
> > 
> > No.  An FLR resets the device while a secondary bus reset does a reset
> > of the link and the device.  AER errors are sometimes issues with the
> > link, not the device.  If we were to perform only an FLR, we're not
> > performing the same reset as would be done on bare metal.
> >    
> 
> Thanks for you explanation, it does helps, except the last sentence, I
> think I understand it now: fatal error implies there may be link issue
> exists(pci express spec: 6.2.2.2.1), so, should do link reset for fatal
> error(that is what and why aer core does). And so, in patch[1] above,
> qemu does a link reset when seeing secondary bus reset bit of virtual
> bus got set. is it right?

QEMU is only going to do a bus reset if the guest is participating in
AER recovery AND QEMU supports AER recovery AND the guest topology
configuration allows the guest bus reset to induce a host bus reset.
vfio does not know that QEMU is the user and cannot assume the user
will perform a bus reset.  We need to give the user the ability to
recover from an AER, but not rely on the user to do so.  We cannot
assume the user's intention or capabilities.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Dec. 6, 2016, 3:35 p.m. UTC | #21
On Tue, 6 Dec 2016 18:46:04 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> On 12/06/2016 12:59 PM, Alex Williamson wrote:
> > On Tue, 6 Dec 2016 05:55:28 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> >> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:  
> >>> If you're going to take the lead for these AER patches, I would
> >>> certainly suggest that understanding the reasoning behind the bus reset
> >>> behavior is a central aspect to this series.  This effort has dragged
> >>> out for nearly two years and I apologize, but I don't really have a lot
> >>> of patience for rehashing some of these issues if you're not going to
> >>> read the previous discussions or consult with your colleagues to
> >>> understand how we got to this point.  If you want to challenge some of
> >>> the design points, that's great, it could use some new eyes, but please
> >>> understand how we got here first.    
> >>
> >> Well I'm guessing Cao jin here isn't the only one not
> >> willing to plough through all historical versions of the patchset
> >> just to figure out the motivation for some code.
> >>
> >> Including a summary of a high level architecture couldn't hurt.
> >>
> >> Any chance of writing such?  Alternatively, we can try to build it as
> >> part of this thread.  Shouldn't be hard as it seems somewhat
> >> straight-forward on the surface:
> >>
> >> - detect link error on the host, don't reset link as we would normally do  
> > 
> > This is actually a new approach that I'm not sure I agree with.  By
> > skipping the host directed link reset, vfio is taking responsibility
> > for doing this, but then we just assume the user will do it.  I have
> > issues with this.
> > 
> > The previous approach was to use the error detected notifier to block
> > access to the device, allowing the host to perform the link reset.  A
> > subsequent notification in the AER process released the user access
> > which allowed the user AER process to proceed.  This did result in both
> > a host directed and a guest directed link reset, but other than
> > coordinating the blocking of the user process during host reset, that
> > hasn't been brought up as an issue previously.
> >   
> 
> Tests on previous versions didn't bring up issues as I find, I think
> that is because we didn't test it completely. As I know, before August
> of this year, we didn't have cable connected to NIC, let alone
> connecting NIC to gateway.

Lack of testing has been a significant issue throughout the development
of this series.

> Even if I fixed the guest oops issue in igb driver that Alex found in
> v9, v9 still cannot work in my test. And in my test, disable link
> reset(in host) in aer core for vfio-pci is the most significant step to
> get my test passed.

But is it the correct step?  I'm not convinced.  Why did blocking guest
access not work?  How do you plan to manage vfio taking the
responsibility to perform a bus reset when you don't know whether QEMU
is the user of the device or whether the user supports AER recovery?
 
> >> - report link error to guest
> >> - detect link reset request from guest
> >> - reset link on host
> >>
> >> Since link reset will reset all devices behind it, for this to work we
> >> need same set of devices behind the link in host and guest.  Enforcing
> >> this would be nice to have.  
> > 
> > This is a pretty significant complication and I think it's a
> > requirement.  This is the heart of why we have an AER vfio-pci device
> > option and why we require that QEMU should fail to initialize the
> > device if AER is enabled in an incompatible configuration.  If a user
> > cannot be sure that AER is enabled on a device, it's pointless to
> > bother implementing it, though honestly I question the value of it in
> > the VM altogether given configuration requirements (ie. are users
> > going to accept the reason that all the ports of a device need to be
> > assigned to a single guest for guest-based AER recovery when they were
> > previously able to assign each port to a separate VM?).
> >    
> >> - as link now might end up in bad state, reset
> >>   it when device is unassigned  
> > 
> > This is also a new aspect for the approach here, previously we allowed
> > the host directed link reset so we could assume the device was
> > sufficiently recovered.  In the proposal here, the AER core skips any
> > devices bound to vfio-pci, but vfio can't guarantee that we can do a
> > bus reset on them.  PCI bus isolation is not accounted for in DMA
> > isolation, which is the basis for IOMMU groups.  A bus can host
> > multiple IOMMU groups, each of which may have a different user.  Only
> > in a very specific configuration can vfio do a bus reset.
> >    
> >> Any details I missed?  
> > 
> > AIUI, the critical feature is that the guest needs to be able to reset
> > the device link, all the other design elements play out from the
> > logical expansion of that feature.  It means that a guest bus reset
> > needs to translate to a host bus reset, which means that all of the
> > affected host devices need to be accounted for and those that are
> > assigned to the guest need to be affected in the guest in the same
> > way.  QEMU must enforce this configuration or else a user cannot know
> > the result of a AER fault, ie. will it cause a VMSTOP condition or is
> > the fault forwarded to the guest.  The required configuration
> > restrictions are quite involved, therefore we can't simply require this
> > of all configurations, so a vfio-pci device option is added.  The
> > coordination of the host directed reset with the guest directed reset
> > was only a complications discovered within the last few revisions of
> > the series.  As noted above, the previous solution to this was to
> > attempt to block access to the device while the host reset proceeds.
> > 
> > Clearly it's a little disconcerting if we throw all of that away and
> > simply assume that an FLR is sufficient to reset the device when it
> > seems like link issues might be a nontrivial source of AER faults.  If
> > FLR is sufficient, then why does the core AER handling code in the
> > kernel not simply do this?  Thanks,
> >   
> 
> I agree with the points here.  Now I understand why  translate a guest
> link reset to host link reset[*], and FLR shouldn't be equivalent to
> link reset, but the PITY is, we can't trigger a real fatal error to see
> if a FLR is sufficient to reset the device.

In this case, it's not a matter of finding a scenario where it works.
Using FLR to recover from a fatal error would need to be supported by
verbiage in a specification.  I'm not interested in cases where it
happens to work on single device for a certain type of error.  The link
reset is the bare metal mechanism for recovering from a fatal error and
it should be our mechanism as well unless there's spec wording to
support another approach.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 7, 2016, 2:49 a.m. UTC | #22
On 12/06/2016 11:35 PM, Alex Williamson wrote:
> On Tue, 6 Dec 2016 18:46:04 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> On 12/06/2016 12:59 PM, Alex Williamson wrote:
>>> On Tue, 6 Dec 2016 05:55:28 +0200
>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>   
>>>> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:  
>>>>> If you're going to take the lead for these AER patches, I would
>>>>> certainly suggest that understanding the reasoning behind the bus reset
>>>>> behavior is a central aspect to this series.  This effort has dragged
>>>>> out for nearly two years and I apologize, but I don't really have a lot
>>>>> of patience for rehashing some of these issues if you're not going to
>>>>> read the previous discussions or consult with your colleagues to
>>>>> understand how we got to this point.  If you want to challenge some of
>>>>> the design points, that's great, it could use some new eyes, but please
>>>>> understand how we got here first.    
>>>>
>>>> Well I'm guessing Cao jin here isn't the only one not
>>>> willing to plough through all historical versions of the patchset
>>>> just to figure out the motivation for some code.
>>>>
>>>> Including a summary of a high level architecture couldn't hurt.
>>>>
>>>> Any chance of writing such?  Alternatively, we can try to build it as
>>>> part of this thread.  Shouldn't be hard as it seems somewhat
>>>> straight-forward on the surface:
>>>>
>>>> - detect link error on the host, don't reset link as we would normally do  
>>>
>>> This is actually a new approach that I'm not sure I agree with.  By
>>> skipping the host directed link reset, vfio is taking responsibility
>>> for doing this, but then we just assume the user will do it.  I have
>>> issues with this.
>>>
>>> The previous approach was to use the error detected notifier to block
>>> access to the device, allowing the host to perform the link reset.  A
>>> subsequent notification in the AER process released the user access
>>> which allowed the user AER process to proceed.  This did result in both
>>> a host directed and a guest directed link reset, but other than
>>> coordinating the blocking of the user process during host reset, that
>>> hasn't been brought up as an issue previously.
>>>   
>>
>> Tests on previous versions didn't bring up issues as I find, I think
>> that is because we didn't test it completely. As I know, before August
>> of this year, we didn't have cable connected to NIC, let alone
>> connecting NIC to gateway.
> 
> Lack of testing has been a significant issue throughout the development
> of this series.
> 
>> Even if I fixed the guest oops issue in igb driver that Alex found in
>> v9, v9 still cannot work in my test. And in my test, disable link
>> reset(in host) in aer core for vfio-pci is the most significant step to
>> get my test passed.
> 
> But is it the correct step?  I'm not convinced.  Why did blocking guest
> access not work?  How do you plan to manage vfio taking the
> responsibility to perform a bus reset when you don't know whether QEMU
> is the user of the device or whether the user supports AER recovery?
>  
>>>> - report link error to guest
>>>> - detect link reset request from guest
>>>> - reset link on host
>>>>
>>>> Since link reset will reset all devices behind it, for this to work we
>>>> need same set of devices behind the link in host and guest.  Enforcing
>>>> this would be nice to have.  
>>>
>>> This is a pretty significant complication and I think it's a
>>> requirement.  This is the heart of why we have an AER vfio-pci device
>>> option and why we require that QEMU should fail to initialize the
>>> device if AER is enabled in an incompatible configuration.  If a user
>>> cannot be sure that AER is enabled on a device, it's pointless to
>>> bother implementing it, though honestly I question the value of it in
>>> the VM altogether given configuration requirements (ie. are users
>>> going to accept the reason that all the ports of a device need to be
>>> assigned to a single guest for guest-based AER recovery when they were
>>> previously able to assign each port to a separate VM?).
>>>    
>>>> - as link now might end up in bad state, reset
>>>>   it when device is unassigned  
>>>
>>> This is also a new aspect for the approach here, previously we allowed
>>> the host directed link reset so we could assume the device was
>>> sufficiently recovered.  In the proposal here, the AER core skips any
>>> devices bound to vfio-pci, but vfio can't guarantee that we can do a
>>> bus reset on them.  PCI bus isolation is not accounted for in DMA
>>> isolation, which is the basis for IOMMU groups.  A bus can host
>>> multiple IOMMU groups, each of which may have a different user.  Only
>>> in a very specific configuration can vfio do a bus reset.
>>>    
>>>> Any details I missed?  
>>>
>>> AIUI, the critical feature is that the guest needs to be able to reset
>>> the device link, all the other design elements play out from the
>>> logical expansion of that feature.  It means that a guest bus reset
>>> needs to translate to a host bus reset, which means that all of the
>>> affected host devices need to be accounted for and those that are
>>> assigned to the guest need to be affected in the guest in the same
>>> way.  QEMU must enforce this configuration or else a user cannot know
>>> the result of a AER fault, ie. will it cause a VMSTOP condition or is
>>> the fault forwarded to the guest.  The required configuration
>>> restrictions are quite involved, therefore we can't simply require this
>>> of all configurations, so a vfio-pci device option is added.  The
>>> coordination of the host directed reset with the guest directed reset
>>> was only a complications discovered within the last few revisions of
>>> the series.  As noted above, the previous solution to this was to
>>> attempt to block access to the device while the host reset proceeds.
>>>
>>> Clearly it's a little disconcerting if we throw all of that away and
>>> simply assume that an FLR is sufficient to reset the device when it
>>> seems like link issues might be a nontrivial source of AER faults.  If
>>> FLR is sufficient, then why does the core AER handling code in the
>>> kernel not simply do this?  Thanks,
>>>   
>>
>> I agree with the points here.  Now I understand why  translate a guest
>> link reset to host link reset[*], and FLR shouldn't be equivalent to
>> link reset, but the PITY is, we can't trigger a real fatal error to see
>> if a FLR is sufficient to reset the device.
> 
> In this case, it's not a matter of finding a scenario where it works.
> Using FLR to recover from a fatal error would need to be supported by
> verbiage in a specification.  I'm not interested in cases where it
> happens to work on single device for a certain type of error.  The link
> reset is the bare metal mechanism for recovering from a fatal error and
> it should be our mechanism as well unless there's spec wording to
> support another approach.  Thanks,
> 

Ok, I understand your points here, will drag that dropped patch back and
test.
Cao jin Dec. 7, 2016, 2:58 a.m. UTC | #23
On 12/06/2016 11:25 PM, Alex Williamson wrote:
> On Tue, 6 Dec 2016 14:11:03 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> On 12/06/2016 12:17 AM, Alex Williamson wrote:
>>> On Mon, 5 Dec 2016 13:52:03 +0800
>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>   
>>>> On 12/04/2016 11:30 PM, Alex Williamson wrote:  
>>>>> On Sun, 4 Dec 2016 20:16:42 +0800
>>>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>>>     
>>>>>> On 12/01/2016 10:55 PM, Alex Williamson wrote:    
>>>>>>> On Thu, 1 Dec 2016 21:40:00 +0800      
>>>>>>    
>>>>>>>>> If an AER fault occurs and the user doesn't do a reset, what
>>>>>>>>> happens when that device is released and a host driver tries to make
>>>>>>>>> use of it?  The user makes no commitment to do a reset and there are
>>>>>>>>> only limited configurations where we even allow the user to perform a
>>>>>>>>> reset.
>>>>>>>>>         
>>>>>>>>
>>>>>>>> Limited? Do you mean the things __pci_dev_reset() can do?      
>>>>>>>
>>>>>>> I mean that there are significant device and guest configuration
>>>>>>> restrictions in order to support AER.  For instance, all the functions
>>>>>>> of the slot need to appear in a PCI-e topology in the guest with all
>>>>>>> the functions in the right place such that a guest bus reset translates
>>>>>>> into a host bus reset.  The physical functions cannot be split between
>>>>>>> guests even if IOMMU isolation would otherwise allow it.  The user
>>>>>>> needs to explicitly enable AER support for the devices.  A VM need to
>>>>>>> be specifically configured for AER support in order to set any sort of
>>>>>>> expectations of a guest directed bus reset, let alone a guarantee that
>>>>>>> it will happen.  So all the existing VMs, where functions are split
>>>>>>> between guests, or the topology isn't exactly right, or AER isn't
>>>>>>> enabled see a regression from the above change as the device is no
>>>>>>> longer reset.
>>>>>>>       
>>>>>>
>>>>>> I am not clear why set these restrictions in the current design. I take
>>>>>> a glance at older versions of qemu's patchset, their thoughts is:
>>>>>> translate a guest bus reset into a host bus reset(Which is
>>>>>> unreasonable[*] to me). And I guess, that's the *cause* of these
>>>>>> restrictions?  Is there any other stories behind these restrictions?
>>>>>>
>>>>>> [*] In physical world, set bridge's secondary bus reset would send
>>>>>> hot-reset TLP to all functions below, trigger every device's reset
>>>>>> separately. Emulated device should behave the same, means just using
>>>>>> each device's DeviceClass->reset method.    
>>>>>
>>>>> Are you trying to say that an FLR is equivalent to a link reset?    
>>>>
>>>> No.  Look at old versions patchset, there is one names "vote the
>>>> function 0 to do host bus reset when aer occurred"[1], that is what I
>>>> called "translate guest link reset to host link reset", and what I think
>>>> unreasonable(and I think it also does it wrongly).  So in v10 version of
>>>> mine, I dropped it.
>>>>
>>>> [1]https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg02987.html
>>>>
>>>> If "translate guest link reset to host link reset" is right, I can
>>>> understand these restrictions[2][3].
>>>>
>>>> [2]. All physical functions in a single card must be assigned to the VM
>>>>      with AER enabled on each and configured on the same virtual bus.
>>>> [3]. Don't place other devices under the virtual bus in [2], no matter
>>>>      physical, emulated, or paravirtual, even if other device
>>>>      supporting AER signaling
>>>>
>>>> Certain device's FLR calls its DeviceClass->reset method; link reset
>>>> calls DeviceClass->reset of each device which on the bus. So, apparently
>>>> they have difference.  But if there is only 1 vfio-pci device under the
>>>> virtual pci bus,  I think FLR can be equivalent to a link reset, right?  
>>>
>>> No.  An FLR resets the device while a secondary bus reset does a reset
>>> of the link and the device.  AER errors are sometimes issues with the
>>> link, not the device.  If we were to perform only an FLR, we're not
>>> performing the same reset as would be done on bare metal.
>>>    
>>
>> Thanks for you explanation, it does helps, except the last sentence, I
>> think I understand it now: fatal error implies there may be link issue
>> exists(pci express spec: 6.2.2.2.1), so, should do link reset for fatal
>> error(that is what and why aer core does). And so, in patch[1] above,
>> qemu does a link reset when seeing secondary bus reset bit of virtual
>> bus got set. is it right?
> 
> QEMU is only going to do a bus reset if the guest is participating in
> AER recovery AND QEMU supports AER recovery AND the guest topology
> configuration allows the guest bus reset to induce a host bus reset.
> vfio does not know that QEMU is the user and cannot assume the user
> will perform a bus reset.  We need to give the user the ability to
> recover from an AER, but not rely on the user to do so.  We cannot
> assume the user's intention or capabilities.  Thanks,
> 

Got your points, thanks a lot, will consider it more,
Cao jin Dec. 8, 2016, 2:46 p.m. UTC | #24
On 12/06/2016 11:35 PM, Alex Williamson wrote:
> On Tue, 6 Dec 2016 18:46:04 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> On 12/06/2016 12:59 PM, Alex Williamson wrote:
>>> On Tue, 6 Dec 2016 05:55:28 +0200
>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>   
>>>> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:  
>>>>> If you're going to take the lead for these AER patches, I would
>>>>> certainly suggest that understanding the reasoning behind the bus reset
>>>>> behavior is a central aspect to this series.  This effort has dragged
>>>>> out for nearly two years and I apologize, but I don't really have a lot
>>>>> of patience for rehashing some of these issues if you're not going to
>>>>> read the previous discussions or consult with your colleagues to
>>>>> understand how we got to this point.  If you want to challenge some of
>>>>> the design points, that's great, it could use some new eyes, but please
>>>>> understand how we got here first.    
>>>>
>>>> Well I'm guessing Cao jin here isn't the only one not
>>>> willing to plough through all historical versions of the patchset
>>>> just to figure out the motivation for some code.
>>>>
>>>> Including a summary of a high level architecture couldn't hurt.
>>>>
>>>> Any chance of writing such?  Alternatively, we can try to build it as
>>>> part of this thread.  Shouldn't be hard as it seems somewhat
>>>> straight-forward on the surface:
>>>>
>>>> - detect link error on the host, don't reset link as we would normally do  
>>>
>>> This is actually a new approach that I'm not sure I agree with.  By
>>> skipping the host directed link reset, vfio is taking responsibility
>>> for doing this, but then we just assume the user will do it.  I have
>>> issues with this.
>>>
>>> The previous approach was to use the error detected notifier to block
>>> access to the device, allowing the host to perform the link reset.  A
>>> subsequent notification in the AER process released the user access
>>> which allowed the user AER process to proceed.  This did result in both
>>> a host directed and a guest directed link reset, but other than
>>> coordinating the blocking of the user process during host reset, that
>>> hasn't been brought up as an issue previously.
>>>   
>>
>> Tests on previous versions didn't bring up issues as I find, I think
>> that is because we didn't test it completely. As I know, before August
>> of this year, we didn't have cable connected to NIC, let alone
>> connecting NIC to gateway.
> 
> Lack of testing has been a significant issue throughout the development
> of this series.
> 
>> Even if I fixed the guest oops issue in igb driver that Alex found in
>> v9, v9 still cannot work in my test. And in my test, disable link
>> reset(in host) in aer core for vfio-pci is the most significant step to
>> get my test passed.
> 
> But is it the correct step?  I'm not convinced.  Why did blocking guest
> access not work?  How do you plan to manage vfio taking the
> responsibility to perform a bus reset when you don't know whether QEMU
> is the user of the device or whether the user supports AER recovery?
>  

Maybe currently we don't have enough proof to prove the correctness, but
I think I did find some facts to prove that link reset in host is a big
trouble, and can answer part of questions above.

1st, some thoughts:
In pci-error-recovery.txt and do_recovery() of kernel tree, we can see,
a recovery consists of several steps(callbacks), link reset is one of
them, and except link reset, the others are seems kind of device
specific. In our case, both host & guest will do recovery, I think the
host recovery actually is some kind of fake recovery, see vfio-pci
driver's error_detected & resume callback, they don't do anything
special, mainly signal error to user, but the link reset in host "fake
reset" does some serious work, in other words, I think host does the
recovery incompletely, so I was thinking, why not just drop incompletely
host recovery(drop link reset) for vfio-pci, and let the guest take care
of the whole serious recovery.  This is part of the reason of why my
version looks like this.  But yes, I admit the issue Alex mentioned,
vfio can't guarantee that user will do a bus reset, this is an issue I
will keep looking for a solution.

2nd, some facts and analyzation from test:
In host, the relationship between time and behviour in each component
roughly looks as following:

     +   HW    +  host kernel   +     qemu      + guest kernel  +
     |         |(error recovery)|               |               |
     |         |                |               |               |
     |         | vfio-pci's     |               |               |
     |         | error_detected |               |               |
     |         | +              |               |               |
     |         | |              |               |               |
     |         | | error notify |               |               |
     |         | | via eventfd  |               |               |
     |         | +---------------> +----------+ |               |
     |         |                |  +vfio_err_ | |               |
     |         |                |  |notifier_ | |               |
     | +---- +<---+link reset   |  |handler   | |               |
     | | HW  | |                |  |          | |               |       
  | |     | |                |             | |               |
     | |r    | | vfio-pci's     |  |pass aer  | |               |
     | |e..  | | resume         |  |to guest  | |               |
     | |s.   | | (blocking end) |  |          | |               |
     | |e    | |                |  |   *2*    | |               |
     | |t    | |                |  +----+-----+ |               |
     | |i    | |                |       |       |               |
     | |n    | |                |       +--------> +----------+ |
     | |g    | |                |               |  | guest    | |
     | |     | |                |               |  | recovery | |
     | |     | |                |               |  | process  | |
     | |     | |                |               |  |(include  | |
     | |     | |                |               |  |register  | |
     | | *1* | |                |               |  |access)   | |
     | |     | |                |               |  |          | |
     | |     | |                |               |  |   *3*    | |
     |         |                |               |  +----------+ |
Time |         |                |               |               |
     |         |                |               |               |
     |         |                |               |               |
     |         |                |               |               |
     v         |                |               |               |

Now let me try to answer: Why did blocking guest access not work?
Some important factor:
1. host recovery doesn't do anything special except error notifying, so
it may be executed very fast.
2. Hardware resetting time is not sure, from pcie spec 6.6.1, guessing
it need many ms, pretty long?

some facts found in v9(block config write, not read, during host
recovery) test:
1. reading uncor error register in vfio_err_notifier_handler sometimes
returns correct value, sometimes return invalid value(All F's)

So, I am thinking, if host blocking on host end early, and *2*, *3* is
parallel with *1*, the way used in v9 to blocking guest access, may not
work.
Michael S. Tsirkin Dec. 8, 2016, 4:30 p.m. UTC | #25
On Thu, Dec 08, 2016 at 10:46:59PM +0800, Cao jin wrote:
> 
> 
> On 12/06/2016 11:35 PM, Alex Williamson wrote:
> > On Tue, 6 Dec 2016 18:46:04 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > 
> >> On 12/06/2016 12:59 PM, Alex Williamson wrote:
> >>> On Tue, 6 Dec 2016 05:55:28 +0200
> >>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >>>   
> >>>> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:  
> >>>>> If you're going to take the lead for these AER patches, I would
> >>>>> certainly suggest that understanding the reasoning behind the bus reset
> >>>>> behavior is a central aspect to this series.  This effort has dragged
> >>>>> out for nearly two years and I apologize, but I don't really have a lot
> >>>>> of patience for rehashing some of these issues if you're not going to
> >>>>> read the previous discussions or consult with your colleagues to
> >>>>> understand how we got to this point.  If you want to challenge some of
> >>>>> the design points, that's great, it could use some new eyes, but please
> >>>>> understand how we got here first.    
> >>>>
> >>>> Well I'm guessing Cao jin here isn't the only one not
> >>>> willing to plough through all historical versions of the patchset
> >>>> just to figure out the motivation for some code.
> >>>>
> >>>> Including a summary of a high level architecture couldn't hurt.
> >>>>
> >>>> Any chance of writing such?  Alternatively, we can try to build it as
> >>>> part of this thread.  Shouldn't be hard as it seems somewhat
> >>>> straight-forward on the surface:
> >>>>
> >>>> - detect link error on the host, don't reset link as we would normally do  
> >>>
> >>> This is actually a new approach that I'm not sure I agree with.  By
> >>> skipping the host directed link reset, vfio is taking responsibility
> >>> for doing this, but then we just assume the user will do it.  I have
> >>> issues with this.
> >>>
> >>> The previous approach was to use the error detected notifier to block
> >>> access to the device, allowing the host to perform the link reset.  A
> >>> subsequent notification in the AER process released the user access
> >>> which allowed the user AER process to proceed.  This did result in both
> >>> a host directed and a guest directed link reset, but other than
> >>> coordinating the blocking of the user process during host reset, that
> >>> hasn't been brought up as an issue previously.
> >>>   
> >>
> >> Tests on previous versions didn't bring up issues as I find, I think
> >> that is because we didn't test it completely. As I know, before August
> >> of this year, we didn't have cable connected to NIC, let alone
> >> connecting NIC to gateway.
> > 
> > Lack of testing has been a significant issue throughout the development
> > of this series.
> > 
> >> Even if I fixed the guest oops issue in igb driver that Alex found in
> >> v9, v9 still cannot work in my test. And in my test, disable link
> >> reset(in host) in aer core for vfio-pci is the most significant step to
> >> get my test passed.
> > 
> > But is it the correct step?  I'm not convinced.  Why did blocking guest
> > access not work?  How do you plan to manage vfio taking the
> > responsibility to perform a bus reset when you don't know whether QEMU
> > is the user of the device or whether the user supports AER recovery?
> >  
> 
> Maybe currently we don't have enough proof to prove the correctness, but
> I think I did find some facts to prove that link reset in host is a big
> trouble, and can answer part of questions above.
> 
> 1st, some thoughts:
> In pci-error-recovery.txt and do_recovery() of kernel tree, we can see,
> a recovery consists of several steps(callbacks), link reset is one of
> them, and except link reset, the others are seems kind of device
> specific. In our case, both host & guest will do recovery, I think the
> host recovery actually is some kind of fake recovery, see vfio-pci
> driver's error_detected & resume callback, they don't do anything
> special, mainly signal error to user, but the link reset in host "fake
> reset" does some serious work, in other words, I think host does the
> recovery incompletely, so I was thinking, why not just drop incompletely
> host recovery(drop link reset) for vfio-pci, and let the guest take care
> of the whole serious recovery.  This is part of the reason of why my
> version looks like this.  But yes, I admit the issue Alex mentioned,
> vfio can't guarantee that user will do a bus reset, this is an issue I
> will keep looking for a solution.
> 
> 2nd, some facts and analyzation from test:
> In host, the relationship between time and behviour in each component
> roughly looks as following:
> 
>      +   HW    +  host kernel   +     qemu      + guest kernel  +
>      |         |(error recovery)|               |               |
>      |         |                |               |               |
>      |         | vfio-pci's     |               |               |
>      |         | error_detected |               |               |
>      |         | +              |               |               |
>      |         | |              |               |               |
>      |         | | error notify |               |               |
>      |         | | via eventfd  |               |               |
>      |         | +---------------> +----------+ |               |
>      |         |                |  +vfio_err_ | |               |
>      |         |                |  |notifier_ | |               |
>      | +---- +<---+link reset   |  |handler   | |               |
>      | | HW  | |                |  |          | |               |       
>   | |     | |                |             | |               |
>      | |r    | | vfio-pci's     |  |pass aer  | |               |
>      | |e..  | | resume         |  |to guest  | |               |
>      | |s.   | | (blocking end) |  |          | |               |
>      | |e    | |                |  |   *2*    | |               |
>      | |t    | |                |  +----+-----+ |               |
>      | |i    | |                |       |       |               |
>      | |n    | |                |       +--------> +----------+ |
>      | |g    | |                |               |  | guest    | |
>      | |     | |                |               |  | recovery | |
>      | |     | |                |               |  | process  | |
>      | |     | |                |               |  |(include  | |
>      | |     | |                |               |  |register  | |
>      | | *1* | |                |               |  |access)   | |
>      | |     | |                |               |  |          | |
>      | |     | |                |               |  |   *3*    | |
>      |         |                |               |  +----------+ |
> Time |         |                |               |               |
>      |         |                |               |               |
>      |         |                |               |               |
>      |         |                |               |               |
>      v         |                |               |               |
> 
> Now let me try to answer: Why did blocking guest access not work?
> Some important factor:
> 1. host recovery doesn't do anything special except error notifying, so
> it may be executed very fast.
> 2. Hardware resetting time is not sure, from pcie spec 6.6.1, guessing
> it need many ms, pretty long?
> 
> some facts found in v9(block config write, not read, during host
> recovery) test:
> 1. reading uncor error register in vfio_err_notifier_handler sometimes
> returns correct value, sometimes return invalid value(All F's)
> 
> So, I am thinking, if host blocking on host end early, and *2*, *3* is
> parallel with *1*, the way used in v9 to blocking guest access, may not
> work.

I don't think this answers Alex's question. He is simply asking
which kind of errors are we recovering from, and why is FLR
sufficient. All this info is completely lacking in
current linux code, too. Why does it reset the link?
Express spec certainly does not say we should.
Link reset is the strongest kind of reset we can do though,
so maybe the thinking goes, let's just do it.

You would like to weaken this, but it's hard to
be sure it's enough, we are dealing with misbehaving
hardware after all.

So what would help, is a list:

- error type
- recovery required
- how to do it in a VM

As for blocking guest access, I'm not sure why it's important
to do. Guests can put devices in bad states, that's a given.
Why isn't it enough to tell guest that device is in a bad state?
I think it will try to avoid accesses then, if it does not,
it is not supposed to do anything too bad.

> -- 
> Sincerely,
> Cao jin
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 9, 2016, 3:40 a.m. UTC | #26
On 12/09/2016 12:30 AM, Michael S. Tsirkin wrote:
> On Thu, Dec 08, 2016 at 10:46:59PM +0800, Cao jin wrote:
>>
>>
>> On 12/06/2016 11:35 PM, Alex Williamson wrote:
>>> On Tue, 6 Dec 2016 18:46:04 +0800
>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>
>>>> On 12/06/2016 12:59 PM, Alex Williamson wrote:
>>>>> On Tue, 6 Dec 2016 05:55:28 +0200
>>>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>>>   
>>>>>> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:  
>>>>>>> If you're going to take the lead for these AER patches, I would
>>>>>>> certainly suggest that understanding the reasoning behind the bus reset
>>>>>>> behavior is a central aspect to this series.  This effort has dragged
>>>>>>> out for nearly two years and I apologize, but I don't really have a lot
>>>>>>> of patience for rehashing some of these issues if you're not going to
>>>>>>> read the previous discussions or consult with your colleagues to
>>>>>>> understand how we got to this point.  If you want to challenge some of
>>>>>>> the design points, that's great, it could use some new eyes, but please
>>>>>>> understand how we got here first.    
>>>>>>
>>>>>> Well I'm guessing Cao jin here isn't the only one not
>>>>>> willing to plough through all historical versions of the patchset
>>>>>> just to figure out the motivation for some code.
>>>>>>
>>>>>> Including a summary of a high level architecture couldn't hurt.
>>>>>>
>>>>>> Any chance of writing such?  Alternatively, we can try to build it as
>>>>>> part of this thread.  Shouldn't be hard as it seems somewhat
>>>>>> straight-forward on the surface:
>>>>>>
>>>>>> - detect link error on the host, don't reset link as we would normally do  
>>>>>
>>>>> This is actually a new approach that I'm not sure I agree with.  By
>>>>> skipping the host directed link reset, vfio is taking responsibility
>>>>> for doing this, but then we just assume the user will do it.  I have
>>>>> issues with this.
>>>>>
>>>>> The previous approach was to use the error detected notifier to block
>>>>> access to the device, allowing the host to perform the link reset.  A
>>>>> subsequent notification in the AER process released the user access
>>>>> which allowed the user AER process to proceed.  This did result in both
>>>>> a host directed and a guest directed link reset, but other than
>>>>> coordinating the blocking of the user process during host reset, that
>>>>> hasn't been brought up as an issue previously.
>>>>>   
>>>>
>>>> Tests on previous versions didn't bring up issues as I find, I think
>>>> that is because we didn't test it completely. As I know, before August
>>>> of this year, we didn't have cable connected to NIC, let alone
>>>> connecting NIC to gateway.
>>>
>>> Lack of testing has been a significant issue throughout the development
>>> of this series.
>>>
>>>> Even if I fixed the guest oops issue in igb driver that Alex found in
>>>> v9, v9 still cannot work in my test. And in my test, disable link
>>>> reset(in host) in aer core for vfio-pci is the most significant step to
>>>> get my test passed.
>>>
>>> But is it the correct step?  I'm not convinced.  Why did blocking guest
>>> access not work?  How do you plan to manage vfio taking the
>>> responsibility to perform a bus reset when you don't know whether QEMU
>>> is the user of the device or whether the user supports AER recovery?
>>>  
>>
>> Maybe currently we don't have enough proof to prove the correctness, but
>> I think I did find some facts to prove that link reset in host is a big
>> trouble, and can answer part of questions above.
>>
>> 1st, some thoughts:
>> In pci-error-recovery.txt and do_recovery() of kernel tree, we can see,
>> a recovery consists of several steps(callbacks), link reset is one of
>> them, and except link reset, the others are seems kind of device
>> specific. In our case, both host & guest will do recovery, I think the
>> host recovery actually is some kind of fake recovery, see vfio-pci
>> driver's error_detected & resume callback, they don't do anything
>> special, mainly signal error to user, but the link reset in host "fake
>> reset" does some serious work, in other words, I think host does the
>> recovery incompletely, so I was thinking, why not just drop incompletely
>> host recovery(drop link reset) for vfio-pci, and let the guest take care
>> of the whole serious recovery.  This is part of the reason of why my
>> version looks like this.  But yes, I admit the issue Alex mentioned,
>> vfio can't guarantee that user will do a bus reset, this is an issue I
>> will keep looking for a solution.
>>
>> 2nd, some facts and analyzation from test:
>> In host, the relationship between time and behviour in each component
>> roughly looks as following:
>>
>>      +   HW    +  host kernel   +     qemu      + guest kernel  +
>>      |         |(error recovery)|               |               |
>>      |         |                |               |               |
>>      |         | vfio-pci's     |               |               |
>>      |         | error_detected |               |               |
>>      |         | +              |               |               |
>>      |         | |              |               |               |
>>      |         | | error notify |               |               |
>>      |         | | via eventfd  |               |               |
>>      |         | +---------------> +----------+ |               |
>>      |         |                |  +vfio_err_ | |               |
>>      |         |                |  |notifier_ | |               |
>>      | +---- +<---+link reset   |  |handler   | |               |
>>      | | HW  | |                |  |          | |               |       
>>      | |     | |                |             | |               |
>>      | |r    | | vfio-pci's     |  |pass aer  | |               |
>>      | |e..  | | resume         |  |to guest  | |               |
>>      | |s.   | | (blocking end) |  |          | |               |
>>      | |e    | |                |  |   *2*    | |               |
>>      | |t    | |                |  +----+-----+ |               |
>>      | |i    | |                |       |       |               |
>>      | |n    | |                |       +--------> +----------+ |
>>      | |g    | |                |               |  | guest    | |
>>      | |     | |                |               |  | recovery | |
>>      | |     | |                |               |  | process  | |
>>      | |     | |                |               |  |(include  | |
>>      | |     | |                |               |  |register  | |
>>      | | *1* | |                |               |  |access)   | |
>>      | |     | |                |               |  |          | |
>>      | |     | |                |               |  |   *3*    | |
>>      |         |                |               |  +----------+ |
>> Time |         |                |               |               |
>>      |         |                |               |               |
>>      |         |                |               |               |
>>      |         |                |               |               |
>>      v         |                |               |               |
>>
>> Now let me try to answer: Why did blocking guest access not work?
>> Some important factor:
>> 1. host recovery doesn't do anything special except error notifying, so
>> it may be executed very fast.
>> 2. Hardware resetting time is not sure, from pcie spec 6.6.1, guessing
>> it need many ms, pretty long?
>>
>> some facts found in v9(block config write, not read, during host
>> recovery) test:
>> 1. reading uncor error register in vfio_err_notifier_handler sometimes
>> returns correct value, sometimes return invalid value(All F's)
>>
>> So, I am thinking, if host blocking on host end early, and *2*, *3* is
>> parallel with *1*, the way used in v9 to blocking guest access, may not
>> work.
> 
> I don't think this answers Alex's question. He is simply asking
> which kind of errors are we recovering from, and why is FLR
> sufficient. All this info is completely lacking in
> current linux code, too. Why does it reset the link?
> Express spec certainly does not say we should.
> Link reset is the strongest kind of reset we can do though,

I feel there is a mistake here. I have said several times that I
understand and agree with Alex's points: if I disable host link reset,
then "guest bus reset should induce a host bus reset" on fatal error,
and FLR is not sufficient.  This is a fault in this version, I will
address it in next version.

Alex's question "Why did blocking guest access not work?", make me feel
he is talking the way used in last version kernel patch[*], to block
config write, but not read, in host recovery. I guess it is also a
result of long discussion. But it[*] doesn't work well, that's why I
draw a figure to explain.

[*]http://lists.nongnu.org/archive/html/qemu-devel/2016-08/msg00155.html

I am aware of "Link reset is the strongest kind of reset we can do
though", I even spent a whole day on "what the link reset exactly means"
digging in express spec, and induced some unrestrained thoughts:

(I don't have electrical engineering background, and the following
thoughts don't have relationship with our discussion)
set secondary bus reset bit direct physical layer to send TS1(or TS2)
twice(express spec 4.2.5.11) on the link, device received these TS, then
reset itself(my inference, didn't see exact words). So if data(TS) can
still be transferred correctly on physical layer, is link unreliable? or
the *unreliable* is targeting data link layer and transaction Layer?

> so maybe the thinking goes, let's just do it.
> 

Sure I will. But I still have thoughts unsolved on the restrictions of
configuration, there may be another mail on this topic later.

> You would like to weaken this, but it's hard to
> be sure it's enough, we are dealing with misbehaving
> hardware after all.
> 
> So what would help, is a list:
> 
> - error type
> - recovery required
> - how to do it in a VM
> 
> As for blocking guest access, I'm not sure why it's important
> to do. Guests can put devices in bad states, that's a given.
> Why isn't it enough to tell guest that device is in a bad state?
> I think it will try to avoid accesses then, if it does not,
> it is not supposed to do anything too bad.
> 
>> -- 
>> Sincerely,
>> Cao jin
>>
> 
> 
> .
>
Cao jin Dec. 9, 2016, 3:40 a.m. UTC | #27
On 12/08/2016 10:46 PM, Cao jin wrote:
> 
> 
> On 12/06/2016 11:35 PM, Alex Williamson wrote:
>> On Tue, 6 Dec 2016 18:46:04 +0800
>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>
>>> On 12/06/2016 12:59 PM, Alex Williamson wrote:
>>>> On Tue, 6 Dec 2016 05:55:28 +0200
>>>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>>   
>>>>> On Mon, Dec 05, 2016 at 09:17:30AM -0700, Alex Williamson wrote:  
>>>>>> If you're going to take the lead for these AER patches, I would
>>>>>> certainly suggest that understanding the reasoning behind the bus reset
>>>>>> behavior is a central aspect to this series.  This effort has dragged
>>>>>> out for nearly two years and I apologize, but I don't really have a lot
>>>>>> of patience for rehashing some of these issues if you're not going to
>>>>>> read the previous discussions or consult with your colleagues to
>>>>>> understand how we got to this point.  If you want to challenge some of
>>>>>> the design points, that's great, it could use some new eyes, but please
>>>>>> understand how we got here first.    
>>>>>
>>>>> Well I'm guessing Cao jin here isn't the only one not
>>>>> willing to plough through all historical versions of the patchset
>>>>> just to figure out the motivation for some code.
>>>>>
>>>>> Including a summary of a high level architecture couldn't hurt.
>>>>>
>>>>> Any chance of writing such?  Alternatively, we can try to build it as
>>>>> part of this thread.  Shouldn't be hard as it seems somewhat
>>>>> straight-forward on the surface:
>>>>>
>>>>> - detect link error on the host, don't reset link as we would normally do  
>>>>
>>>> This is actually a new approach that I'm not sure I agree with.  By
>>>> skipping the host directed link reset, vfio is taking responsibility
>>>> for doing this, but then we just assume the user will do it.  I have
>>>> issues with this.
>>>>
>>>> The previous approach was to use the error detected notifier to block
>>>> access to the device, allowing the host to perform the link reset.  A
>>>> subsequent notification in the AER process released the user access
>>>> which allowed the user AER process to proceed.  This did result in both
>>>> a host directed and a guest directed link reset, but other than
>>>> coordinating the blocking of the user process during host reset, that
>>>> hasn't been brought up as an issue previously.
>>>>   
>>>
>>> Tests on previous versions didn't bring up issues as I find, I think
>>> that is because we didn't test it completely. As I know, before August
>>> of this year, we didn't have cable connected to NIC, let alone
>>> connecting NIC to gateway.
>>
>> Lack of testing has been a significant issue throughout the development
>> of this series.
>>
>>> Even if I fixed the guest oops issue in igb driver that Alex found in
>>> v9, v9 still cannot work in my test. And in my test, disable link
>>> reset(in host) in aer core for vfio-pci is the most significant step to
>>> get my test passed.
>>
>> But is it the correct step?  I'm not convinced.  Why did blocking guest
>> access not work?  How do you plan to manage vfio taking the
>> responsibility to perform a bus reset when you don't know whether QEMU
>> is the user of the device or whether the user supports AER recovery?
>>  
> 
> Maybe currently we don't have enough proof to prove the correctness, but
> I think I did find some facts to prove that link reset in host is a big
> trouble, and can answer part of questions above.
> 
> 1st, some thoughts:
> In pci-error-recovery.txt and do_recovery() of kernel tree, we can see,
> a recovery consists of several steps(callbacks), link reset is one of
> them, and except link reset, the others are seems kind of device
> specific. In our case, both host & guest will do recovery, I think the
> host recovery actually is some kind of fake recovery, see vfio-pci
> driver's error_detected & resume callback, they don't do anything
> special, mainly signal error to user, but the link reset in host "fake
> reset" does some serious work, in other words, I think host does the
> recovery incompletely, so I was thinking, why not just drop incompletely
> host recovery(drop link reset) for vfio-pci, and let the guest take care
> of the whole serious recovery.  This is part of the reason of why my
> version looks like this.  But yes, I admit the issue Alex mentioned,
> vfio can't guarantee that user will do a bus reset, this is an issue I
> will keep looking for a solution.
> 
> 2nd, some facts and analyzation from test:
> In host, the relationship between time and behviour in each component
> roughly looks as following:
> 
>      +   HW    +  host kernel   +     qemu      + guest kernel  +
>      |         |(error recovery)|               |               |
>      |         |                |               |               |
>      |         | vfio-pci's     |               |               |
>      |         | error_detected |               |               |
>      |         | +              |               |               |
>      |         | |              |               |               |
>      |         | | error notify |               |               |
>      |         | | via eventfd  |               |               |
>      |         | +---------------> +----------+ |               |
>      |         |                |  +vfio_err_ | |               |
>      |         |                |  |notifier_ | |               |
>      | +---- +<---+link reset   |  |handler   | |               |
>      | | HW  | |                |  |          | |               |       
>   | |     | |                |             | |               |
>      | |r    | | vfio-pci's     |  |pass aer  | |               |
>      | |e..  | | resume         |  |to guest  | |               |
>      | |s.   | | (blocking end) |  |          | |               |
>      | |e    | |                |  |   *2*    | |               |
>      | |t    | |                |  +----+-----+ |               |
>      | |i    | |                |       |       |               |
>      | |n    | |                |       +--------> +----------+ |
>      | |g    | |                |               |  | guest    | |
>      | |     | |                |               |  | recovery | |
>      | |     | |                |               |  | process  | |
>      | |     | |                |               |  |(include  | |
>      | |     | |                |               |  |register  | |
>      | | *1* | |                |               |  |access)   | |
>      | |     | |                |               |  |          | |
>      | |     | |                |               |  |   *3*    | |
>      |         |                |               |  +----------+ |
> Time |         |                |               |               |
>      |         |                |               |               |
>      |         |                |               |               |
>      |         |                |               |               |
>      v         |                |               |               |
> 
> Now let me try to answer: Why did blocking guest access not work?
> Some important factor:
> 1. host recovery doesn't do anything special except error notifying, so
> it may be executed very fast.
> 2. Hardware resetting time is not sure, from pcie spec 6.6.1, guessing
> it need many ms, pretty long?
> 
> some facts found in v9(block config write, not read, during host
> recovery) test:
> 1. reading uncor error register in vfio_err_notifier_handler sometimes
> returns correct value, sometimes return invalid value(All F's)
> 

I forget another facts:
2. the igb bug[*] our patch v9 triggered also is the same kind things.
error_detected() of igb in guest read config space, and return invalid
all F's, then igb NULLed its hardware address, then oops shows.

I think it is also a concurrent issue as described below.

[*] http://patchwork.ozlabs.org/patch/692171

> So, I am thinking, if host blocking on host end early, and *2*, *3* is
> parallel with *1*, the way used in v9 to blocking guest access, may not
> work.
>
Cao jin Dec. 12, 2016, 1:49 p.m. UTC | #28
Hi,
I have 2 solutions(high level design) came to me, please see if they are
acceptable, or which one is acceptable. Also have some questions.

1. block guest access during host recovery

   add new field error_recovering in struct vfio_pci_device to
   indicate host recovery status. aer driver in host will still do
   reset link

   - set error_recovering in vfio-pci driver's error_detected, used to
     block all kinds of user access(config space, mmio)
   - in order to solve concurrent issue of device resetting & user
     access, check device state[*] in vfio-pci driver's resume, see if
     device reset is done, if it is, then clear"error_recovering", or
     else new a timer, check device state periodically until device
     reset is done. (what if device reset don't end for a long time?)
   - In qemu, translate guest link reset to host link reset.
     A question here: we already have link reset in host, is a second
     link reset necessary? why?
 
   [*] how to check device state: reading certain config space
       register, check return value is valid or not(All F's)

2. skip link reset in aer driver of host kernel, for vfio-pci.
   Let user decide how to do serious recovery

   add new field "user_driver" in struct pci_dev, used to skip link
   reset for vfio-pci; add new field "link_reset" in struct
   vfio_pci_device to indicate link has been reset or not during
   recovery

   - set user_driver in vfio_pci_probe(), to skip link reset for
     vfio-pci in host.
   - (use a flag)block user access(config, mmio) during host recovery
     (not sure if this step is necessary)
   - In qemu, translate guest link reset to host link reset.
   - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
     is executed
   - In vfio-pci driver's resume, new a timer, check "link_reset" field
     periodically, if it is set in reasonable time, then clear it and
     delete timer, or else, vfio-pci driver will does the link reset!


A quick question:
I don't know how devices is divided into iommu groups, is it possible
for functions in a multi-function device to be split into different groups?
Alex Williamson Dec. 12, 2016, 7:12 p.m. UTC | #29
On Mon, 12 Dec 2016 21:49:01 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> Hi,
> I have 2 solutions(high level design) came to me, please see if they are
> acceptable, or which one is acceptable. Also have some questions.
> 
> 1. block guest access during host recovery
> 
>    add new field error_recovering in struct vfio_pci_device to
>    indicate host recovery status. aer driver in host will still do
>    reset link
> 
>    - set error_recovering in vfio-pci driver's error_detected, used to
>      block all kinds of user access(config space, mmio)
>    - in order to solve concurrent issue of device resetting & user
>      access, check device state[*] in vfio-pci driver's resume, see if
>      device reset is done, if it is, then clear"error_recovering", or
>      else new a timer, check device state periodically until device
>      reset is done. (what if device reset don't end for a long time?)
>    - In qemu, translate guest link reset to host link reset.
>      A question here: we already have link reset in host, is a second
>      link reset necessary? why?
>  
>    [*] how to check device state: reading certain config space
>        register, check return value is valid or not(All F's)

Isn't this exactly the path we were on previously?  There might be an
optimization that we could skip back-to-back resets, but how can you
necessarily infer that the resets are for the same thing?  If the user
accesses the device between resets, can you still guarantee the guest
directed reset is unnecessary?  If time passes between resets, do you
know they're for the same event?  How much time can pass between the
host and guest reset to know they're for the same event?  In the
process of error handling, which is more important, speed or
correctness?
 
> 2. skip link reset in aer driver of host kernel, for vfio-pci.
>    Let user decide how to do serious recovery
> 
>    add new field "user_driver" in struct pci_dev, used to skip link
>    reset for vfio-pci; add new field "link_reset" in struct
>    vfio_pci_device to indicate link has been reset or not during
>    recovery
> 
>    - set user_driver in vfio_pci_probe(), to skip link reset for
>      vfio-pci in host.
>    - (use a flag)block user access(config, mmio) during host recovery
>      (not sure if this step is necessary)
>    - In qemu, translate guest link reset to host link reset.
>    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
>      is executed
>    - In vfio-pci driver's resume, new a timer, check "link_reset" field
>      periodically, if it is set in reasonable time, then clear it and
>      delete timer, or else, vfio-pci driver will does the link reset!

What happens in the case of a multifunction device where each function
is part of a separate IOMMU group and one function is hot-removed from
the user?  We can't do a link reset on that function since the other
function is still in use.  We have no choice but release a device in an
unknown state back to the host.  As previously discussed, we don't
expect that any sort of function-level FLR will necessarily reset the
device to the same state.  I also don't really like vfio-pci taking
over error handling capabilities from the PCI-core.  That's redundant
code and extra maintenance overhead.
 
> A quick question:
> I don't know how devices is divided into iommu groups, is it possible
> for functions in a multi-function device to be split into different groups?

Yes, if a multifunction device supports ACS or if we have quirks to
expose that the functions do not perform internal peer-to-peer, then
they may be in separate IOMMU groups, depending on the rest of the PCI
topology.  See:

http://vfio.blogspot.com/2014/08/iommu-groups-inside-and-out.html

Thanks,
Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 12, 2016, 10:29 p.m. UTC | #30
On Mon, Dec 12, 2016 at 12:12:16PM -0700, Alex Williamson wrote:
> On Mon, 12 Dec 2016 21:49:01 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
> > Hi,
> > I have 2 solutions(high level design) came to me, please see if they are
> > acceptable, or which one is acceptable. Also have some questions.
> > 
> > 1. block guest access during host recovery
> > 
> >    add new field error_recovering in struct vfio_pci_device to
> >    indicate host recovery status. aer driver in host will still do
> >    reset link
> > 
> >    - set error_recovering in vfio-pci driver's error_detected, used to
> >      block all kinds of user access(config space, mmio)
> >    - in order to solve concurrent issue of device resetting & user
> >      access, check device state[*] in vfio-pci driver's resume, see if
> >      device reset is done, if it is, then clear"error_recovering", or
> >      else new a timer, check device state periodically until device
> >      reset is done. (what if device reset don't end for a long time?)
> >    - In qemu, translate guest link reset to host link reset.
> >      A question here: we already have link reset in host, is a second
> >      link reset necessary? why?
> >  
> >    [*] how to check device state: reading certain config space
> >        register, check return value is valid or not(All F's)
> 
> Isn't this exactly the path we were on previously?  There might be an
> optimization that we could skip back-to-back resets, but how can you
> necessarily infer that the resets are for the same thing?  If the user
> accesses the device between resets, can you still guarantee the guest
> directed reset is unnecessary?  If time passes between resets, do you
> know they're for the same event?  How much time can pass between the
> host and guest reset to know they're for the same event?  In the
> process of error handling, which is more important, speed or
> correctness?
>  
> > 2. skip link reset in aer driver of host kernel, for vfio-pci.
> >    Let user decide how to do serious recovery
> > 
> >    add new field "user_driver" in struct pci_dev, used to skip link
> >    reset for vfio-pci; add new field "link_reset" in struct
> >    vfio_pci_device to indicate link has been reset or not during
> >    recovery
> > 
> >    - set user_driver in vfio_pci_probe(), to skip link reset for
> >      vfio-pci in host.
> >    - (use a flag)block user access(config, mmio) during host recovery
> >      (not sure if this step is necessary)
> >    - In qemu, translate guest link reset to host link reset.
> >    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
> >      is executed
> >    - In vfio-pci driver's resume, new a timer, check "link_reset" field
> >      periodically, if it is set in reasonable time, then clear it and
> >      delete timer, or else, vfio-pci driver will does the link reset!
> 
> What happens in the case of a multifunction device where each function
> is part of a separate IOMMU group and one function is hot-removed from
> the user?

So just don't do it then. Topology must match between host and guest,
except maybe for the case of devices with host driver  (e.g. PF)
which we might be able to synchronize against.

>  We can't do a link reset on that function since the other
> function is still in use.  We have no choice but release a device in an
> unknown state back to the host.  As previously discussed, we don't
> expect that any sort of function-level FLR will necessarily reset the
> device to the same state.  I also don't really like vfio-pci taking
> over error handling capabilities from the PCI-core.  That's redundant
> code and extra maintenance overhead.
>  
> > A quick question:
> > I don't know how devices is divided into iommu groups, is it possible
> > for functions in a multi-function device to be split into different groups?
> 
> Yes, if a multifunction device supports ACS or if we have quirks to
> expose that the functions do not perform internal peer-to-peer, then
> they may be in separate IOMMU groups, depending on the rest of the PCI
> topology.  See:
> 
> http://vfio.blogspot.com/2014/08/iommu-groups-inside-and-out.html
> 
> Thanks,
> Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Dec. 12, 2016, 10:43 p.m. UTC | #31
On Tue, 13 Dec 2016 00:29:42 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Dec 12, 2016 at 12:12:16PM -0700, Alex Williamson wrote:
> > On Mon, 12 Dec 2016 21:49:01 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> > > Hi,
> > > I have 2 solutions(high level design) came to me, please see if they are
> > > acceptable, or which one is acceptable. Also have some questions.
> > > 
> > > 1. block guest access during host recovery
> > > 
> > >    add new field error_recovering in struct vfio_pci_device to
> > >    indicate host recovery status. aer driver in host will still do
> > >    reset link
> > > 
> > >    - set error_recovering in vfio-pci driver's error_detected, used to
> > >      block all kinds of user access(config space, mmio)
> > >    - in order to solve concurrent issue of device resetting & user
> > >      access, check device state[*] in vfio-pci driver's resume, see if
> > >      device reset is done, if it is, then clear"error_recovering", or
> > >      else new a timer, check device state periodically until device
> > >      reset is done. (what if device reset don't end for a long time?)
> > >    - In qemu, translate guest link reset to host link reset.
> > >      A question here: we already have link reset in host, is a second
> > >      link reset necessary? why?
> > >  
> > >    [*] how to check device state: reading certain config space
> > >        register, check return value is valid or not(All F's)  
> > 
> > Isn't this exactly the path we were on previously?  There might be an
> > optimization that we could skip back-to-back resets, but how can you
> > necessarily infer that the resets are for the same thing?  If the user
> > accesses the device between resets, can you still guarantee the guest
> > directed reset is unnecessary?  If time passes between resets, do you
> > know they're for the same event?  How much time can pass between the
> > host and guest reset to know they're for the same event?  In the
> > process of error handling, which is more important, speed or
> > correctness?
> >    
> > > 2. skip link reset in aer driver of host kernel, for vfio-pci.
> > >    Let user decide how to do serious recovery
> > > 
> > >    add new field "user_driver" in struct pci_dev, used to skip link
> > >    reset for vfio-pci; add new field "link_reset" in struct
> > >    vfio_pci_device to indicate link has been reset or not during
> > >    recovery
> > > 
> > >    - set user_driver in vfio_pci_probe(), to skip link reset for
> > >      vfio-pci in host.
> > >    - (use a flag)block user access(config, mmio) during host recovery
> > >      (not sure if this step is necessary)
> > >    - In qemu, translate guest link reset to host link reset.
> > >    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
> > >      is executed
> > >    - In vfio-pci driver's resume, new a timer, check "link_reset" field
> > >      periodically, if it is set in reasonable time, then clear it and
> > >      delete timer, or else, vfio-pci driver will does the link reset!  
> > 
> > What happens in the case of a multifunction device where each function
> > is part of a separate IOMMU group and one function is hot-removed from
> > the user?  
> 
> So just don't do it then. Topology must match between host and guest,
> except maybe for the case of devices with host driver  (e.g. PF)
> which we might be able to synchronize against.

We're talking about host kernel level handling here.  The host kernel
cannot defer the link reset to the user under the assumption that the
user is handling the devices in a very specific way.  The moment we do
that, we've lost.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 13, 2016, 3:15 a.m. UTC | #32
On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:
> > So just don't do it then. Topology must match between host and guest,
> > except maybe for the case of devices with host driver  (e.g. PF)
> > which we might be able to synchronize against.
> 
> We're talking about host kernel level handling here.  The host kernel
> cannot defer the link reset to the user under the assumption that the
> user is handling the devices in a very specific way.  The moment we do
> that, we've lost.

The way is same as baremetal though, so why not?

And if user doesn't do what's expected, we can
do the full link reset on close.
Alex Williamson Dec. 13, 2016, 3:39 a.m. UTC | #33
On Tue, 13 Dec 2016 05:15:13 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:
> > > So just don't do it then. Topology must match between host and guest,
> > > except maybe for the case of devices with host driver  (e.g. PF)
> > > which we might be able to synchronize against.  
> > 
> > We're talking about host kernel level handling here.  The host kernel
> > cannot defer the link reset to the user under the assumption that the
> > user is handling the devices in a very specific way.  The moment we do
> > that, we've lost.  
> 
> The way is same as baremetal though, so why not?

How do we know this?  What if the user is dpdk?  The kernel is
responsible for maintaining the integrity of the system and devices,
not the user.

> And if user doesn't do what's expected, we can
> do the full link reset on close.

That's exactly my point, if we're talking about multiple devices,
there's no guarantee that the close() for each is simultaneous.  If one
function is released before the other we cannot do a bus reset.  If
that device is then opened by another user before its sibling is
released, then we once again cannot perform a link reset.  I don't
think it would be reasonable to mark the released device quarantined
until the sibling is released, that would be a terrible user experience.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 13, 2016, 4:12 p.m. UTC | #34
On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:
> On Tue, 13 Dec 2016 05:15:13 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:
> > > > So just don't do it then. Topology must match between host and guest,
> > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > which we might be able to synchronize against.  
> > > 
> > > We're talking about host kernel level handling here.  The host kernel
> > > cannot defer the link reset to the user under the assumption that the
> > > user is handling the devices in a very specific way.  The moment we do
> > > that, we've lost.  
> > 
> > The way is same as baremetal though, so why not?
> 
> How do we know this?  What if the user is dpdk?  The kernel is
> responsible for maintaining the integrity of the system and devices,
> not the user.
> 
> > And if user doesn't do what's expected, we can
> > do the full link reset on close.
> 
> That's exactly my point, if we're talking about multiple devices,
> there's no guarantee that the close() for each is simultaneous.  If one
> function is released before the other we cannot do a bus reset.  If
> that device is then opened by another user before its sibling is
> released, then we once again cannot perform a link reset.  I don't
> think it would be reasonable to mark the released device quarantined
> until the sibling is released, that would be a terrible user experience.

Not sure why you find it so terrible, and I don't think there's another way.
Alex Williamson Dec. 13, 2016, 4:27 p.m. UTC | #35
On Tue, 13 Dec 2016 18:12:34 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:
> > On Tue, 13 Dec 2016 05:15:13 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:  
> > > > > So just don't do it then. Topology must match between host and guest,
> > > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > > which we might be able to synchronize against.    
> > > > 
> > > > We're talking about host kernel level handling here.  The host kernel
> > > > cannot defer the link reset to the user under the assumption that the
> > > > user is handling the devices in a very specific way.  The moment we do
> > > > that, we've lost.    
> > > 
> > > The way is same as baremetal though, so why not?  
> > 
> > How do we know this?  What if the user is dpdk?  The kernel is
> > responsible for maintaining the integrity of the system and devices,
> > not the user.
> >   
> > > And if user doesn't do what's expected, we can
> > > do the full link reset on close.  
> > 
> > That's exactly my point, if we're talking about multiple devices,
> > there's no guarantee that the close() for each is simultaneous.  If one
> > function is released before the other we cannot do a bus reset.  If
> > that device is then opened by another user before its sibling is
> > released, then we once again cannot perform a link reset.  I don't
> > think it would be reasonable to mark the released device quarantined
> > until the sibling is released, that would be a terrible user experience.  
> 
> Not sure why you find it so terrible, and I don't think there's another way.

If we can't do it without regressing the support we currently have,
let's not do it at all.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 14, 2016, 1:58 a.m. UTC | #36
On Tue, Dec 13, 2016 at 09:27:59AM -0700, Alex Williamson wrote:
> On Tue, 13 Dec 2016 18:12:34 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:
> > > On Tue, 13 Dec 2016 05:15:13 +0200
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:  
> > > > > > So just don't do it then. Topology must match between host and guest,
> > > > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > > > which we might be able to synchronize against.    
> > > > > 
> > > > > We're talking about host kernel level handling here.  The host kernel
> > > > > cannot defer the link reset to the user under the assumption that the
> > > > > user is handling the devices in a very specific way.  The moment we do
> > > > > that, we've lost.    
> > > > 
> > > > The way is same as baremetal though, so why not?  
> > > 
> > > How do we know this?  What if the user is dpdk?  The kernel is
> > > responsible for maintaining the integrity of the system and devices,
> > > not the user.
> > >   
> > > > And if user doesn't do what's expected, we can
> > > > do the full link reset on close.  
> > > 
> > > That's exactly my point, if we're talking about multiple devices,
> > > there's no guarantee that the close() for each is simultaneous.  If one
> > > function is released before the other we cannot do a bus reset.  If
> > > that device is then opened by another user before its sibling is
> > > released, then we once again cannot perform a link reset.  I don't
> > > think it would be reasonable to mark the released device quarantined
> > > until the sibling is released, that would be a terrible user experience.  
> > 
> > Not sure why you find it so terrible, and I don't think there's another way.
> 
> If we can't do it without regressing the support we currently have,
> let's not do it at all.

Why would we regress?  As long as there are no unrecoverable errors,
there's no need to change behaviour at all.

Alex, do you have a picture of how error recovery can work in your mind?
Your answers seem to imply you do, and these patches don't implement
this correctly.  I'm not sure about others, but I for one am unable to
piece it together from the comments you provide.  If yes, could you
maybe do a short writeup of an architecture you would be comfortable
with?

Thanks,
Alex Williamson Dec. 14, 2016, 3 a.m. UTC | #37
On Wed, 14 Dec 2016 03:58:17 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Dec 13, 2016 at 09:27:59AM -0700, Alex Williamson wrote:
> > On Tue, 13 Dec 2016 18:12:34 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:  
> > > > On Tue, 13 Dec 2016 05:15:13 +0200
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > >     
> > > > > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:    
> > > > > > > So just don't do it then. Topology must match between host and guest,
> > > > > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > > > > which we might be able to synchronize against.      
> > > > > > 
> > > > > > We're talking about host kernel level handling here.  The host kernel
> > > > > > cannot defer the link reset to the user under the assumption that the
> > > > > > user is handling the devices in a very specific way.  The moment we do
> > > > > > that, we've lost.      
> > > > > 
> > > > > The way is same as baremetal though, so why not?    
> > > > 
> > > > How do we know this?  What if the user is dpdk?  The kernel is
> > > > responsible for maintaining the integrity of the system and devices,
> > > > not the user.
> > > >     
> > > > > And if user doesn't do what's expected, we can
> > > > > do the full link reset on close.    
> > > > 
> > > > That's exactly my point, if we're talking about multiple devices,
> > > > there's no guarantee that the close() for each is simultaneous.  If one
> > > > function is released before the other we cannot do a bus reset.  If
> > > > that device is then opened by another user before its sibling is
> > > > released, then we once again cannot perform a link reset.  I don't
> > > > think it would be reasonable to mark the released device quarantined
> > > > until the sibling is released, that would be a terrible user experience.    
> > > 
> > > Not sure why you find it so terrible, and I don't think there's another way.  
> > 
> > If we can't do it without regressing the support we currently have,
> > let's not do it at all.  
> 
> Why would we regress?  As long as there are no unrecoverable errors,
> there's no need to change behaviour at all.

Currently if a fatal error occurs we allow the host to reset the
device, so to the best of our knowledge, the device is always reset.
The proposal here allows gaps where we assume a particular guest
behavior that allows the device to be returned to the host or opened by
other users without that reset.  Any plan that relies on a specific
user behavior is fundamentally wrong imo.

> Alex, do you have a picture of how error recovery can work in your mind?
> Your answers seem to imply you do, and these patches don't implement
> this correctly.  I'm not sure about others, but I for one am unable to
> piece it together from the comments you provide.  If yes, could you
> maybe do a short writeup of an architecture you would be comfortable
> with?

Clearly I have issues with this skip-the-host-reset plan, I don't think
it works.  We cannot assume the user will do error recovery.  As I
stated previously we should enable the user to do error recovery
without depending on the user to do error recovery.  I'm a bit lost why
we're focusing on this approach when the v9 approach of blocking user
access to the device during the host recovery seemed like a better
solution.  I don't think I've said anything bad about that approach,
but it does need further testing and debugging.  Nobody seems to be
interested in debugging why it wasn't quite working to understand
whether that was an implementation issue or a design issue. That's
currently the leading approach from my perspective.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 14, 2016, 10:24 a.m. UTC | #38
Sorry for late.
after reading all your comments, I think I will try the solution 1.

On 12/13/2016 03:12 AM, Alex Williamson wrote:
> On Mon, 12 Dec 2016 21:49:01 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> Hi,
>> I have 2 solutions(high level design) came to me, please see if they are
>> acceptable, or which one is acceptable. Also have some questions.
>>
>> 1. block guest access during host recovery
>>
>>    add new field error_recovering in struct vfio_pci_device to
>>    indicate host recovery status. aer driver in host will still do
>>    reset link
>>
>>    - set error_recovering in vfio-pci driver's error_detected, used to
>>      block all kinds of user access(config space, mmio)
>>    - in order to solve concurrent issue of device resetting & user
>>      access, check device state[*] in vfio-pci driver's resume, see if
>>      device reset is done, if it is, then clear"error_recovering", or
>>      else new a timer, check device state periodically until device
>>      reset is done. (what if device reset don't end for a long time?)
>>    - In qemu, translate guest link reset to host link reset.
>>      A question here: we already have link reset in host, is a second
>>      link reset necessary? why?
>>  
>>    [*] how to check device state: reading certain config space
>>        register, check return value is valid or not(All F's)
> 
> Isn't this exactly the path we were on previously?

Yes, it is basically the previous path, plus the optimization.

> There might be an
> optimization that we could skip back-to-back resets, but how can you
> necessarily infer that the resets are for the same thing? If the user
> accesses the device between resets, can you still guarantee the guest
> directed reset is unnecessary?  If time passes between resets, do you
> know they're for the same event?  How much time can pass between the
> host and guest reset to know they're for the same event?  In the
> process of error handling, which is more important, speed or
> correctness?
>  

I think vfio driver itself won't know what each reset comes for, and I
don't quite understand why should vfio care this question, is this a new
question in the design?

But I think it make sense that the user access during 2 resets maybe a
trouble for guest recovery, misbehaved user could be out of our
imagination.  Correctness is more important.

If I understand you right, let me make a summary: host recovery just
does link reset, which is incomplete, so we'd better do a complete guest
recovery for correctness.

>> 2. skip link reset in aer driver of host kernel, for vfio-pci.
>>    Let user decide how to do serious recovery
>>
>>    add new field "user_driver" in struct pci_dev, used to skip link
>>    reset for vfio-pci; add new field "link_reset" in struct
>>    vfio_pci_device to indicate link has been reset or not during
>>    recovery
>>
>>    - set user_driver in vfio_pci_probe(), to skip link reset for
>>      vfio-pci in host.
>>    - (use a flag)block user access(config, mmio) during host recovery
>>      (not sure if this step is necessary)
>>    - In qemu, translate guest link reset to host link reset.
>>    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
>>      is executed
>>    - In vfio-pci driver's resume, new a timer, check "link_reset" field
>>      periodically, if it is set in reasonable time, then clear it and
>>      delete timer, or else, vfio-pci driver will does the link reset!
> 
> What happens in the case of a multifunction device where each function
> is part of a separate IOMMU group and one function is hot-removed from
> the user? We can't do a link reset on that function since the other
> function is still in use.  We have no choice but release a device in an
> unknown state back to the host.

hot-remove from user, do you mean, for example, all functions assigned
to VM, then suddenly a person does something like following

$ echo 0000:06:00.0 > /sys/bus/pci/drivers/vfio-pci/unbind

$ echo 0000:06:00.0 > /sys/bus/pci/drivers/igb/bind

to return device to host driver, or don't bind it to host driver, let it
in driver-less state???

>  As previously discussed, we don't
> expect that any sort of function-level FLR will necessarily reset the
> device to the same state.  I also don't really like vfio-pci taking
> over error handling capabilities from the PCI-core.  That's redundant
> code and extra maintenance overhead.
>  

I understand the concern, so I suppose solution 1 is preferred.
Alex Williamson Dec. 14, 2016, 10:16 p.m. UTC | #39
On Wed, 14 Dec 2016 18:24:23 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> Sorry for late.
> after reading all your comments, I think I will try the solution 1.
> 
> On 12/13/2016 03:12 AM, Alex Williamson wrote:
> > On Mon, 12 Dec 2016 21:49:01 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> >> Hi,
> >> I have 2 solutions(high level design) came to me, please see if they are
> >> acceptable, or which one is acceptable. Also have some questions.
> >>
> >> 1. block guest access during host recovery
> >>
> >>    add new field error_recovering in struct vfio_pci_device to
> >>    indicate host recovery status. aer driver in host will still do
> >>    reset link
> >>
> >>    - set error_recovering in vfio-pci driver's error_detected, used to
> >>      block all kinds of user access(config space, mmio)
> >>    - in order to solve concurrent issue of device resetting & user
> >>      access, check device state[*] in vfio-pci driver's resume, see if
> >>      device reset is done, if it is, then clear"error_recovering", or
> >>      else new a timer, check device state periodically until device
> >>      reset is done. (what if device reset don't end for a long time?)
> >>    - In qemu, translate guest link reset to host link reset.
> >>      A question here: we already have link reset in host, is a second
> >>      link reset necessary? why?
> >>  
> >>    [*] how to check device state: reading certain config space
> >>        register, check return value is valid or not(All F's)  
> > 
> > Isn't this exactly the path we were on previously?  
> 
> Yes, it is basically the previous path, plus the optimization.
> 
> > There might be an
> > optimization that we could skip back-to-back resets, but how can you
> > necessarily infer that the resets are for the same thing? If the user
> > accesses the device between resets, can you still guarantee the guest
> > directed reset is unnecessary?  If time passes between resets, do you
> > know they're for the same event?  How much time can pass between the
> > host and guest reset to know they're for the same event?  In the
> > process of error handling, which is more important, speed or
> > correctness?
> >    
> 
> I think vfio driver itself won't know what each reset comes for, and I
> don't quite understand why should vfio care this question, is this a new
> question in the design?

You're suggesting an optimization to eliminate one of the resets,
and as we've discussed, I don't see removing the host induced reset
as a viable option.  That means you want to eliminate the guest
directed reset.  There are potentially three levels to do that, the
vfio-pci driver in the host kernel, somewhere in QEMU, or eliminate it
within the guest.  My comments were directed to the first option, the
host kernel level cannot correlate user directed resets as duplicates
of host directed resets.  
 
> But I think it make sense that the user access during 2 resets maybe a
> trouble for guest recovery, misbehaved user could be out of our
> imagination.  Correctness is more important.
> 
> If I understand you right, let me make a summary: host recovery just
> does link reset, which is incomplete, so we'd better do a complete guest
> recovery for correctness.

We don't know whether the host link reset is incomplete, but we can't do
a link reset transparently to the device, the device is no longer in the
same state after the reset.  The device specific driver, which exists
in userspace needs to be involved in device recovery.  Therefore
regardless of how QEMU handles the error, the driver within the guest
needs to be notified and perform recovery.  Since the device is PCI and
we're on x86 and nobody wants to introduce paravirtual error recovery,
we must use AER.  Part of AER recovery includes the possibility of
performing a link reset.  So it seems this eliminates avoiding the link
reset within the guest.

That leaves QEMU.  Here we need to decide whether a guest triggered
link reset induces a host link reset.  The current working theory is
that yes, this must be the case.  If there is ever a case where a
driver within the guest could trigger a link reset for the purposes
of error recovery when the host has not, I think this must be the
case.  Therefore, at least some guest induced link resets must become
host link resets.  Currently we assume all guest induced link resets
become host link resets.  Minimally to avoid that, QEMU would need to
know (not assume) whether the host performed a link reset.  Even with
that, QEMU would need to be able to correlate that a link reset from
the guest is a duplicate of a link reset that was already performed by
the host.  That implies that QEMU needs to deduce the intention of
the guest.  That seems like a complicated task for a patch series that
is already complicated enough, especially for a feature of questionable
value given the configuration restrictions (imo).

I would much rather focus on getting it right and making it as simple
as we can, even if that means links get reset one too many times on
error.

> >> 2. skip link reset in aer driver of host kernel, for vfio-pci.
> >>    Let user decide how to do serious recovery
> >>
> >>    add new field "user_driver" in struct pci_dev, used to skip link
> >>    reset for vfio-pci; add new field "link_reset" in struct
> >>    vfio_pci_device to indicate link has been reset or not during
> >>    recovery
> >>
> >>    - set user_driver in vfio_pci_probe(), to skip link reset for
> >>      vfio-pci in host.
> >>    - (use a flag)block user access(config, mmio) during host recovery
> >>      (not sure if this step is necessary)
> >>    - In qemu, translate guest link reset to host link reset.
> >>    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
> >>      is executed
> >>    - In vfio-pci driver's resume, new a timer, check "link_reset" field
> >>      periodically, if it is set in reasonable time, then clear it and
> >>      delete timer, or else, vfio-pci driver will does the link reset!  
> > 
> > What happens in the case of a multifunction device where each function
> > is part of a separate IOMMU group and one function is hot-removed from
> > the user? We can't do a link reset on that function since the other
> > function is still in use.  We have no choice but release a device in an
> > unknown state back to the host.  
> 
> hot-remove from user, do you mean, for example, all functions assigned
> to VM, then suddenly a person does something like following
> 
> $ echo 0000:06:00.0 > /sys/bus/pci/drivers/vfio-pci/unbind
> 
> $ echo 0000:06:00.0 > /sys/bus/pci/drivers/igb/bind
> 
> to return device to host driver, or don't bind it to host driver, let it
> in driver-less state???

Yes, the host kernel has no visiblity to how a user is making use of
devices.  To support AER we require a similar topology between host and
guest such that a guest link reset translates to a host reset.  That
requirement is imposed by userspace, ie. QEMU.  The host kernel cannot
presume that this is the case.  Therefore we could have a
multi-function device where each function is assigned to the same or
different users in any configuration.  If a fault occurs and we defer
to the user to perform the link reset, we have absolutely no guarantee
that it will ever occur.  If the functions are assigned to different
users, then each user individually doesn't have the capability to
perform a link reset.  If the devices happen to be assigned to a single
user when the error occurs, we cannot assume the user has an AER
compatible configuration, the devices could be exposed as separate
single function devices, any one of which might be individually removed
from the user and made use of by the host, such as your sysfs example
above.  The host cannot perform a link reset in this case either
as the sibling devices are still in use by the guest.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 14, 2016, 10:20 p.m. UTC | #40
On Tue, Dec 13, 2016 at 08:00:22PM -0700, Alex Williamson wrote:
> On Wed, 14 Dec 2016 03:58:17 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Tue, Dec 13, 2016 at 09:27:59AM -0700, Alex Williamson wrote:
> > > On Tue, 13 Dec 2016 18:12:34 +0200
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:  
> > > > > On Tue, 13 Dec 2016 05:15:13 +0200
> > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > >     
> > > > > > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:    
> > > > > > > > So just don't do it then. Topology must match between host and guest,
> > > > > > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > > > > > which we might be able to synchronize against.      
> > > > > > > 
> > > > > > > We're talking about host kernel level handling here.  The host kernel
> > > > > > > cannot defer the link reset to the user under the assumption that the
> > > > > > > user is handling the devices in a very specific way.  The moment we do
> > > > > > > that, we've lost.      
> > > > > > 
> > > > > > The way is same as baremetal though, so why not?    
> > > > > 
> > > > > How do we know this?  What if the user is dpdk?  The kernel is
> > > > > responsible for maintaining the integrity of the system and devices,
> > > > > not the user.
> > > > >     
> > > > > > And if user doesn't do what's expected, we can
> > > > > > do the full link reset on close.    
> > > > > 
> > > > > That's exactly my point, if we're talking about multiple devices,
> > > > > there's no guarantee that the close() for each is simultaneous.  If one
> > > > > function is released before the other we cannot do a bus reset.  If
> > > > > that device is then opened by another user before its sibling is
> > > > > released, then we once again cannot perform a link reset.  I don't
> > > > > think it would be reasonable to mark the released device quarantined
> > > > > until the sibling is released, that would be a terrible user experience.    
> > > > 
> > > > Not sure why you find it so terrible, and I don't think there's another way.  
> > > 
> > > If we can't do it without regressing the support we currently have,
> > > let's not do it at all.  
> > 
> > Why would we regress?  As long as there are no unrecoverable errors,
> > there's no need to change behaviour at all.
> 
> Currently if a fatal error occurs we allow the host to reset the
> device, so to the best of our knowledge, the device is always reset.
> The proposal here allows gaps where we assume a particular guest
> behavior that allows the device to be returned to the host or opened by
> other users without that reset.  Any plan that relies on a specific
> user behavior is fundamentally wrong imo.
> 
> > Alex, do you have a picture of how error recovery can work in your mind?
> > Your answers seem to imply you do, and these patches don't implement
> > this correctly.  I'm not sure about others, but I for one am unable to
> > piece it together from the comments you provide.  If yes, could you
> > maybe do a short writeup of an architecture you would be comfortable
> > with?
> 
> Clearly I have issues with this skip-the-host-reset plan, I don't think
> it works.  We cannot assume the user will do error recovery.

Absolutely but we can defer recovery until device close.
Possibly with userspace invoking an ioctl requesting this,
to make sure we don't break any legacy setups.


>  As I
> stated previously we should enable the user to do error recovery
> without depending on the user to do error recovery.  I'm a bit lost why
> we're focusing on this approach when the v9 approach of blocking user
> access to the device during the host recovery seemed like a better
> solution.  I don't think I've said anything bad about that approach,
> but it does need further testing and debugging.  Nobody seems to be
> interested in debugging why it wasn't quite working to understand
> whether that was an implementation issue or a design issue. That's
> currently the leading approach from my perspective.  Thanks,
> 
> Alex

Hmm this doesn't really write up the architecture. It would be
really helpful to have that writeup.

Do I guess right that what v9 tried to do is:
- block future userspace accesses
- do full reset
- re-enable userspace accesses

?

The obvious issue with that is that device loses its state apparently at
random, and this is not something that happens on baremetal, ever.

Thus I don't see how this can work without some PV code in guest.
Michael S. Tsirkin Dec. 14, 2016, 10:25 p.m. UTC | #41
On Wed, Dec 14, 2016 at 03:16:37PM -0700, Alex Williamson wrote:
> On Wed, 14 Dec 2016 18:24:23 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
> > Sorry for late.
> > after reading all your comments, I think I will try the solution 1.
> > 
> > On 12/13/2016 03:12 AM, Alex Williamson wrote:
> > > On Mon, 12 Dec 2016 21:49:01 +0800
> > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > >   
> > >> Hi,
> > >> I have 2 solutions(high level design) came to me, please see if they are
> > >> acceptable, or which one is acceptable. Also have some questions.
> > >>
> > >> 1. block guest access during host recovery
> > >>
> > >>    add new field error_recovering in struct vfio_pci_device to
> > >>    indicate host recovery status. aer driver in host will still do
> > >>    reset link
> > >>
> > >>    - set error_recovering in vfio-pci driver's error_detected, used to
> > >>      block all kinds of user access(config space, mmio)
> > >>    - in order to solve concurrent issue of device resetting & user
> > >>      access, check device state[*] in vfio-pci driver's resume, see if
> > >>      device reset is done, if it is, then clear"error_recovering", or
> > >>      else new a timer, check device state periodically until device
> > >>      reset is done. (what if device reset don't end for a long time?)
> > >>    - In qemu, translate guest link reset to host link reset.
> > >>      A question here: we already have link reset in host, is a second
> > >>      link reset necessary? why?
> > >>  
> > >>    [*] how to check device state: reading certain config space
> > >>        register, check return value is valid or not(All F's)  
> > > 
> > > Isn't this exactly the path we were on previously?  
> > 
> > Yes, it is basically the previous path, plus the optimization.
> > 
> > > There might be an
> > > optimization that we could skip back-to-back resets, but how can you
> > > necessarily infer that the resets are for the same thing? If the user
> > > accesses the device between resets, can you still guarantee the guest
> > > directed reset is unnecessary?  If time passes between resets, do you
> > > know they're for the same event?  How much time can pass between the
> > > host and guest reset to know they're for the same event?  In the
> > > process of error handling, which is more important, speed or
> > > correctness?
> > >    
> > 
> > I think vfio driver itself won't know what each reset comes for, and I
> > don't quite understand why should vfio care this question, is this a new
> > question in the design?
> 
> You're suggesting an optimization to eliminate one of the resets,
> and as we've discussed, I don't see removing the host induced reset
> as a viable option.  That means you want to eliminate the guest
> directed reset.  There are potentially three levels to do that, the
> vfio-pci driver in the host kernel, somewhere in QEMU, or eliminate it
> within the guest.  My comments were directed to the first option, the
> host kernel level cannot correlate user directed resets as duplicates
> of host directed resets.  
>  
> > But I think it make sense that the user access during 2 resets maybe a
> > trouble for guest recovery, misbehaved user could be out of our
> > imagination.  Correctness is more important.
> > 
> > If I understand you right, let me make a summary: host recovery just
> > does link reset, which is incomplete, so we'd better do a complete guest
> > recovery for correctness.
> 
> We don't know whether the host link reset is incomplete, but we can't do
> a link reset transparently to the device, the device is no longer in the
> same state after the reset.  The device specific driver, which exists
> in userspace needs to be involved in device recovery.  Therefore
> regardless of how QEMU handles the error, the driver within the guest
> needs to be notified and perform recovery.  Since the device is PCI and
> we're on x86 and nobody wants to introduce paravirtual error recovery,
> we must use AER.  Part of AER recovery includes the possibility of
> performing a link reset.  So it seems this eliminates avoiding the link
> reset within the guest.
> 
> That leaves QEMU.  Here we need to decide whether a guest triggered
> link reset induces a host link reset.  The current working theory is
> that yes, this must be the case.  If there is ever a case where a
> driver within the guest could trigger a link reset for the purposes
> of error recovery when the host has not, I think this must be the
> case.  Therefore, at least some guest induced link resets must become
> host link resets.  Currently we assume all guest induced link resets
> become host link resets.  Minimally to avoid that, QEMU would need to
> know (not assume) whether the host performed a link reset.  Even with
> that, QEMU would need to be able to correlate that a link reset from
> the guest is a duplicate of a link reset that was already performed by
> the host.  That implies that QEMU needs to deduce the intention of
> the guest.  That seems like a complicated task for a patch series that
> is already complicated enough, especially for a feature of questionable
> value given the configuration restrictions (imo).
> 
> I would much rather focus on getting it right and making it as simple
> as we can, even if that means links get reset one too many times on
> error.
> 
> > >> 2. skip link reset in aer driver of host kernel, for vfio-pci.
> > >>    Let user decide how to do serious recovery
> > >>
> > >>    add new field "user_driver" in struct pci_dev, used to skip link
> > >>    reset for vfio-pci; add new field "link_reset" in struct
> > >>    vfio_pci_device to indicate link has been reset or not during
> > >>    recovery
> > >>
> > >>    - set user_driver in vfio_pci_probe(), to skip link reset for
> > >>      vfio-pci in host.
> > >>    - (use a flag)block user access(config, mmio) during host recovery
> > >>      (not sure if this step is necessary)
> > >>    - In qemu, translate guest link reset to host link reset.
> > >>    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
> > >>      is executed
> > >>    - In vfio-pci driver's resume, new a timer, check "link_reset" field
> > >>      periodically, if it is set in reasonable time, then clear it and
> > >>      delete timer, or else, vfio-pci driver will does the link reset!  
> > > 
> > > What happens in the case of a multifunction device where each function
> > > is part of a separate IOMMU group and one function is hot-removed from
> > > the user? We can't do a link reset on that function since the other
> > > function is still in use.  We have no choice but release a device in an
> > > unknown state back to the host.  
> > 
> > hot-remove from user, do you mean, for example, all functions assigned
> > to VM, then suddenly a person does something like following
> > 
> > $ echo 0000:06:00.0 > /sys/bus/pci/drivers/vfio-pci/unbind
> > 
> > $ echo 0000:06:00.0 > /sys/bus/pci/drivers/igb/bind
> > 
> > to return device to host driver, or don't bind it to host driver, let it
> > in driver-less state???
> 
> Yes, the host kernel has no visiblity to how a user is making use of
> devices.  To support AER we require a similar topology between host and
> guest such that a guest link reset translates to a host reset.  That
> requirement is imposed by userspace, ie. QEMU.  The host kernel cannot
> presume that this is the case.

So enforce this to enable recovery functionality. Why can't you?

>  Therefore we could have a
> multi-function device where each function is assigned to the same or
> different users in any configuration.  If a fault occurs and we defer
> to the user to perform the link reset, we have absolutely no guarantee
> that it will ever occur.  If the functions are assigned to different
> users, then each user individually doesn't have the capability to
> perform a link reset.  If the devices happen to be assigned to a single
> user when the error occurs, we cannot assume the user has an AER
> compatible configuration, the devices could be exposed as separate
> single function devices, any one of which might be individually removed
> from the user and made use of by the host, such as your sysfs example
> above.  The host cannot perform a link reset in this case either
> as the sibling devices are still in use by the guest.  Thanks,
> 
> Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Dec. 14, 2016, 10:47 p.m. UTC | #42
On Thu, 15 Dec 2016 00:20:20 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Dec 13, 2016 at 08:00:22PM -0700, Alex Williamson wrote:
> > On Wed, 14 Dec 2016 03:58:17 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Tue, Dec 13, 2016 at 09:27:59AM -0700, Alex Williamson wrote:  
> > > > On Tue, 13 Dec 2016 18:12:34 +0200
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > >     
> > > > > On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:    
> > > > > > On Tue, 13 Dec 2016 05:15:13 +0200
> > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > >       
> > > > > > > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:      
> > > > > > > > > So just don't do it then. Topology must match between host and guest,
> > > > > > > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > > > > > > which we might be able to synchronize against.        
> > > > > > > > 
> > > > > > > > We're talking about host kernel level handling here.  The host kernel
> > > > > > > > cannot defer the link reset to the user under the assumption that the
> > > > > > > > user is handling the devices in a very specific way.  The moment we do
> > > > > > > > that, we've lost.        
> > > > > > > 
> > > > > > > The way is same as baremetal though, so why not?      
> > > > > > 
> > > > > > How do we know this?  What if the user is dpdk?  The kernel is
> > > > > > responsible for maintaining the integrity of the system and devices,
> > > > > > not the user.
> > > > > >       
> > > > > > > And if user doesn't do what's expected, we can
> > > > > > > do the full link reset on close.      
> > > > > > 
> > > > > > That's exactly my point, if we're talking about multiple devices,
> > > > > > there's no guarantee that the close() for each is simultaneous.  If one
> > > > > > function is released before the other we cannot do a bus reset.  If
> > > > > > that device is then opened by another user before its sibling is
> > > > > > released, then we once again cannot perform a link reset.  I don't
> > > > > > think it would be reasonable to mark the released device quarantined
> > > > > > until the sibling is released, that would be a terrible user experience.      
> > > > > 
> > > > > Not sure why you find it so terrible, and I don't think there's another way.    
> > > > 
> > > > If we can't do it without regressing the support we currently have,
> > > > let's not do it at all.    
> > > 
> > > Why would we regress?  As long as there are no unrecoverable errors,
> > > there's no need to change behaviour at all.  
> > 
> > Currently if a fatal error occurs we allow the host to reset the
> > device, so to the best of our knowledge, the device is always reset.
> > The proposal here allows gaps where we assume a particular guest
> > behavior that allows the device to be returned to the host or opened by
> > other users without that reset.  Any plan that relies on a specific
> > user behavior is fundamentally wrong imo.
> >   
> > > Alex, do you have a picture of how error recovery can work in your mind?
> > > Your answers seem to imply you do, and these patches don't implement
> > > this correctly.  I'm not sure about others, but I for one am unable to
> > > piece it together from the comments you provide.  If yes, could you
> > > maybe do a short writeup of an architecture you would be comfortable
> > > with?  
> > 
> > Clearly I have issues with this skip-the-host-reset plan, I don't think
> > it works.  We cannot assume the user will do error recovery.  
> 
> Absolutely but we can defer recovery until device close.

No we can't, as I've tried to describe multiple times, if the functions
are part of separate groups then they can be opened and closed
asynchronously from each other and we may not have an opportunity where
they are all closed together to perform a reset.  The only option I can
see for this is to quarantine the device, which as I've stated seems
like a really poor solution.

> Possibly with userspace invoking an ioctl requesting this,
> to make sure we don't break any legacy setups.

And how do we know that user is not malicious?  How do we police
whether they actually perform a reset?  We can only track whether a
reset has occurred, which leads back to the quarantine scenario.

> >  As I
> > stated previously we should enable the user to do error recovery
> > without depending on the user to do error recovery.  I'm a bit lost why
> > we're focusing on this approach when the v9 approach of blocking user
> > access to the device during the host recovery seemed like a better
> > solution.  I don't think I've said anything bad about that approach,
> > but it does need further testing and debugging.  Nobody seems to be
> > interested in debugging why it wasn't quite working to understand
> > whether that was an implementation issue or a design issue. That's
> > currently the leading approach from my perspective.  Thanks,
> > 
> > Alex  
> 
> Hmm this doesn't really write up the architecture. It would be
> really helpful to have that writeup.

Perhaps the patch submitters can create one.
 
> Do I guess right that what v9 tried to do is:
> - block future userspace accesses
> - do full reset
> - re-enable userspace accesses
> 
> ?
> 
> The obvious issue with that is that device loses its state apparently at
> random, and this is not something that happens on baremetal, ever.
> 
> Thus I don't see how this can work without some PV code in guest.

So there is no link error that can occur on hardware where the device
becomes inaccessible?  I'm pretty sure I've experienced cases
otherwise.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Dec. 14, 2016, 10:49 p.m. UTC | #43
On Thu, 15 Dec 2016 00:25:13 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Dec 14, 2016 at 03:16:37PM -0700, Alex Williamson wrote:
> > On Wed, 14 Dec 2016 18:24:23 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> > > Sorry for late.
> > > after reading all your comments, I think I will try the solution 1.
> > > 
> > > On 12/13/2016 03:12 AM, Alex Williamson wrote:  
> > > > On Mon, 12 Dec 2016 21:49:01 +0800
> > > > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> > > >     
> > > >> Hi,
> > > >> I have 2 solutions(high level design) came to me, please see if they are
> > > >> acceptable, or which one is acceptable. Also have some questions.
> > > >>
> > > >> 1. block guest access during host recovery
> > > >>
> > > >>    add new field error_recovering in struct vfio_pci_device to
> > > >>    indicate host recovery status. aer driver in host will still do
> > > >>    reset link
> > > >>
> > > >>    - set error_recovering in vfio-pci driver's error_detected, used to
> > > >>      block all kinds of user access(config space, mmio)
> > > >>    - in order to solve concurrent issue of device resetting & user
> > > >>      access, check device state[*] in vfio-pci driver's resume, see if
> > > >>      device reset is done, if it is, then clear"error_recovering", or
> > > >>      else new a timer, check device state periodically until device
> > > >>      reset is done. (what if device reset don't end for a long time?)
> > > >>    - In qemu, translate guest link reset to host link reset.
> > > >>      A question here: we already have link reset in host, is a second
> > > >>      link reset necessary? why?
> > > >>  
> > > >>    [*] how to check device state: reading certain config space
> > > >>        register, check return value is valid or not(All F's)    
> > > > 
> > > > Isn't this exactly the path we were on previously?    
> > > 
> > > Yes, it is basically the previous path, plus the optimization.
> > >   
> > > > There might be an
> > > > optimization that we could skip back-to-back resets, but how can you
> > > > necessarily infer that the resets are for the same thing? If the user
> > > > accesses the device between resets, can you still guarantee the guest
> > > > directed reset is unnecessary?  If time passes between resets, do you
> > > > know they're for the same event?  How much time can pass between the
> > > > host and guest reset to know they're for the same event?  In the
> > > > process of error handling, which is more important, speed or
> > > > correctness?
> > > >      
> > > 
> > > I think vfio driver itself won't know what each reset comes for, and I
> > > don't quite understand why should vfio care this question, is this a new
> > > question in the design?  
> > 
> > You're suggesting an optimization to eliminate one of the resets,
> > and as we've discussed, I don't see removing the host induced reset
> > as a viable option.  That means you want to eliminate the guest
> > directed reset.  There are potentially three levels to do that, the
> > vfio-pci driver in the host kernel, somewhere in QEMU, or eliminate it
> > within the guest.  My comments were directed to the first option, the
> > host kernel level cannot correlate user directed resets as duplicates
> > of host directed resets.  
> >    
> > > But I think it make sense that the user access during 2 resets maybe a
> > > trouble for guest recovery, misbehaved user could be out of our
> > > imagination.  Correctness is more important.
> > > 
> > > If I understand you right, let me make a summary: host recovery just
> > > does link reset, which is incomplete, so we'd better do a complete guest
> > > recovery for correctness.  
> > 
> > We don't know whether the host link reset is incomplete, but we can't do
> > a link reset transparently to the device, the device is no longer in the
> > same state after the reset.  The device specific driver, which exists
> > in userspace needs to be involved in device recovery.  Therefore
> > regardless of how QEMU handles the error, the driver within the guest
> > needs to be notified and perform recovery.  Since the device is PCI and
> > we're on x86 and nobody wants to introduce paravirtual error recovery,
> > we must use AER.  Part of AER recovery includes the possibility of
> > performing a link reset.  So it seems this eliminates avoiding the link
> > reset within the guest.
> > 
> > That leaves QEMU.  Here we need to decide whether a guest triggered
> > link reset induces a host link reset.  The current working theory is
> > that yes, this must be the case.  If there is ever a case where a
> > driver within the guest could trigger a link reset for the purposes
> > of error recovery when the host has not, I think this must be the
> > case.  Therefore, at least some guest induced link resets must become
> > host link resets.  Currently we assume all guest induced link resets
> > become host link resets.  Minimally to avoid that, QEMU would need to
> > know (not assume) whether the host performed a link reset.  Even with
> > that, QEMU would need to be able to correlate that a link reset from
> > the guest is a duplicate of a link reset that was already performed by
> > the host.  That implies that QEMU needs to deduce the intention of
> > the guest.  That seems like a complicated task for a patch series that
> > is already complicated enough, especially for a feature of questionable
> > value given the configuration restrictions (imo).
> > 
> > I would much rather focus on getting it right and making it as simple
> > as we can, even if that means links get reset one too many times on
> > error.
> >   
> > > >> 2. skip link reset in aer driver of host kernel, for vfio-pci.
> > > >>    Let user decide how to do serious recovery
> > > >>
> > > >>    add new field "user_driver" in struct pci_dev, used to skip link
> > > >>    reset for vfio-pci; add new field "link_reset" in struct
> > > >>    vfio_pci_device to indicate link has been reset or not during
> > > >>    recovery
> > > >>
> > > >>    - set user_driver in vfio_pci_probe(), to skip link reset for
> > > >>      vfio-pci in host.
> > > >>    - (use a flag)block user access(config, mmio) during host recovery
> > > >>      (not sure if this step is necessary)
> > > >>    - In qemu, translate guest link reset to host link reset.
> > > >>    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
> > > >>      is executed
> > > >>    - In vfio-pci driver's resume, new a timer, check "link_reset" field
> > > >>      periodically, if it is set in reasonable time, then clear it and
> > > >>      delete timer, or else, vfio-pci driver will does the link reset!    
> > > > 
> > > > What happens in the case of a multifunction device where each function
> > > > is part of a separate IOMMU group and one function is hot-removed from
> > > > the user? We can't do a link reset on that function since the other
> > > > function is still in use.  We have no choice but release a device in an
> > > > unknown state back to the host.    
> > > 
> > > hot-remove from user, do you mean, for example, all functions assigned
> > > to VM, then suddenly a person does something like following
> > > 
> > > $ echo 0000:06:00.0 > /sys/bus/pci/drivers/vfio-pci/unbind
> > > 
> > > $ echo 0000:06:00.0 > /sys/bus/pci/drivers/igb/bind
> > > 
> > > to return device to host driver, or don't bind it to host driver, let it
> > > in driver-less state???  
> > 
> > Yes, the host kernel has no visiblity to how a user is making use of
> > devices.  To support AER we require a similar topology between host and
> > guest such that a guest link reset translates to a host reset.  That
> > requirement is imposed by userspace, ie. QEMU.  The host kernel cannot
> > presume that this is the case.  
> 
> So enforce this to enable recovery functionality. Why can't you?

How?
 
> >  Therefore we could have a
> > multi-function device where each function is assigned to the same or
> > different users in any configuration.  If a fault occurs and we defer
> > to the user to perform the link reset, we have absolutely no guarantee
> > that it will ever occur.  If the functions are assigned to different
> > users, then each user individually doesn't have the capability to
> > perform a link reset.  If the devices happen to be assigned to a single
> > user when the error occurs, we cannot assume the user has an AER
> > compatible configuration, the devices could be exposed as separate
> > single function devices, any one of which might be individually removed
> > from the user and made use of by the host, such as your sysfs example
> > above.  The host cannot perform a link reset in this case either
> > as the sibling devices are still in use by the guest.  Thanks,
> > 
> > Alex  

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 14, 2016, 11 p.m. UTC | #44
On Wed, Dec 14, 2016 at 03:47:43PM -0700, Alex Williamson wrote:
> On Thu, 15 Dec 2016 00:20:20 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Tue, Dec 13, 2016 at 08:00:22PM -0700, Alex Williamson wrote:
> > > On Wed, 14 Dec 2016 03:58:17 +0200
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Tue, Dec 13, 2016 at 09:27:59AM -0700, Alex Williamson wrote:  
> > > > > On Tue, 13 Dec 2016 18:12:34 +0200
> > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > >     
> > > > > > On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:    
> > > > > > > On Tue, 13 Dec 2016 05:15:13 +0200
> > > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > >       
> > > > > > > > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:      
> > > > > > > > > > So just don't do it then. Topology must match between host and guest,
> > > > > > > > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > > > > > > > which we might be able to synchronize against.        
> > > > > > > > > 
> > > > > > > > > We're talking about host kernel level handling here.  The host kernel
> > > > > > > > > cannot defer the link reset to the user under the assumption that the
> > > > > > > > > user is handling the devices in a very specific way.  The moment we do
> > > > > > > > > that, we've lost.        
> > > > > > > > 
> > > > > > > > The way is same as baremetal though, so why not?      
> > > > > > > 
> > > > > > > How do we know this?  What if the user is dpdk?  The kernel is
> > > > > > > responsible for maintaining the integrity of the system and devices,
> > > > > > > not the user.
> > > > > > >       
> > > > > > > > And if user doesn't do what's expected, we can
> > > > > > > > do the full link reset on close.      
> > > > > > > 
> > > > > > > That's exactly my point, if we're talking about multiple devices,
> > > > > > > there's no guarantee that the close() for each is simultaneous.  If one
> > > > > > > function is released before the other we cannot do a bus reset.  If
> > > > > > > that device is then opened by another user before its sibling is
> > > > > > > released, then we once again cannot perform a link reset.  I don't
> > > > > > > think it would be reasonable to mark the released device quarantined
> > > > > > > until the sibling is released, that would be a terrible user experience.      
> > > > > > 
> > > > > > Not sure why you find it so terrible, and I don't think there's another way.    
> > > > > 
> > > > > If we can't do it without regressing the support we currently have,
> > > > > let's not do it at all.    
> > > > 
> > > > Why would we regress?  As long as there are no unrecoverable errors,
> > > > there's no need to change behaviour at all.  
> > > 
> > > Currently if a fatal error occurs we allow the host to reset the
> > > device, so to the best of our knowledge, the device is always reset.
> > > The proposal here allows gaps where we assume a particular guest
> > > behavior that allows the device to be returned to the host or opened by
> > > other users without that reset.  Any plan that relies on a specific
> > > user behavior is fundamentally wrong imo.
> > >   
> > > > Alex, do you have a picture of how error recovery can work in your mind?
> > > > Your answers seem to imply you do, and these patches don't implement
> > > > this correctly.  I'm not sure about others, but I for one am unable to
> > > > piece it together from the comments you provide.  If yes, could you
> > > > maybe do a short writeup of an architecture you would be comfortable
> > > > with?  
> > > 
> > > Clearly I have issues with this skip-the-host-reset plan, I don't think
> > > it works.  We cannot assume the user will do error recovery.  
> > 
> > Absolutely but we can defer recovery until device close.
> 
> No we can't, as I've tried to describe multiple times, if the functions
> are part of separate groups then they can be opened and closed
> asynchronously from each other and we may not have an opportunity where
> they are all closed together to perform a reset.
> The only option I can
> see for this is to quarantine the device, which as I've stated seems
> like a really poor solution.
> 
> > Possibly with userspace invoking an ioctl requesting this,
> > to make sure we don't break any legacy setups.
> 
> And how do we know that user is not malicious?  How do we police
> whether they actually perform a reset?  We can only track whether a
> reset has occurred, which leads back to the quarantine scenario.

I don't really know what do you call the quarantine scenario.
Malicious users get non working devices.
Why is this poor? 


> > >  As I
> > > stated previously we should enable the user to do error recovery
> > > without depending on the user to do error recovery.  I'm a bit lost why
> > > we're focusing on this approach when the v9 approach of blocking user
> > > access to the device during the host recovery seemed like a better
> > > solution.  I don't think I've said anything bad about that approach,
> > > but it does need further testing and debugging.  Nobody seems to be
> > > interested in debugging why it wasn't quite working to understand
> > > whether that was an implementation issue or a design issue. That's
> > > currently the leading approach from my perspective.  Thanks,
> > > 
> > > Alex  
> > 
> > Hmm this doesn't really write up the architecture. It would be
> > really helpful to have that writeup.
> 
> Perhaps the patch submitters can create one.

You already spent so many words on this - won't it
be easier for you to just write up what you consider
the right thing to do?


> > Do I guess right that what v9 tried to do is:
> > - block future userspace accesses
> > - do full reset
> > - re-enable userspace accesses
> > 
> > ?
> > 
> > The obvious issue with that is that device loses its state apparently at
> > random, and this is not something that happens on baremetal, ever.
> > 
> > Thus I don't see how this can work without some PV code in guest.
> 
> So there is no link error that can occur on hardware where the device
> becomes inaccessible?  I'm pretty sure I've experienced cases
> otherwise.

There are errors like this (losing link) but this is IMO a major error
which isn't covered by AER.

It's possible that some guests have AER code that happens to recover
from lost link, but IMO that's never guaranteed.

Converting an uncorrectable error like a lost write into link loss
means escalating the problem to a whole different level.
Alex Williamson Dec. 14, 2016, 11:32 p.m. UTC | #45
On Thu, 15 Dec 2016 01:00:06 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Dec 14, 2016 at 03:47:43PM -0700, Alex Williamson wrote:
> > On Thu, 15 Dec 2016 00:20:20 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Tue, Dec 13, 2016 at 08:00:22PM -0700, Alex Williamson wrote:  
> > > > On Wed, 14 Dec 2016 03:58:17 +0200
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > >     
> > > > > On Tue, Dec 13, 2016 at 09:27:59AM -0700, Alex Williamson wrote:    
> > > > > > On Tue, 13 Dec 2016 18:12:34 +0200
> > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > >       
> > > > > > > On Mon, Dec 12, 2016 at 08:39:48PM -0700, Alex Williamson wrote:      
> > > > > > > > On Tue, 13 Dec 2016 05:15:13 +0200
> > > > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > >         
> > > > > > > > > On Mon, Dec 12, 2016 at 03:43:13PM -0700, Alex Williamson wrote:        
> > > > > > > > > > > So just don't do it then. Topology must match between host and guest,
> > > > > > > > > > > except maybe for the case of devices with host driver  (e.g. PF)
> > > > > > > > > > > which we might be able to synchronize against.          
> > > > > > > > > > 
> > > > > > > > > > We're talking about host kernel level handling here.  The host kernel
> > > > > > > > > > cannot defer the link reset to the user under the assumption that the
> > > > > > > > > > user is handling the devices in a very specific way.  The moment we do
> > > > > > > > > > that, we've lost.          
> > > > > > > > > 
> > > > > > > > > The way is same as baremetal though, so why not?        
> > > > > > > > 
> > > > > > > > How do we know this?  What if the user is dpdk?  The kernel is
> > > > > > > > responsible for maintaining the integrity of the system and devices,
> > > > > > > > not the user.
> > > > > > > >         
> > > > > > > > > And if user doesn't do what's expected, we can
> > > > > > > > > do the full link reset on close.        
> > > > > > > > 
> > > > > > > > That's exactly my point, if we're talking about multiple devices,
> > > > > > > > there's no guarantee that the close() for each is simultaneous.  If one
> > > > > > > > function is released before the other we cannot do a bus reset.  If
> > > > > > > > that device is then opened by another user before its sibling is
> > > > > > > > released, then we once again cannot perform a link reset.  I don't
> > > > > > > > think it would be reasonable to mark the released device quarantined
> > > > > > > > until the sibling is released, that would be a terrible user experience.        
> > > > > > > 
> > > > > > > Not sure why you find it so terrible, and I don't think there's another way.      
> > > > > > 
> > > > > > If we can't do it without regressing the support we currently have,
> > > > > > let's not do it at all.      
> > > > > 
> > > > > Why would we regress?  As long as there are no unrecoverable errors,
> > > > > there's no need to change behaviour at all.    
> > > > 
> > > > Currently if a fatal error occurs we allow the host to reset the
> > > > device, so to the best of our knowledge, the device is always reset.
> > > > The proposal here allows gaps where we assume a particular guest
> > > > behavior that allows the device to be returned to the host or opened by
> > > > other users without that reset.  Any plan that relies on a specific
> > > > user behavior is fundamentally wrong imo.
> > > >     
> > > > > Alex, do you have a picture of how error recovery can work in your mind?
> > > > > Your answers seem to imply you do, and these patches don't implement
> > > > > this correctly.  I'm not sure about others, but I for one am unable to
> > > > > piece it together from the comments you provide.  If yes, could you
> > > > > maybe do a short writeup of an architecture you would be comfortable
> > > > > with?    
> > > > 
> > > > Clearly I have issues with this skip-the-host-reset plan, I don't think
> > > > it works.  We cannot assume the user will do error recovery.    
> > > 
> > > Absolutely but we can defer recovery until device close.  
> > 
> > No we can't, as I've tried to describe multiple times, if the functions
> > are part of separate groups then they can be opened and closed
> > asynchronously from each other and we may not have an opportunity where
> > they are all closed together to perform a reset.
> > The only option I can
> > see for this is to quarantine the device, which as I've stated seems
> > like a really poor solution.
> >   
> > > Possibly with userspace invoking an ioctl requesting this,
> > > to make sure we don't break any legacy setups.  
> > 
> > And how do we know that user is not malicious?  How do we police
> > whether they actually perform a reset?  We can only track whether a
> > reset has occurred, which leads back to the quarantine scenario.  
> 
> I don't really know what do you call the quarantine scenario.
> Malicious users get non working devices.
> Why is this poor? 

If we were to skip the host link reset portion of recovery, we'd need
to mark the devices that would have been reset with some sort of dirty
flag.  A user performing a link reset affecting the device would clear
the dirty flag.  If a group is released by the user with devices having
the flag set, the group would be quarantined and we'd need to prevent
the devices within it from being released to the host or the group
being opened by another user.  vfio-pci would need to actively try to
reset the group whenever another group is released.

It's not a malicious user shooting themselves, it's a malicious user
causing a DoS by leaving a device in a tainted state where we cannot
reuse it.  It also inches us ever further up the complexity scale for
this feature.

> > > >  As I
> > > > stated previously we should enable the user to do error recovery
> > > > without depending on the user to do error recovery.  I'm a bit lost why
> > > > we're focusing on this approach when the v9 approach of blocking user
> > > > access to the device during the host recovery seemed like a better
> > > > solution.  I don't think I've said anything bad about that approach,
> > > > but it does need further testing and debugging.  Nobody seems to be
> > > > interested in debugging why it wasn't quite working to understand
> > > > whether that was an implementation issue or a design issue. That's
> > > > currently the leading approach from my perspective.  Thanks,
> > > > 
> > > > Alex    
> > > 
> > > Hmm this doesn't really write up the architecture. It would be
> > > really helpful to have that writeup.  
> > 
> > Perhaps the patch submitters can create one.  
> 
> You already spent so many words on this - won't it
> be easier for you to just write up what you consider
> the right thing to do?
>
> > > Do I guess right that what v9 tried to do is:
> > > - block future userspace accesses
> > > - do full reset
> > > - re-enable userspace accesses
> > > 
> > > ?
> > > 
> > > The obvious issue with that is that device loses its state apparently at
> > > random, and this is not something that happens on baremetal, ever.
> > > 
> > > Thus I don't see how this can work without some PV code in guest.  
> > 
> > So there is no link error that can occur on hardware where the device
> > becomes inaccessible?  I'm pretty sure I've experienced cases
> > otherwise.  
> 
> There are errors like this (losing link) but this is IMO a major error
> which isn't covered by AER.
> 
> It's possible that some guests have AER code that happens to recover
> from lost link, but IMO that's never guaranteed.
> 
> Converting an uncorrectable error like a lost write into link loss
> means escalating the problem to a whole different level.

If a host induced link reset cannot work, then we're at an impasse and
I have no recommended architecture to write-up.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cao jin Dec. 15, 2016, 1:56 p.m. UTC | #46
On 12/15/2016 06:16 AM, Alex Williamson wrote:
> On Wed, 14 Dec 2016 18:24:23 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
>> Sorry for late.
>> after reading all your comments, I think I will try the solution 1.
>>
>> On 12/13/2016 03:12 AM, Alex Williamson wrote:
>>> On Mon, 12 Dec 2016 21:49:01 +0800
>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>   
>>>> Hi,
>>>> I have 2 solutions(high level design) came to me, please see if they are
>>>> acceptable, or which one is acceptable. Also have some questions.
>>>>
>>>> 1. block guest access during host recovery
>>>>
>>>>    add new field error_recovering in struct vfio_pci_device to
>>>>    indicate host recovery status. aer driver in host will still do
>>>>    reset link
>>>>
>>>>    - set error_recovering in vfio-pci driver's error_detected, used to
>>>>      block all kinds of user access(config space, mmio)
>>>>    - in order to solve concurrent issue of device resetting & user
>>>>      access, check device state[*] in vfio-pci driver's resume, see if
>>>>      device reset is done, if it is, then clear"error_recovering", or
>>>>      else new a timer, check device state periodically until device
>>>>      reset is done. (what if device reset don't end for a long time?)
>>>>    - In qemu, translate guest link reset to host link reset.
>>>>      A question here: we already have link reset in host, is a second
>>>>      link reset necessary? why?
>>>>  
>>>>    [*] how to check device state: reading certain config space
>>>>        register, check return value is valid or not(All F's)  
>>>
>>> Isn't this exactly the path we were on previously?  
>>
>> Yes, it is basically the previous path, plus the optimization.
>>
>>> There might be an
>>> optimization that we could skip back-to-back resets, but how can you
>>> necessarily infer that the resets are for the same thing? If the user
>>> accesses the device between resets, can you still guarantee the guest
>>> directed reset is unnecessary?  If time passes between resets, do you
>>> know they're for the same event?  How much time can pass between the
>>> host and guest reset to know they're for the same event?  In the
>>> process of error handling, which is more important, speed or
>>> correctness?
>>>    
>>
>> I think vfio driver itself won't know what each reset comes for, and I
>> don't quite understand why should vfio care this question, is this a new
>> question in the design?
> 
> You're suggesting an optimization to eliminate one of the resets,
> and as we've discussed, I don't see removing the host induced reset
> as a viable option.  That means you want to eliminate the guest
> directed reset.  There are potentially three levels to do that, the
> vfio-pci driver in the host kernel, somewhere in QEMU, or eliminate it
> within the guest.  My comments were directed to the first option, the
> host kernel level cannot correlate user directed resets as duplicates
> of host directed resets.  
>  

Ah, maybe it is mistake, I don't really want to eliminate guest directed
reset very much, I was just not sure why it is very necessary.

The optimization I said just is fully separating host recovery from
guest recovery(timer, check device periodically) in time, because there
is concurrent device resetting & user access.

>> But I think it make sense that the user access during 2 resets maybe a
>> trouble for guest recovery, misbehaved user could be out of our
>> imagination.  Correctness is more important.
>>
>> If I understand you right, let me make a summary: host recovery just
>> does link reset, which is incomplete, so we'd better do a complete guest
>> recovery for correctness.
> 
> We don't know whether the host link reset is incomplete, but we can't do
> a link reset transparently to the device, the device is no longer in the
> same state after the reset.  The device specific driver, which exists
> in userspace needs to be involved in device recovery.  Therefore
> regardless of how QEMU handles the error, the driver within the guest
> needs to be notified and perform recovery.  Since the device is PCI and
> we're on x86 and nobody wants to introduce paravirtual error recovery,
> we must use AER.  Part of AER recovery includes the possibility of
> performing a link reset.  So it seems this eliminates avoiding the link
> reset within the guest.
> 
> That leaves QEMU.  Here we need to decide whether a guest triggered
> link reset induces a host link reset.  The current working theory is
> that yes, this must be the case.  If there is ever a case where a
> driver within the guest could trigger a link reset for the purposes
> of error recovery when the host has not, I think this must be the
> case.  Therefore, at least some guest induced link resets must become
> host link resets.  Currently we assume all guest induced link resets
> become host link resets.  Minimally to avoid that, QEMU would need to
> know (not assume) whether the host performed a link reset.  Even with
> that, QEMU would need to be able to correlate that a link reset from
> the guest is a duplicate of a link reset that was already performed by
> the host.  That implies that QEMU needs to deduce the intention of
> the guest.  That seems like a complicated task for a patch series that
> is already complicated enough, especially for a feature of questionable
> value given the configuration restrictions (imo).
> 
> I would much rather focus on getting it right and making it as simple
> as we can, even if that means links get reset one too many times on
> error.
> 

Thanks very much for your detailed explanation, it does helps me to
understand your concern, understand why a second link reset is necessary.

I still want to share my thoughts with you(not argue): now we know host
aer driver will do link reset for vfio-pci first, so I can say, even if
fatal error is link related, after host link reset, link can work now.
Then in qemu, we are not necessary to translate guest link reset to host
link reset, just use vfio_pci_reset() as it is to do device
reset(probably is FLR). Which also means we don't need following
patch(make code easier):

@@ -3120,6 +3122,18 @@ static void vfio_pci_reset(DeviceState *dev)

      trace_vfio_pci_reset(vdev->vbasedev.name);

+     if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+         PCIDevice *br = pci_bridge_get_device(pdev->bus);
+
+         if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
+              PCI_BRIDGE_CTL_BUS_RESET)) {
+             if (pci_get_function_0(pdev) == pdev) {
+                 vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+             }
+             return;
+         }
+     }
+
      vfio_pci_pre_reset(vdev);


I think this also implies: we have a virtual link in qemu, but a virtual
link will never be broken like a physical link.(In particular we already
know host aer driver surely will do link reset to recover physical
link). So, guest's link reset don't need to care whether virtual link is
reset, just care virtual device.  And qemu "translates guest link reset
to host link reset" seems kind of taking link-reset responsibility over
from host:)

>>>> 2. skip link reset in aer driver of host kernel, for vfio-pci.
>>>>    Let user decide how to do serious recovery
>>>>
>>>>    add new field "user_driver" in struct pci_dev, used to skip link
>>>>    reset for vfio-pci; add new field "link_reset" in struct
>>>>    vfio_pci_device to indicate link has been reset or not during
>>>>    recovery
>>>>
>>>>    - set user_driver in vfio_pci_probe(), to skip link reset for
>>>>      vfio-pci in host.
>>>>    - (use a flag)block user access(config, mmio) during host recovery
>>>>      (not sure if this step is necessary)
>>>>    - In qemu, translate guest link reset to host link reset.
>>>>    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
>>>>      is executed
>>>>    - In vfio-pci driver's resume, new a timer, check "link_reset" field
>>>>      periodically, if it is set in reasonable time, then clear it and
>>>>      delete timer, or else, vfio-pci driver will does the link reset!  
>>>
>>> What happens in the case of a multifunction device where each function
>>> is part of a separate IOMMU group and one function is hot-removed from
>>> the user? We can't do a link reset on that function since the other
>>> function is still in use.  We have no choice but release a device in an
>>> unknown state back to the host.  
>>
>> hot-remove from user, do you mean, for example, all functions assigned
>> to VM, then suddenly a person does something like following
>>
>> $ echo 0000:06:00.0 > /sys/bus/pci/drivers/vfio-pci/unbind
>>
>> $ echo 0000:06:00.0 > /sys/bus/pci/drivers/igb/bind
>>
>> to return device to host driver, or don't bind it to host driver, let it
>> in driver-less state???
> 
> Yes, the host kernel has no visiblity to how a user is making use of
> devices.  To support AER we require a similar topology between host and
> guest such that a guest link reset translates to a host reset.  That
> requirement is imposed by userspace, ie. QEMU.  The host kernel cannot
> presume that this is the case.  Therefore we could have a
> multi-function device where each function is assigned to the same or
> different users in any configuration.  If a fault occurs and we defer
> to the user to perform the link reset, we have absolutely no guarantee
> that it will ever occur.  If the functions are assigned to different
> users, then each user individually doesn't have the capability to
> perform a link reset.  If the devices happen to be assigned to a single
> user when the error occurs, we cannot assume the user has an AER
> compatible configuration, the devices could be exposed as separate
> single function devices, any one of which might be individually removed
> from the user and made use of by the host, such as your sysfs example
> above.  The host cannot perform a link reset in this case either
> as the sibling devices are still in use by the guest.  Thanks,
> 
> Alex
> 
> 

this explanation is valuable to me, so this is also why we can't do link
reset in vfio driver when one of the function is closed. And do link
reset in vfio driver until all functions are close is poor solution and
very complex(quarantine the device) as you said.

I am going to try solution 1, but I still have some consideration share
with you, this won't stop my trial, and don't have relationship with
above discussion, just FYI:

In non-virtuallization environment, from device's perspective, the steps
of a normal recovery consists of:
    error_detect
    mmio_enabled
    link_reset
    slot_reset
    resume

Now in our condition, the steps become:
    *link_reset* (host's, the following are guest's)
    error_detect
    mmio_enabled
    link_reset
    slot_reset
    resume

Especially, some device's specific driver in guest could do some
specific work in error_detect, take igb_io_error_detected() for example.
Like the words in pci-error-recovery.txt said:

it gives the driver a chance to cleanup, waiting for pending stuff
(timers, whatever, etc...) to complete;

But if link_reset is the first step, we lost all the status(register
value, etc) in the device. Of course I don't know if this will be a
problem (might not), just curious if this has been your concern:)
Alex Williamson Dec. 15, 2016, 5:02 p.m. UTC | #47
On Thu, 15 Dec 2016 21:56:41 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> On 12/15/2016 06:16 AM, Alex Williamson wrote:
> > On Wed, 14 Dec 2016 18:24:23 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >   
> >> Sorry for late.
> >> after reading all your comments, I think I will try the solution 1.
> >>
> >> On 12/13/2016 03:12 AM, Alex Williamson wrote:  
> >>> On Mon, 12 Dec 2016 21:49:01 +0800
> >>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >>>     
> >>>> Hi,
> >>>> I have 2 solutions(high level design) came to me, please see if they are
> >>>> acceptable, or which one is acceptable. Also have some questions.
> >>>>
> >>>> 1. block guest access during host recovery
> >>>>
> >>>>    add new field error_recovering in struct vfio_pci_device to
> >>>>    indicate host recovery status. aer driver in host will still do
> >>>>    reset link
> >>>>
> >>>>    - set error_recovering in vfio-pci driver's error_detected, used to
> >>>>      block all kinds of user access(config space, mmio)
> >>>>    - in order to solve concurrent issue of device resetting & user
> >>>>      access, check device state[*] in vfio-pci driver's resume, see if
> >>>>      device reset is done, if it is, then clear"error_recovering", or
> >>>>      else new a timer, check device state periodically until device
> >>>>      reset is done. (what if device reset don't end for a long time?)
> >>>>    - In qemu, translate guest link reset to host link reset.
> >>>>      A question here: we already have link reset in host, is a second
> >>>>      link reset necessary? why?
> >>>>  
> >>>>    [*] how to check device state: reading certain config space
> >>>>        register, check return value is valid or not(All F's)    
> >>>
> >>> Isn't this exactly the path we were on previously?    
> >>
> >> Yes, it is basically the previous path, plus the optimization.
> >>  
> >>> There might be an
> >>> optimization that we could skip back-to-back resets, but how can you
> >>> necessarily infer that the resets are for the same thing? If the user
> >>> accesses the device between resets, can you still guarantee the guest
> >>> directed reset is unnecessary?  If time passes between resets, do you
> >>> know they're for the same event?  How much time can pass between the
> >>> host and guest reset to know they're for the same event?  In the
> >>> process of error handling, which is more important, speed or
> >>> correctness?
> >>>      
> >>
> >> I think vfio driver itself won't know what each reset comes for, and I
> >> don't quite understand why should vfio care this question, is this a new
> >> question in the design?  
> > 
> > You're suggesting an optimization to eliminate one of the resets,
> > and as we've discussed, I don't see removing the host induced reset
> > as a viable option.  That means you want to eliminate the guest
> > directed reset.  There are potentially three levels to do that, the
> > vfio-pci driver in the host kernel, somewhere in QEMU, or eliminate it
> > within the guest.  My comments were directed to the first option, the
> > host kernel level cannot correlate user directed resets as duplicates
> > of host directed resets.  
> >    
> 
> Ah, maybe it is mistake, I don't really want to eliminate guest directed
> reset very much, I was just not sure why it is very necessary.
> 
> The optimization I said just is fully separating host recovery from
> guest recovery(timer, check device periodically) in time, because there
> is concurrent device resetting & user access.
> 
> >> But I think it make sense that the user access during 2 resets maybe a
> >> trouble for guest recovery, misbehaved user could be out of our
> >> imagination.  Correctness is more important.
> >>
> >> If I understand you right, let me make a summary: host recovery just
> >> does link reset, which is incomplete, so we'd better do a complete guest
> >> recovery for correctness.  
> > 
> > We don't know whether the host link reset is incomplete, but we can't do
> > a link reset transparently to the device, the device is no longer in the
> > same state after the reset.  The device specific driver, which exists
> > in userspace needs to be involved in device recovery.  Therefore
> > regardless of how QEMU handles the error, the driver within the guest
> > needs to be notified and perform recovery.  Since the device is PCI and
> > we're on x86 and nobody wants to introduce paravirtual error recovery,
> > we must use AER.  Part of AER recovery includes the possibility of
> > performing a link reset.  So it seems this eliminates avoiding the link
> > reset within the guest.
> > 

vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv

> > That leaves QEMU.  Here we need to decide whether a guest triggered
> > link reset induces a host link reset.  The current working theory is
> > that yes, this must be the case.  If there is ever a case where a
> > driver within the guest could trigger a link reset for the purposes
> > of error recovery when the host has not, I think this must be the
> > case.

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> >  Therefore, at least some guest induced link resets must become
> > host link resets.  Currently we assume all guest induced link resets
> > become host link resets.  Minimally to avoid that, QEMU would need to
> > know (not assume) whether the host performed a link reset.  Even with
> > that, QEMU would need to be able to correlate that a link reset from
> > the guest is a duplicate of a link reset that was already performed by
> > the host.  That implies that QEMU needs to deduce the intention of
> > the guest.  That seems like a complicated task for a patch series that
> > is already complicated enough, especially for a feature of questionable
> > value given the configuration restrictions (imo).
> > 
> > I would much rather focus on getting it right and making it as simple
> > as we can, even if that means links get reset one too many times on
> > error.
> >   
> 
> Thanks very much for your detailed explanation, it does helps me to
> understand your concern, understand why a second link reset is necessary.
> 
> I still want to share my thoughts with you(not argue): now we know host
> aer driver will do link reset for vfio-pci first, so I can say, even if
> fatal error is link related, after host link reset, link can work now.
> Then in qemu, we are not necessary to translate guest link reset to host
> link reset, just use vfio_pci_reset() as it is to do device
> reset(probably is FLR). Which also means we don't need following
> patch(make code easier):

This ignores the statement that I've highlighted above.

> 
> @@ -3120,6 +3122,18 @@ static void vfio_pci_reset(DeviceState *dev)
> 
>       trace_vfio_pci_reset(vdev->vbasedev.name);
> 
> +     if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
> +         PCIDevice *br = pci_bridge_get_device(pdev->bus);
> +
> +         if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
> +              PCI_BRIDGE_CTL_BUS_RESET)) {
> +             if (pci_get_function_0(pdev) == pdev) {
> +                 vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +             }
> +             return;
> +         }
> +     }
> +
>       vfio_pci_pre_reset(vdev);
> 
> 
> I think this also implies: we have a virtual link in qemu, but a virtual
> link will never be broken like a physical link.(In particular we already
> know host aer driver surely will do link reset to recover physical
> link). So, guest's link reset don't need to care whether virtual link is
> reset, just care virtual device.  And qemu "translates guest link reset
> to host link reset" seems kind of taking link-reset responsibility over
> from host:)

Again, this is not taking into account the outstanding question of
whether it's every possible that a guest driver may request a link
reset when the host has not.  Without evaluating that topic this is an
invalid path to follow.

> >>>> 2. skip link reset in aer driver of host kernel, for vfio-pci.
> >>>>    Let user decide how to do serious recovery
> >>>>
> >>>>    add new field "user_driver" in struct pci_dev, used to skip link
> >>>>    reset for vfio-pci; add new field "link_reset" in struct
> >>>>    vfio_pci_device to indicate link has been reset or not during
> >>>>    recovery
> >>>>
> >>>>    - set user_driver in vfio_pci_probe(), to skip link reset for
> >>>>      vfio-pci in host.
> >>>>    - (use a flag)block user access(config, mmio) during host recovery
> >>>>      (not sure if this step is necessary)
> >>>>    - In qemu, translate guest link reset to host link reset.
> >>>>    - In vfio-pci driver, set link_reset after VFIO_DEVICE_PCI_HOT_RESET
> >>>>      is executed
> >>>>    - In vfio-pci driver's resume, new a timer, check "link_reset" field
> >>>>      periodically, if it is set in reasonable time, then clear it and
> >>>>      delete timer, or else, vfio-pci driver will does the link reset!    
> >>>
> >>> What happens in the case of a multifunction device where each function
> >>> is part of a separate IOMMU group and one function is hot-removed from
> >>> the user? We can't do a link reset on that function since the other
> >>> function is still in use.  We have no choice but release a device in an
> >>> unknown state back to the host.    
> >>
> >> hot-remove from user, do you mean, for example, all functions assigned
> >> to VM, then suddenly a person does something like following
> >>
> >> $ echo 0000:06:00.0 > /sys/bus/pci/drivers/vfio-pci/unbind
> >>
> >> $ echo 0000:06:00.0 > /sys/bus/pci/drivers/igb/bind
> >>
> >> to return device to host driver, or don't bind it to host driver, let it
> >> in driver-less state???  
> > 
> > Yes, the host kernel has no visiblity to how a user is making use of
> > devices.  To support AER we require a similar topology between host and
> > guest such that a guest link reset translates to a host reset.  That
> > requirement is imposed by userspace, ie. QEMU.  The host kernel cannot
> > presume that this is the case.  Therefore we could have a
> > multi-function device where each function is assigned to the same or
> > different users in any configuration.  If a fault occurs and we defer
> > to the user to perform the link reset, we have absolutely no guarantee
> > that it will ever occur.  If the functions are assigned to different
> > users, then each user individually doesn't have the capability to
> > perform a link reset.  If the devices happen to be assigned to a single
> > user when the error occurs, we cannot assume the user has an AER
> > compatible configuration, the devices could be exposed as separate
> > single function devices, any one of which might be individually removed
> > from the user and made use of by the host, such as your sysfs example
> > above.  The host cannot perform a link reset in this case either
> > as the sibling devices are still in use by the guest.  Thanks,
> > 
> > Alex
> > 
> >   
> 
> this explanation is valuable to me, so this is also why we can't do link
> reset in vfio driver when one of the function is closed. And do link
> reset in vfio driver until all functions are close is poor solution and
> very complex(quarantine the device) as you said.
> 
> I am going to try solution 1, but I still have some consideration share
> with you, this won't stop my trial, and don't have relationship with
> above discussion, just FYI:
> 
> In non-virtuallization environment, from device's perspective, the steps
> of a normal recovery consists of:
>     error_detect
>     mmio_enabled
>     link_reset
>     slot_reset
>     resume
> 
> Now in our condition, the steps become:
>     *link_reset* (host's, the following are guest's)
>     error_detect
>     mmio_enabled
>     link_reset
>     slot_reset
>     resume
> 
> Especially, some device's specific driver in guest could do some
> specific work in error_detect, take igb_io_error_detected() for example.
> Like the words in pci-error-recovery.txt said:
> 
> it gives the driver a chance to cleanup, waiting for pending stuff
> (timers, whatever, etc...) to complete;
> 
> But if link_reset is the first step, we lost all the status(register
> value, etc) in the device. Of course I don't know if this will be a
> problem (might not), just curious if this has been your concern:)

As I noted previously, I think you have to prove that a guest will
never issue a link reset that is not accounted for by the host reset in
order to proceed down this path.  Any possibility that a guest requires
a link reset that hasn't been provided by the host takes us back to the
course where a guest link reset must induce a host link reset and
correlating one to the other to try to optimize this out becomes very
difficult.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 521e39c..289fb8e 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -496,7 +496,17 @@  static void do_recovery(struct pci_dev *dev, int severity)
 			"error_detected",
 			report_error_detected);
 
-	if (severity == AER_FATAL) {
+	/* vfio-pci as a general meta driver, it actually couldn't do any real
+	 * recovery for device. It is user space driver, or device-specific
+	 * driver in guest who should take care of the serious error recovery,
+	 * link reset actually is one part of whole recovery. Doing reset_link
+	 * in aer driver of host kernel for vfio-pci devices will cause many
+	 * trouble for user space driver or guest's device-specific driver,
+	 * for example: the serious recovery often need to read register in
+	 * config space, but if register reading happens during link-resetting,
+	 * it is quite possible to return invalid value like all F's, which
+	 * will result in unpredictable error. */
+	if (severity == AER_FATAL && strcmp(dev->driver->name, "vfio-pci")) {
 		result = reset_link(dev);
 		if (result != PCI_ERS_RESULT_RECOVERED)
 			goto failed;
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 712a849..aefd751 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -535,6 +535,15 @@  static long vfio_pci_ioctl(void *device_data,
 	struct vfio_pci_device *vdev = device_data;
 	unsigned long minsz;
 
+	if (vdev->aer_recovering && (cmd == VFIO_DEVICE_SET_IRQS ||
+	    cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+		int ret;
+		ret = wait_for_completion_interruptible(
+			&vdev->aer_error_completion);
+		if (ret)
+			return ret;
+	}
+
 	if (cmd == VFIO_DEVICE_GET_INFO) {
 		struct vfio_device_info info;
 
@@ -1117,6 +1126,7 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
 	mutex_init(&vdev->igate);
 	spin_lock_init(&vdev->irqlock);
+	init_completion(&vdev->aer_error_completion);
 
 	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
 	if (ret) {
@@ -1176,6 +1186,9 @@  static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
 {
 	struct vfio_pci_device *vdev;
 	struct vfio_device *device;
+	u32 uncor_status = 0;
+	unsigned int aer_cap_offset = 0;
+	int ret;
 
 	device = vfio_device_get_from_dev(&pdev->dev);
 	if (device == NULL)
@@ -1187,10 +1200,30 @@  static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
+	/* get device's uncorrectable error status as soon as possible,
+	 * and signal it to user space. The later we read it, the possibility
+	 * the register value is mangled grows. */
+	aer_cap_offset = pci_find_ext_capability(vdev->pdev, PCI_EXT_CAP_ID_ERR);
+	ret = pci_read_config_dword(vdev->pdev, aer_cap_offset + 
+                                    PCI_ERR_UNCOR_STATUS, &uncor_status);
+        if (ret)
+                return PCI_ERS_RESULT_DISCONNECT;
+
+	pr_err("device %d got AER detect notification. uncorrectable error status = 0x%x\n", pdev->devfn, uncor_status);//to be removed
 	mutex_lock(&vdev->igate);
+    
+	vdev->aer_recovering = true;
+	reinit_completion(&vdev->aer_error_completion);
+
+	/* suspend config space access from user space,
+	 * when vfio-pci's error recovery process is on */
+	pci_cfg_access_trylock(vdev->pdev);
 
-	if (vdev->err_trigger)
-		eventfd_signal(vdev->err_trigger, 1);
+	if (vdev->err_trigger && uncor_status) {
+		pr_err("device %d signal uncor status to user space", pdev->devfn);//may be removed
+		/* signal uncorrectable error status to user space */
+		eventfd_signal(vdev->err_trigger, uncor_status);
+        }
 
 	mutex_unlock(&vdev->igate);
 
@@ -1199,8 +1232,34 @@  static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+	struct vfio_pci_device *vdev;
+	struct vfio_device *device;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (device == NULL)
+		return;
+
+	vdev = vfio_device_data(device);
+	if (vdev == NULL) {
+		vfio_device_put(device);
+		return;
+	}
+
+	/* vfio-pci's error recovery is done, time to
+	 * resume pci config space's accesses */
+	pci_cfg_access_unlock(vdev->pdev);
+
+	vdev->aer_recovering = false;
+	complete_all(&vdev->aer_error_completion);
+
+	vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
 	.error_detected = vfio_pci_aer_err_detected,
+	.resume         = vfio_pci_aer_resume,
 };
 
 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 8a7d546..ebf1041 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -83,6 +83,8 @@  struct vfio_pci_device {
 	bool			bardirty;
 	bool			has_vga;
 	bool			needs_reset;
+	bool			aer_recovering;
+	struct completion	aer_error_completion;
 	struct pci_saved_state	*pci_saved_state;
 	int			refcnt;
 	struct eventfd_ctx	*err_trigger;