diff mbox series

[V3] virtio: disable notification hardening by default

Message ID 20220622012940.21441-1-jasowang@redhat.com (mailing list archive)
State New, archived
Headers show
Series [V3] virtio: disable notification hardening by default | expand

Commit Message

Jason Wang June 22, 2022, 1:29 a.m. UTC
We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
harden vring IRQ"). It works with the assumption that the driver or
core can properly call virtio_device_ready() at the right
place. Unfortunately, this seems to be not true and uncover various
bugs of the existing drivers, mainly the issue of using
virtio_device_ready() incorrectly.

So let's having a Kconfig option and disable it by default. It gives
us a breath to fix the drivers and then we can consider to enable it
by default.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes since V2:
- Tweak the Kconfig help
- Add comment for the read_lock() pairing in virtio_ccw
---
 drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
 drivers/virtio/Kconfig           | 13 +++++++++++++
 drivers/virtio/virtio.c          |  2 ++
 drivers/virtio/virtio_ring.c     | 12 ++++++++++++
 include/linux/virtio_config.h    |  2 ++
 5 files changed, 37 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin June 22, 2022, 7:02 a.m. UTC | #1
On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> harden vring IRQ"). It works with the assumption that the driver or
> core can properly call virtio_device_ready() at the right
> place. Unfortunately, this seems to be not true and uncover various
> bugs of the existing drivers, mainly the issue of using
> virtio_device_ready() incorrectly.
> 
> So let's having a Kconfig option and disable it by default. It gives
> us a breath to fix the drivers and then we can consider to enable it
> by default.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>


OK I will queue, but I think the problem is fundamental.


> ---
> Changes since V2:
> - Tweak the Kconfig help
> - Add comment for the read_lock() pairing in virtio_ccw
> ---
>  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
>  drivers/virtio/Kconfig           | 13 +++++++++++++
>  drivers/virtio/virtio.c          |  2 ++
>  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
>  include/linux/virtio_config.h    |  2 ++
>  5 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> index 97e51c34e6cf..1f6a358f65f0 100644
> --- a/drivers/s390/virtio/virtio_ccw.c
> +++ b/drivers/s390/virtio/virtio_ccw.c
> @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
>  			vcdev->err = -EIO;
>  	}
>  	virtio_ccw_check_activity(vcdev, activity);
> -	/* Interrupts are disabled here */
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> +	/*
> +	 * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> +	 * disabled here.
> +	 */
>  	read_lock(&vcdev->irq_lock);
> +#endif
>  	for_each_set_bit(i, indicators(vcdev),
>  			 sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
>  		/* The bit clear must happen before the vring kick. */
> @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
>  		vq = virtio_ccw_vq_by_ind(vcdev, i);
>  		vring_interrupt(0, vq);
>  	}
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	read_unlock(&vcdev->irq_lock);
> +#endif
>  	if (test_bit(0, indicators2(vcdev))) {
>  		virtio_config_changed(&vcdev->vdev);
>  		clear_bit(0, indicators2(vcdev));
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index b5adf6abd241..c04f370a1e5c 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
>  
>  if VIRTIO_MENU
>  
> +config VIRTIO_HARDEN_NOTIFICATION
> +        bool "Harden virtio notification"
> +        help
> +          Enable this to harden the device notifications and suppress
> +          those that happen at a time where notifications are illegal.
> +
> +          Experimental: Note that several drivers still have bugs that
> +          may cause crashes or hangs when correct handling of
> +          notifications is enforced; depending on the subset of
> +          drivers and devices you use, this may or may not work.
> +
> +          If unsure, say N.
> +
>  config VIRTIO_PCI
>  	tristate "PCI driver for virtio devices"
>  	depends on PCI
> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> index ef04a96942bf..21dc08d2f32d 100644
> --- a/drivers/virtio/virtio.c
> +++ b/drivers/virtio/virtio.c
> @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
>   * */
>  void virtio_reset_device(struct virtio_device *dev)
>  {
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	/*
>  	 * The below virtio_synchronize_cbs() guarantees that any
>  	 * interrupt for this line arriving after
> @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
>  	 */
>  	virtio_break_device(dev);
>  	virtio_synchronize_cbs(dev);
> +#endif
>  
>  	dev->config->reset(dev);
>  }
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 13a7348cedff..d9d3b6e201fb 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
>  	vq->we_own_ring = true;
>  	vq->notify = notify;
>  	vq->weak_barriers = weak_barriers;
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	vq->broken = true;
> +#else
> +	vq->broken = false;
> +#endif
>  	vq->last_used_idx = 0;
>  	vq->event_triggered = false;
>  	vq->num_added = 0;
> @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>  	}
>  
>  	if (unlikely(vq->broken)) {
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  		dev_warn_once(&vq->vq.vdev->dev,
>  			      "virtio vring IRQ raised before DRIVER_OK");
>  		return IRQ_NONE;
> +#else
> +		return IRQ_HANDLED;
> +#endif
>  	}
>  
>  	/* Just a hint for performance: so it's ok that this can be racy! */
> @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  	vq->we_own_ring = false;
>  	vq->notify = notify;
>  	vq->weak_barriers = weak_barriers;
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	vq->broken = true;
> +#else
> +	vq->broken = false;
> +#endif
>  	vq->last_used_idx = 0;
>  	vq->event_triggered = false;
>  	vq->num_added = 0;
> diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> index 9a36051ceb76..d15c3cdda2d2 100644
> --- a/include/linux/virtio_config.h
> +++ b/include/linux/virtio_config.h
> @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
>  
>  	WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
>  
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	/*
>  	 * The virtio_synchronize_cbs() makes sure vring_interrupt()
>  	 * will see the driver specific setup if it sees vq->broken
> @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
>  	 */
>  	virtio_synchronize_cbs(dev);
>  	__virtio_unbreak_device(dev);
> +#endif
>  	/*
>  	 * The transport should ensure the visibility of vq->broken
>  	 * before setting DRIVER_OK. See the comments for the transport
> -- 
> 2.25.1
Jason Wang June 22, 2022, 7:09 a.m. UTC | #2
On Wed, Jun 22, 2022 at 3:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > harden vring IRQ"). It works with the assumption that the driver or
> > core can properly call virtio_device_ready() at the right
> > place. Unfortunately, this seems to be not true and uncover various
> > bugs of the existing drivers, mainly the issue of using
> > virtio_device_ready() incorrectly.
> >
> > So let's having a Kconfig option and disable it by default. It gives
> > us a breath to fix the drivers and then we can consider to enable it
> > by default.
> >
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>
>
> OK I will queue, but I think the problem is fundamental.

If I understand correctly, you want some core IRQ work? As discussed
before, it doesn't solve all the problems, we still need to do per
driver audit.

Thanks

>
>
> > ---
> > Changes since V2:
> > - Tweak the Kconfig help
> > - Add comment for the read_lock() pairing in virtio_ccw
> > ---
> >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> >  drivers/virtio/Kconfig           | 13 +++++++++++++
> >  drivers/virtio/virtio.c          |  2 ++
> >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> >  include/linux/virtio_config.h    |  2 ++
> >  5 files changed, 37 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > index 97e51c34e6cf..1f6a358f65f0 100644
> > --- a/drivers/s390/virtio/virtio_ccw.c
> > +++ b/drivers/s390/virtio/virtio_ccw.c
> > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> >                       vcdev->err = -EIO;
> >       }
> >       virtio_ccw_check_activity(vcdev, activity);
> > -     /* Interrupts are disabled here */
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > +     /*
> > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > +      * disabled here.
> > +      */
> >       read_lock(&vcdev->irq_lock);
> > +#endif
> >       for_each_set_bit(i, indicators(vcdev),
> >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> >               /* The bit clear must happen before the vring kick. */
> > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> >               vring_interrupt(0, vq);
> >       }
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >       read_unlock(&vcdev->irq_lock);
> > +#endif
> >       if (test_bit(0, indicators2(vcdev))) {
> >               virtio_config_changed(&vcdev->vdev);
> >               clear_bit(0, indicators2(vcdev));
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index b5adf6abd241..c04f370a1e5c 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> >
> >  if VIRTIO_MENU
> >
> > +config VIRTIO_HARDEN_NOTIFICATION
> > +        bool "Harden virtio notification"
> > +        help
> > +          Enable this to harden the device notifications and suppress
> > +          those that happen at a time where notifications are illegal.
> > +
> > +          Experimental: Note that several drivers still have bugs that
> > +          may cause crashes or hangs when correct handling of
> > +          notifications is enforced; depending on the subset of
> > +          drivers and devices you use, this may or may not work.
> > +
> > +          If unsure, say N.
> > +
> >  config VIRTIO_PCI
> >       tristate "PCI driver for virtio devices"
> >       depends on PCI
> > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > index ef04a96942bf..21dc08d2f32d 100644
> > --- a/drivers/virtio/virtio.c
> > +++ b/drivers/virtio/virtio.c
> > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> >   * */
> >  void virtio_reset_device(struct virtio_device *dev)
> >  {
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >       /*
> >        * The below virtio_synchronize_cbs() guarantees that any
> >        * interrupt for this line arriving after
> > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> >        */
> >       virtio_break_device(dev);
> >       virtio_synchronize_cbs(dev);
> > +#endif
> >
> >       dev->config->reset(dev);
> >  }
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 13a7348cedff..d9d3b6e201fb 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> >       vq->we_own_ring = true;
> >       vq->notify = notify;
> >       vq->weak_barriers = weak_barriers;
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >       vq->broken = true;
> > +#else
> > +     vq->broken = false;
> > +#endif
> >       vq->last_used_idx = 0;
> >       vq->event_triggered = false;
> >       vq->num_added = 0;
> > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> >       }
> >
> >       if (unlikely(vq->broken)) {
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >               dev_warn_once(&vq->vq.vdev->dev,
> >                             "virtio vring IRQ raised before DRIVER_OK");
> >               return IRQ_NONE;
> > +#else
> > +             return IRQ_HANDLED;
> > +#endif
> >       }
> >
> >       /* Just a hint for performance: so it's ok that this can be racy! */
> > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >       vq->we_own_ring = false;
> >       vq->notify = notify;
> >       vq->weak_barriers = weak_barriers;
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >       vq->broken = true;
> > +#else
> > +     vq->broken = false;
> > +#endif
> >       vq->last_used_idx = 0;
> >       vq->event_triggered = false;
> >       vq->num_added = 0;
> > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > index 9a36051ceb76..d15c3cdda2d2 100644
> > --- a/include/linux/virtio_config.h
> > +++ b/include/linux/virtio_config.h
> > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> >
> >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> >
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >       /*
> >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> >        * will see the driver specific setup if it sees vq->broken
> > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> >        */
> >       virtio_synchronize_cbs(dev);
> >       __virtio_unbreak_device(dev);
> > +#endif
> >       /*
> >        * The transport should ensure the visibility of vq->broken
> >        * before setting DRIVER_OK. See the comments for the transport
> > --
> > 2.25.1
>
Cornelia Huck June 22, 2022, 8:32 a.m. UTC | #3
On Wed, Jun 22 2022, Jason Wang <jasowang@redhat.com> wrote:

> We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> harden vring IRQ"). It works with the assumption that the driver or
> core can properly call virtio_device_ready() at the right
> place. Unfortunately, this seems to be not true and uncover various
> bugs of the existing drivers, mainly the issue of using
> virtio_device_ready() incorrectly.
>
> So let's having a Kconfig option and disable it by default. It gives

s/having/have/

> us a breath to fix the drivers and then we can consider to enable it
> by default.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes since V2:
> - Tweak the Kconfig help
> - Add comment for the read_lock() pairing in virtio_ccw
> ---
>  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
>  drivers/virtio/Kconfig           | 13 +++++++++++++
>  drivers/virtio/virtio.c          |  2 ++
>  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
>  include/linux/virtio_config.h    |  2 ++
>  5 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> index 97e51c34e6cf..1f6a358f65f0 100644
> --- a/drivers/s390/virtio/virtio_ccw.c
> +++ b/drivers/s390/virtio/virtio_ccw.c
> @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
>  			vcdev->err = -EIO;
>  	}
>  	virtio_ccw_check_activity(vcdev, activity);
> -	/* Interrupts are disabled here */
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> +	/*
> +	 * Paried with virtio_ccw_synchronize_cbs() and interrupts are

s/Paried/Paired/

> +	 * disabled here.
> +	 */
>  	read_lock(&vcdev->irq_lock);
> +#endif
>  	for_each_set_bit(i, indicators(vcdev),
>  			 sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
>  		/* The bit clear must happen before the vring kick. */
> @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
>  		vq = virtio_ccw_vq_by_ind(vcdev, i);
>  		vring_interrupt(0, vq);
>  	}
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	read_unlock(&vcdev->irq_lock);
> +#endif
>  	if (test_bit(0, indicators2(vcdev))) {
>  		virtio_config_changed(&vcdev->vdev);
>  		clear_bit(0, indicators2(vcdev));


Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Michael S. Tsirkin June 24, 2022, 6:31 a.m. UTC | #4
On Wed, Jun 22, 2022 at 03:09:31PM +0800, Jason Wang wrote:
> On Wed, Jun 22, 2022 at 3:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> > > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > > harden vring IRQ"). It works with the assumption that the driver or
> > > core can properly call virtio_device_ready() at the right
> > > place. Unfortunately, this seems to be not true and uncover various
> > > bugs of the existing drivers, mainly the issue of using
> > > virtio_device_ready() incorrectly.
> > >
> > > So let's having a Kconfig option and disable it by default. It gives
> > > us a breath to fix the drivers and then we can consider to enable it
> > > by default.
> > >
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> >
> >
> > OK I will queue, but I think the problem is fundamental.
> 
> If I understand correctly, you want some core IRQ work?

Yes.

> As discussed
> before, it doesn't solve all the problems, we still need to do per
> driver audit.
> 
> Thanks

Maybe, but we don't need to tie things to device_ready then.
We can do

- disable irqs
- device ready
- setup everything
- enable irqs


and this works for most things, the only issue is
this deadlocks if "setup everything" waits for interrupts.


With the current approach there's really no good time:
1.- setup everything
- device ready

can cause kicks before device is ready

2.- device ready
- setup everything

can cause callbacks before setup.

So I prefer the 1. and fix the hardening in the core.


> >
> >
> > > ---
> > > Changes since V2:
> > > - Tweak the Kconfig help
> > > - Add comment for the read_lock() pairing in virtio_ccw
> > > ---
> > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > >  drivers/virtio/virtio.c          |  2 ++
> > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > >  include/linux/virtio_config.h    |  2 ++
> > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > >                       vcdev->err = -EIO;
> > >       }
> > >       virtio_ccw_check_activity(vcdev, activity);
> > > -     /* Interrupts are disabled here */
> > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > +     /*
> > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > +      * disabled here.
> > > +      */
> > >       read_lock(&vcdev->irq_lock);
> > > +#endif
> > >       for_each_set_bit(i, indicators(vcdev),
> > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > >               /* The bit clear must happen before the vring kick. */
> > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > >               vring_interrupt(0, vq);
> > >       }
> > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > >       read_unlock(&vcdev->irq_lock);
> > > +#endif
> > >       if (test_bit(0, indicators2(vcdev))) {
> > >               virtio_config_changed(&vcdev->vdev);
> > >               clear_bit(0, indicators2(vcdev));
> > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > index b5adf6abd241..c04f370a1e5c 100644
> > > --- a/drivers/virtio/Kconfig
> > > +++ b/drivers/virtio/Kconfig
> > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > >
> > >  if VIRTIO_MENU
> > >
> > > +config VIRTIO_HARDEN_NOTIFICATION
> > > +        bool "Harden virtio notification"
> > > +        help
> > > +          Enable this to harden the device notifications and suppress
> > > +          those that happen at a time where notifications are illegal.
> > > +
> > > +          Experimental: Note that several drivers still have bugs that
> > > +          may cause crashes or hangs when correct handling of
> > > +          notifications is enforced; depending on the subset of
> > > +          drivers and devices you use, this may or may not work.
> > > +
> > > +          If unsure, say N.
> > > +
> > >  config VIRTIO_PCI
> > >       tristate "PCI driver for virtio devices"
> > >       depends on PCI
> > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > index ef04a96942bf..21dc08d2f32d 100644
> > > --- a/drivers/virtio/virtio.c
> > > +++ b/drivers/virtio/virtio.c
> > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > >   * */
> > >  void virtio_reset_device(struct virtio_device *dev)
> > >  {
> > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > >       /*
> > >        * The below virtio_synchronize_cbs() guarantees that any
> > >        * interrupt for this line arriving after
> > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > >        */
> > >       virtio_break_device(dev);
> > >       virtio_synchronize_cbs(dev);
> > > +#endif
> > >
> > >       dev->config->reset(dev);
> > >  }
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index 13a7348cedff..d9d3b6e201fb 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > >       vq->we_own_ring = true;
> > >       vq->notify = notify;
> > >       vq->weak_barriers = weak_barriers;
> > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > >       vq->broken = true;
> > > +#else
> > > +     vq->broken = false;
> > > +#endif
> > >       vq->last_used_idx = 0;
> > >       vq->event_triggered = false;
> > >       vq->num_added = 0;
> > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > >       }
> > >
> > >       if (unlikely(vq->broken)) {
> > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > >               dev_warn_once(&vq->vq.vdev->dev,
> > >                             "virtio vring IRQ raised before DRIVER_OK");
> > >               return IRQ_NONE;
> > > +#else
> > > +             return IRQ_HANDLED;
> > > +#endif
> > >       }
> > >
> > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > >       vq->we_own_ring = false;
> > >       vq->notify = notify;
> > >       vq->weak_barriers = weak_barriers;
> > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > >       vq->broken = true;
> > > +#else
> > > +     vq->broken = false;
> > > +#endif
> > >       vq->last_used_idx = 0;
> > >       vq->event_triggered = false;
> > >       vq->num_added = 0;
> > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > --- a/include/linux/virtio_config.h
> > > +++ b/include/linux/virtio_config.h
> > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > >
> > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > >
> > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > >       /*
> > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > >        * will see the driver specific setup if it sees vq->broken
> > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > >        */
> > >       virtio_synchronize_cbs(dev);
> > >       __virtio_unbreak_device(dev);
> > > +#endif
> > >       /*
> > >        * The transport should ensure the visibility of vq->broken
> > >        * before setting DRIVER_OK. See the comments for the transport
> > > --
> > > 2.25.1
> >
Xuan Zhuo June 24, 2022, 8:41 a.m. UTC | #5
On Wed, 22 Jun 2022 09:29:40 +0800, Jason Wang <jasowang@redhat.com> wrote:
> We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> harden vring IRQ"). It works with the assumption that the driver or
> core can properly call virtio_device_ready() at the right
> place. Unfortunately, this seems to be not true and uncover various
> bugs of the existing drivers, mainly the issue of using
> virtio_device_ready() incorrectly.
>
> So let's having a Kconfig option and disable it by default. It gives
> us a breath to fix the drivers and then we can consider to enable it
> by default.


hi, I have a question.

What should we do when the queue is reset and CONFIG_VIRTIO_HARDEN_NOTIFICATION
is off?

Since disable_irq() cannot be used, we should trust the device not to send irqs
again. So we don't have to do anything?

Thanks.

>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes since V2:
> - Tweak the Kconfig help
> - Add comment for the read_lock() pairing in virtio_ccw
> ---
>  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
>  drivers/virtio/Kconfig           | 13 +++++++++++++
>  drivers/virtio/virtio.c          |  2 ++
>  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
>  include/linux/virtio_config.h    |  2 ++
>  5 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> index 97e51c34e6cf..1f6a358f65f0 100644
> --- a/drivers/s390/virtio/virtio_ccw.c
> +++ b/drivers/s390/virtio/virtio_ccw.c
> @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
>  			vcdev->err = -EIO;
>  	}
>  	virtio_ccw_check_activity(vcdev, activity);
> -	/* Interrupts are disabled here */
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> +	/*
> +	 * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> +	 * disabled here.
> +	 */
>  	read_lock(&vcdev->irq_lock);
> +#endif
>  	for_each_set_bit(i, indicators(vcdev),
>  			 sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
>  		/* The bit clear must happen before the vring kick. */
> @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
>  		vq = virtio_ccw_vq_by_ind(vcdev, i);
>  		vring_interrupt(0, vq);
>  	}
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	read_unlock(&vcdev->irq_lock);
> +#endif
>  	if (test_bit(0, indicators2(vcdev))) {
>  		virtio_config_changed(&vcdev->vdev);
>  		clear_bit(0, indicators2(vcdev));
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index b5adf6abd241..c04f370a1e5c 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
>
>  if VIRTIO_MENU
>
> +config VIRTIO_HARDEN_NOTIFICATION
> +        bool "Harden virtio notification"
> +        help
> +          Enable this to harden the device notifications and suppress
> +          those that happen at a time where notifications are illegal.
> +
> +          Experimental: Note that several drivers still have bugs that
> +          may cause crashes or hangs when correct handling of
> +          notifications is enforced; depending on the subset of
> +          drivers and devices you use, this may or may not work.
> +
> +          If unsure, say N.
> +
>  config VIRTIO_PCI
>  	tristate "PCI driver for virtio devices"
>  	depends on PCI
> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> index ef04a96942bf..21dc08d2f32d 100644
> --- a/drivers/virtio/virtio.c
> +++ b/drivers/virtio/virtio.c
> @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
>   * */
>  void virtio_reset_device(struct virtio_device *dev)
>  {
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	/*
>  	 * The below virtio_synchronize_cbs() guarantees that any
>  	 * interrupt for this line arriving after
> @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
>  	 */
>  	virtio_break_device(dev);
>  	virtio_synchronize_cbs(dev);
> +#endif
>
>  	dev->config->reset(dev);
>  }
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 13a7348cedff..d9d3b6e201fb 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
>  	vq->we_own_ring = true;
>  	vq->notify = notify;
>  	vq->weak_barriers = weak_barriers;
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	vq->broken = true;
> +#else
> +	vq->broken = false;
> +#endif
>  	vq->last_used_idx = 0;
>  	vq->event_triggered = false;
>  	vq->num_added = 0;
> @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>  	}
>
>  	if (unlikely(vq->broken)) {
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  		dev_warn_once(&vq->vq.vdev->dev,
>  			      "virtio vring IRQ raised before DRIVER_OK");
>  		return IRQ_NONE;
> +#else
> +		return IRQ_HANDLED;
> +#endif
>  	}
>
>  	/* Just a hint for performance: so it's ok that this can be racy! */
> @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  	vq->we_own_ring = false;
>  	vq->notify = notify;
>  	vq->weak_barriers = weak_barriers;
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	vq->broken = true;
> +#else
> +	vq->broken = false;
> +#endif
>  	vq->last_used_idx = 0;
>  	vq->event_triggered = false;
>  	vq->num_added = 0;
> diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> index 9a36051ceb76..d15c3cdda2d2 100644
> --- a/include/linux/virtio_config.h
> +++ b/include/linux/virtio_config.h
> @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
>
>  	WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
>
> +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
>  	/*
>  	 * The virtio_synchronize_cbs() makes sure vring_interrupt()
>  	 * will see the driver specific setup if it sees vq->broken
> @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
>  	 */
>  	virtio_synchronize_cbs(dev);
>  	__virtio_unbreak_device(dev);
> +#endif
>  	/*
>  	 * The transport should ensure the visibility of vq->broken
>  	 * before setting DRIVER_OK. See the comments for the transport
> --
> 2.25.1
>
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Michael S. Tsirkin June 24, 2022, 9:05 a.m. UTC | #6
On Fri, Jun 24, 2022 at 04:41:55PM +0800, Xuan Zhuo wrote:
> On Wed, 22 Jun 2022 09:29:40 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > harden vring IRQ"). It works with the assumption that the driver or
> > core can properly call virtio_device_ready() at the right
> > place. Unfortunately, this seems to be not true and uncover various
> > bugs of the existing drivers, mainly the issue of using
> > virtio_device_ready() incorrectly.
> >
> > So let's having a Kconfig option and disable it by default. It gives
> > us a breath to fix the drivers and then we can consider to enable it
> > by default.
> 
> 
> hi, I have a question.
> 
> What should we do when the queue is reset and CONFIG_VIRTIO_HARDEN_NOTIFICATION
> is off?
> 
> Since disable_irq() cannot be used, we should trust the device not to send irqs
> again. So we don't have to do anything?

yes

> Thanks.
> 
> >
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> > Changes since V2:
> > - Tweak the Kconfig help
> > - Add comment for the read_lock() pairing in virtio_ccw
> > ---
> >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> >  drivers/virtio/Kconfig           | 13 +++++++++++++
> >  drivers/virtio/virtio.c          |  2 ++
> >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> >  include/linux/virtio_config.h    |  2 ++
> >  5 files changed, 37 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > index 97e51c34e6cf..1f6a358f65f0 100644
> > --- a/drivers/s390/virtio/virtio_ccw.c
> > +++ b/drivers/s390/virtio/virtio_ccw.c
> > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> >  			vcdev->err = -EIO;
> >  	}
> >  	virtio_ccw_check_activity(vcdev, activity);
> > -	/* Interrupts are disabled here */
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > +	/*
> > +	 * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > +	 * disabled here.
> > +	 */
> >  	read_lock(&vcdev->irq_lock);
> > +#endif
> >  	for_each_set_bit(i, indicators(vcdev),
> >  			 sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> >  		/* The bit clear must happen before the vring kick. */
> > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> >  		vq = virtio_ccw_vq_by_ind(vcdev, i);
> >  		vring_interrupt(0, vq);
> >  	}
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >  	read_unlock(&vcdev->irq_lock);
> > +#endif
> >  	if (test_bit(0, indicators2(vcdev))) {
> >  		virtio_config_changed(&vcdev->vdev);
> >  		clear_bit(0, indicators2(vcdev));
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index b5adf6abd241..c04f370a1e5c 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> >
> >  if VIRTIO_MENU
> >
> > +config VIRTIO_HARDEN_NOTIFICATION
> > +        bool "Harden virtio notification"
> > +        help
> > +          Enable this to harden the device notifications and suppress
> > +          those that happen at a time where notifications are illegal.
> > +
> > +          Experimental: Note that several drivers still have bugs that
> > +          may cause crashes or hangs when correct handling of
> > +          notifications is enforced; depending on the subset of
> > +          drivers and devices you use, this may or may not work.
> > +
> > +          If unsure, say N.
> > +
> >  config VIRTIO_PCI
> >  	tristate "PCI driver for virtio devices"
> >  	depends on PCI
> > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > index ef04a96942bf..21dc08d2f32d 100644
> > --- a/drivers/virtio/virtio.c
> > +++ b/drivers/virtio/virtio.c
> > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> >   * */
> >  void virtio_reset_device(struct virtio_device *dev)
> >  {
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >  	/*
> >  	 * The below virtio_synchronize_cbs() guarantees that any
> >  	 * interrupt for this line arriving after
> > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> >  	 */
> >  	virtio_break_device(dev);
> >  	virtio_synchronize_cbs(dev);
> > +#endif
> >
> >  	dev->config->reset(dev);
> >  }
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 13a7348cedff..d9d3b6e201fb 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> >  	vq->we_own_ring = true;
> >  	vq->notify = notify;
> >  	vq->weak_barriers = weak_barriers;
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >  	vq->broken = true;
> > +#else
> > +	vq->broken = false;
> > +#endif
> >  	vq->last_used_idx = 0;
> >  	vq->event_triggered = false;
> >  	vq->num_added = 0;
> > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> >  	}
> >
> >  	if (unlikely(vq->broken)) {
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >  		dev_warn_once(&vq->vq.vdev->dev,
> >  			      "virtio vring IRQ raised before DRIVER_OK");
> >  		return IRQ_NONE;
> > +#else
> > +		return IRQ_HANDLED;
> > +#endif
> >  	}
> >
> >  	/* Just a hint for performance: so it's ok that this can be racy! */
> > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >  	vq->we_own_ring = false;
> >  	vq->notify = notify;
> >  	vq->weak_barriers = weak_barriers;
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >  	vq->broken = true;
> > +#else
> > +	vq->broken = false;
> > +#endif
> >  	vq->last_used_idx = 0;
> >  	vq->event_triggered = false;
> >  	vq->num_added = 0;
> > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > index 9a36051ceb76..d15c3cdda2d2 100644
> > --- a/include/linux/virtio_config.h
> > +++ b/include/linux/virtio_config.h
> > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> >
> >  	WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> >
> > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> >  	/*
> >  	 * The virtio_synchronize_cbs() makes sure vring_interrupt()
> >  	 * will see the driver specific setup if it sees vq->broken
> > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> >  	 */
> >  	virtio_synchronize_cbs(dev);
> >  	__virtio_unbreak_device(dev);
> > +#endif
> >  	/*
> >  	 * The transport should ensure the visibility of vq->broken
> >  	 * before setting DRIVER_OK. See the comments for the transport
> > --
> > 2.25.1
> >
> > _______________________________________________
> > Virtualization mailing list
> > Virtualization@lists.linux-foundation.org
> > https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Jason Wang June 27, 2022, 2:50 a.m. UTC | #7
On Fri, Jun 24, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 22, 2022 at 03:09:31PM +0800, Jason Wang wrote:
> > On Wed, Jun 22, 2022 at 3:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> > > > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > > > harden vring IRQ"). It works with the assumption that the driver or
> > > > core can properly call virtio_device_ready() at the right
> > > > place. Unfortunately, this seems to be not true and uncover various
> > > > bugs of the existing drivers, mainly the issue of using
> > > > virtio_device_ready() incorrectly.
> > > >
> > > > So let's having a Kconfig option and disable it by default. It gives
> > > > us a breath to fix the drivers and then we can consider to enable it
> > > > by default.
> > > >
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >
> > >
> > > OK I will queue, but I think the problem is fundamental.
> >
> > If I understand correctly, you want some core IRQ work?
>
> Yes.
>
> > As discussed
> > before, it doesn't solve all the problems, we still need to do per
> > driver audit.
> >
> > Thanks
>
> Maybe, but we don't need to tie things to device_ready then.
> We can do
>
> - disable irqs
> - device ready
> - setup everything
> - enable irqs
>
>
> and this works for most things, the only issue is
> this deadlocks if "setup everything" waits for interrupts.
>
>
> With the current approach there's really no good time:
> 1.- setup everything
> - device ready
>
> can cause kicks before device is ready
>
> 2.- device ready
> - setup everything
>
> can cause callbacks before setup.
>
> So I prefer the 1. and fix the hardening in the core.

So my question is:

1) do similar hardening like config interrupt
or
2) per transport notification work (e.g for PCI core IRQ work)

1) seems easier and universal, but we pay little overhead which could
be eliminated by the config option.
2) seems require more work in the IRQ core and it can not work for all
transports (e.g vDPA would be kind of difficult)

Thanks

>
>
> > >
> > >
> > > > ---
> > > > Changes since V2:
> > > > - Tweak the Kconfig help
> > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > ---
> > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > >  drivers/virtio/virtio.c          |  2 ++
> > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > >  include/linux/virtio_config.h    |  2 ++
> > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > >                       vcdev->err = -EIO;
> > > >       }
> > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > -     /* Interrupts are disabled here */
> > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > +     /*
> > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > +      * disabled here.
> > > > +      */
> > > >       read_lock(&vcdev->irq_lock);
> > > > +#endif
> > > >       for_each_set_bit(i, indicators(vcdev),
> > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > >               /* The bit clear must happen before the vring kick. */
> > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > >               vring_interrupt(0, vq);
> > > >       }
> > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > >       read_unlock(&vcdev->irq_lock);
> > > > +#endif
> > > >       if (test_bit(0, indicators2(vcdev))) {
> > > >               virtio_config_changed(&vcdev->vdev);
> > > >               clear_bit(0, indicators2(vcdev));
> > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > --- a/drivers/virtio/Kconfig
> > > > +++ b/drivers/virtio/Kconfig
> > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > >
> > > >  if VIRTIO_MENU
> > > >
> > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > +        bool "Harden virtio notification"
> > > > +        help
> > > > +          Enable this to harden the device notifications and suppress
> > > > +          those that happen at a time where notifications are illegal.
> > > > +
> > > > +          Experimental: Note that several drivers still have bugs that
> > > > +          may cause crashes or hangs when correct handling of
> > > > +          notifications is enforced; depending on the subset of
> > > > +          drivers and devices you use, this may or may not work.
> > > > +
> > > > +          If unsure, say N.
> > > > +
> > > >  config VIRTIO_PCI
> > > >       tristate "PCI driver for virtio devices"
> > > >       depends on PCI
> > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > --- a/drivers/virtio/virtio.c
> > > > +++ b/drivers/virtio/virtio.c
> > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > >   * */
> > > >  void virtio_reset_device(struct virtio_device *dev)
> > > >  {
> > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > >       /*
> > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > >        * interrupt for this line arriving after
> > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > >        */
> > > >       virtio_break_device(dev);
> > > >       virtio_synchronize_cbs(dev);
> > > > +#endif
> > > >
> > > >       dev->config->reset(dev);
> > > >  }
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > >       vq->we_own_ring = true;
> > > >       vq->notify = notify;
> > > >       vq->weak_barriers = weak_barriers;
> > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > >       vq->broken = true;
> > > > +#else
> > > > +     vq->broken = false;
> > > > +#endif
> > > >       vq->last_used_idx = 0;
> > > >       vq->event_triggered = false;
> > > >       vq->num_added = 0;
> > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > >       }
> > > >
> > > >       if (unlikely(vq->broken)) {
> > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > >               return IRQ_NONE;
> > > > +#else
> > > > +             return IRQ_HANDLED;
> > > > +#endif
> > > >       }
> > > >
> > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > >       vq->we_own_ring = false;
> > > >       vq->notify = notify;
> > > >       vq->weak_barriers = weak_barriers;
> > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > >       vq->broken = true;
> > > > +#else
> > > > +     vq->broken = false;
> > > > +#endif
> > > >       vq->last_used_idx = 0;
> > > >       vq->event_triggered = false;
> > > >       vq->num_added = 0;
> > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > --- a/include/linux/virtio_config.h
> > > > +++ b/include/linux/virtio_config.h
> > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > >
> > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > >
> > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > >       /*
> > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > >        * will see the driver specific setup if it sees vq->broken
> > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > >        */
> > > >       virtio_synchronize_cbs(dev);
> > > >       __virtio_unbreak_device(dev);
> > > > +#endif
> > > >       /*
> > > >        * The transport should ensure the visibility of vq->broken
> > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > --
> > > > 2.25.1
> > >
>
Michael S. Tsirkin June 27, 2022, 7:33 a.m. UTC | #8
On Mon, Jun 27, 2022 at 10:50:17AM +0800, Jason Wang wrote:
> On Fri, Jun 24, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jun 22, 2022 at 03:09:31PM +0800, Jason Wang wrote:
> > > On Wed, Jun 22, 2022 at 3:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> > > > > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > > > > harden vring IRQ"). It works with the assumption that the driver or
> > > > > core can properly call virtio_device_ready() at the right
> > > > > place. Unfortunately, this seems to be not true and uncover various
> > > > > bugs of the existing drivers, mainly the issue of using
> > > > > virtio_device_ready() incorrectly.
> > > > >
> > > > > So let's having a Kconfig option and disable it by default. It gives
> > > > > us a breath to fix the drivers and then we can consider to enable it
> > > > > by default.
> > > > >
> > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > >
> > > >
> > > > OK I will queue, but I think the problem is fundamental.
> > >
> > > If I understand correctly, you want some core IRQ work?
> >
> > Yes.
> >
> > > As discussed
> > > before, it doesn't solve all the problems, we still need to do per
> > > driver audit.
> > >
> > > Thanks
> >
> > Maybe, but we don't need to tie things to device_ready then.
> > We can do
> >
> > - disable irqs
> > - device ready
> > - setup everything
> > - enable irqs
> >
> >
> > and this works for most things, the only issue is
> > this deadlocks if "setup everything" waits for interrupts.
> >
> >
> > With the current approach there's really no good time:
> > 1.- setup everything
> > - device ready
> >
> > can cause kicks before device is ready
> >
> > 2.- device ready
> > - setup everything
> >
> > can cause callbacks before setup.
> >
> > So I prefer the 1. and fix the hardening in the core.
> 
> So my question is:
> 
> 1) do similar hardening like config interrupt
> or
> 2) per transport notification work (e.g for PCI core IRQ work)
> 
> 1) seems easier and universal, but we pay little overhead which could
> be eliminated by the config option.

I doubt 1 is easy and I am not even sure core IRQ changes will help.
My concern with adding overhead is that I'm not sure these are not just
wasted CPU cycles.  We spent a bunch of time on irq hardening and so far
we are still at the "all drivers need to be fixed" stage.

The reason config was kind of easy is that config interrupt is rarely
vital for device function so arbitrarily deferring that does not lead to
deadlocks - what you are trying to do with VQ interrupts is
fundamentally different. Things are especially bad if we just drop
an interrupt but deferring can lead to problems too.

Consider as an example
    virtio-net: fix race between ndo_open() and virtio_device_ready()
if you just defer vq interrupts you get deadlocks.



So, thinking about all this, how about a simple per vq flag meaning
"this vq was kicked since reset"?

If driver does not kick then it's not ready to get callbacks, right?

Sounds quite clean, but we need to think through memory ordering
concerns - I guess it's only when we change the value so
	if (!vq->kicked) {
		vq->kicked = true;
		mb();
	}

will do the trick, right?

need to think about the reset path - it already synchronizes callbacks
and already can lose interrupts so we just need to clear vq->kicked
before that, right?


> 2) seems require more work in the IRQ core and it can not work for all
> transports (e.g vDPA would be kind of difficult)
> 
> Thanks

Hmm I don't really get why would it be difficult.
VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
have interrupt masking support.


> >
> >
> > > >
> > > >
> > > > > ---
> > > > > Changes since V2:
> > > > > - Tweak the Kconfig help
> > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > ---
> > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > >  include/linux/virtio_config.h    |  2 ++
> > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > >                       vcdev->err = -EIO;
> > > > >       }
> > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > -     /* Interrupts are disabled here */
> > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > +     /*
> > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > +      * disabled here.
> > > > > +      */
> > > > >       read_lock(&vcdev->irq_lock);
> > > > > +#endif
> > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > >               /* The bit clear must happen before the vring kick. */
> > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > >               vring_interrupt(0, vq);
> > > > >       }
> > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > >       read_unlock(&vcdev->irq_lock);
> > > > > +#endif
> > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > >               virtio_config_changed(&vcdev->vdev);
> > > > >               clear_bit(0, indicators2(vcdev));
> > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > --- a/drivers/virtio/Kconfig
> > > > > +++ b/drivers/virtio/Kconfig
> > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > >
> > > > >  if VIRTIO_MENU
> > > > >
> > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > +        bool "Harden virtio notification"
> > > > > +        help
> > > > > +          Enable this to harden the device notifications and suppress
> > > > > +          those that happen at a time where notifications are illegal.
> > > > > +
> > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > +          may cause crashes or hangs when correct handling of
> > > > > +          notifications is enforced; depending on the subset of
> > > > > +          drivers and devices you use, this may or may not work.
> > > > > +
> > > > > +          If unsure, say N.
> > > > > +
> > > > >  config VIRTIO_PCI
> > > > >       tristate "PCI driver for virtio devices"
> > > > >       depends on PCI
> > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > --- a/drivers/virtio/virtio.c
> > > > > +++ b/drivers/virtio/virtio.c
> > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > >   * */
> > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > >  {
> > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > >       /*
> > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > >        * interrupt for this line arriving after
> > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > >        */
> > > > >       virtio_break_device(dev);
> > > > >       virtio_synchronize_cbs(dev);
> > > > > +#endif
> > > > >
> > > > >       dev->config->reset(dev);
> > > > >  }
> > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > >       vq->we_own_ring = true;
> > > > >       vq->notify = notify;
> > > > >       vq->weak_barriers = weak_barriers;
> > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > >       vq->broken = true;
> > > > > +#else
> > > > > +     vq->broken = false;
> > > > > +#endif
> > > > >       vq->last_used_idx = 0;
> > > > >       vq->event_triggered = false;
> > > > >       vq->num_added = 0;
> > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > >       }
> > > > >
> > > > >       if (unlikely(vq->broken)) {
> > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > >               return IRQ_NONE;
> > > > > +#else
> > > > > +             return IRQ_HANDLED;
> > > > > +#endif
> > > > >       }
> > > > >
> > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > >       vq->we_own_ring = false;
> > > > >       vq->notify = notify;
> > > > >       vq->weak_barriers = weak_barriers;
> > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > >       vq->broken = true;
> > > > > +#else
> > > > > +     vq->broken = false;
> > > > > +#endif
> > > > >       vq->last_used_idx = 0;
> > > > >       vq->event_triggered = false;
> > > > >       vq->num_added = 0;
> > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > --- a/include/linux/virtio_config.h
> > > > > +++ b/include/linux/virtio_config.h
> > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > >
> > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > >
> > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > >       /*
> > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > >        */
> > > > >       virtio_synchronize_cbs(dev);
> > > > >       __virtio_unbreak_device(dev);
> > > > > +#endif
> > > > >       /*
> > > > >        * The transport should ensure the visibility of vq->broken
> > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > --
> > > > > 2.25.1
> > > >
> >
Jason Wang June 27, 2022, 8:11 a.m. UTC | #9
On Mon, Jun 27, 2022 at 3:33 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Jun 27, 2022 at 10:50:17AM +0800, Jason Wang wrote:
> > On Fri, Jun 24, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Jun 22, 2022 at 03:09:31PM +0800, Jason Wang wrote:
> > > > On Wed, Jun 22, 2022 at 3:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> > > > > > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > > > > > harden vring IRQ"). It works with the assumption that the driver or
> > > > > > core can properly call virtio_device_ready() at the right
> > > > > > place. Unfortunately, this seems to be not true and uncover various
> > > > > > bugs of the existing drivers, mainly the issue of using
> > > > > > virtio_device_ready() incorrectly.
> > > > > >
> > > > > > So let's having a Kconfig option and disable it by default. It gives
> > > > > > us a breath to fix the drivers and then we can consider to enable it
> > > > > > by default.
> > > > > >
> > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > >
> > > > >
> > > > > OK I will queue, but I think the problem is fundamental.
> > > >
> > > > If I understand correctly, you want some core IRQ work?
> > >
> > > Yes.
> > >
> > > > As discussed
> > > > before, it doesn't solve all the problems, we still need to do per
> > > > driver audit.
> > > >
> > > > Thanks
> > >
> > > Maybe, but we don't need to tie things to device_ready then.
> > > We can do
> > >
> > > - disable irqs
> > > - device ready
> > > - setup everything
> > > - enable irqs
> > >
> > >
> > > and this works for most things, the only issue is
> > > this deadlocks if "setup everything" waits for interrupts.
> > >
> > >
> > > With the current approach there's really no good time:
> > > 1.- setup everything
> > > - device ready
> > >
> > > can cause kicks before device is ready
> > >
> > > 2.- device ready
> > > - setup everything
> > >
> > > can cause callbacks before setup.
> > >
> > > So I prefer the 1. and fix the hardening in the core.
> >
> > So my question is:
> >
> > 1) do similar hardening like config interrupt
> > or
> > 2) per transport notification work (e.g for PCI core IRQ work)
> >
> > 1) seems easier and universal, but we pay little overhead which could
> > be eliminated by the config option.
>
> I doubt 1 is easy and I am not even sure core IRQ changes will help.

Core IRQ won't help in 1), the changes are limited in the virtio.

> My concern with adding overhead is that I'm not sure these are not just
> wasted CPU cycles.  We spent a bunch of time on irq hardening and so far
> we are still at the "all drivers need to be fixed" stage.

It's not the fault of hardening but the drivers. The hardening just
expose those bugs.

>
> The reason config was kind of easy is that config interrupt is rarely
> vital for device function so arbitrarily deferring that does not lead to
> deadlocks - what you are trying to do with VQ interrupts is
> fundamentally different. Things are especially bad if we just drop
> an interrupt but deferring can lead to problems too.

I'm not sure I see the difference, disable_irq() stuffs also delay the
interrupt processing until enable_irq().

>
> Consider as an example
>     virtio-net: fix race between ndo_open() and virtio_device_ready()
> if you just defer vq interrupts you get deadlocks.
>
>

I don't see a deadlock here, maybe you can show more detail on this?

>
> So, thinking about all this, how about a simple per vq flag meaning
> "this vq was kicked since reset"?

And ignore the notification if vq is not kicked? It sounds like the
callback needs to be synchronized with the kick.

>
> If driver does not kick then it's not ready to get callbacks, right?
>
> Sounds quite clean, but we need to think through memory ordering
> concerns - I guess it's only when we change the value so
>         if (!vq->kicked) {
>                 vq->kicked = true;
>                 mb();
>         }
>
> will do the trick, right?

There's no much difference with the existing approach:

1) your proposal implicitly makes callbacks ready in virtqueue_kick()
2) my proposal explicitly makes callbacks ready via virtio_device_ready()

Both require careful auditing of all the existing drivers to make sure
no kick before DRIVER_OK.

>
> need to think about the reset path - it already synchronizes callbacks
> and already can lose interrupts so we just need to clear vq->kicked
> before that, right?

Probably.

>
>
> > 2) seems require more work in the IRQ core and it can not work for all
> > transports (e.g vDPA would be kind of difficult)
> >
> > Thanks
>
> Hmm I don't really get why would it be difficult.
> VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> have interrupt masking support.

Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
the auxiliary bus. And that is the way another vendor will go.

Thanks

>
>
> > >
> > >
> > > > >
> > > > >
> > > > > > ---
> > > > > > Changes since V2:
> > > > > > - Tweak the Kconfig help
> > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > ---
> > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > >                       vcdev->err = -EIO;
> > > > > >       }
> > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > -     /* Interrupts are disabled here */
> > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > +     /*
> > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > +      * disabled here.
> > > > > > +      */
> > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > +#endif
> > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > >               vring_interrupt(0, vq);
> > > > > >       }
> > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > +#endif
> > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > --- a/drivers/virtio/Kconfig
> > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > >
> > > > > >  if VIRTIO_MENU
> > > > > >
> > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > +        bool "Harden virtio notification"
> > > > > > +        help
> > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > +
> > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > +          notifications is enforced; depending on the subset of
> > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > +
> > > > > > +          If unsure, say N.
> > > > > > +
> > > > > >  config VIRTIO_PCI
> > > > > >       tristate "PCI driver for virtio devices"
> > > > > >       depends on PCI
> > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > --- a/drivers/virtio/virtio.c
> > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > >   * */
> > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > >  {
> > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > >       /*
> > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > >        * interrupt for this line arriving after
> > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > >        */
> > > > > >       virtio_break_device(dev);
> > > > > >       virtio_synchronize_cbs(dev);
> > > > > > +#endif
> > > > > >
> > > > > >       dev->config->reset(dev);
> > > > > >  }
> > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > >       vq->we_own_ring = true;
> > > > > >       vq->notify = notify;
> > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > >       vq->broken = true;
> > > > > > +#else
> > > > > > +     vq->broken = false;
> > > > > > +#endif
> > > > > >       vq->last_used_idx = 0;
> > > > > >       vq->event_triggered = false;
> > > > > >       vq->num_added = 0;
> > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > >       }
> > > > > >
> > > > > >       if (unlikely(vq->broken)) {
> > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > >               return IRQ_NONE;
> > > > > > +#else
> > > > > > +             return IRQ_HANDLED;
> > > > > > +#endif
> > > > > >       }
> > > > > >
> > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > >       vq->we_own_ring = false;
> > > > > >       vq->notify = notify;
> > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > >       vq->broken = true;
> > > > > > +#else
> > > > > > +     vq->broken = false;
> > > > > > +#endif
> > > > > >       vq->last_used_idx = 0;
> > > > > >       vq->event_triggered = false;
> > > > > >       vq->num_added = 0;
> > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > --- a/include/linux/virtio_config.h
> > > > > > +++ b/include/linux/virtio_config.h
> > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > >
> > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > >
> > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > >       /*
> > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > >        */
> > > > > >       virtio_synchronize_cbs(dev);
> > > > > >       __virtio_unbreak_device(dev);
> > > > > > +#endif
> > > > > >       /*
> > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > --
> > > > > > 2.25.1
> > > > >
> > >
>
Michael S. Tsirkin June 27, 2022, 9:52 a.m. UTC | #10
On Mon, Jun 27, 2022 at 04:11:18PM +0800, Jason Wang wrote:
> On Mon, Jun 27, 2022 at 3:33 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 10:50:17AM +0800, Jason Wang wrote:
> > > On Fri, Jun 24, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Jun 22, 2022 at 03:09:31PM +0800, Jason Wang wrote:
> > > > > On Wed, Jun 22, 2022 at 3:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> > > > > > > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > > > > > > harden vring IRQ"). It works with the assumption that the driver or
> > > > > > > core can properly call virtio_device_ready() at the right
> > > > > > > place. Unfortunately, this seems to be not true and uncover various
> > > > > > > bugs of the existing drivers, mainly the issue of using
> > > > > > > virtio_device_ready() incorrectly.
> > > > > > >
> > > > > > > So let's having a Kconfig option and disable it by default. It gives
> > > > > > > us a breath to fix the drivers and then we can consider to enable it
> > > > > > > by default.
> > > > > > >
> > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > >
> > > > > >
> > > > > > OK I will queue, but I think the problem is fundamental.
> > > > >
> > > > > If I understand correctly, you want some core IRQ work?
> > > >
> > > > Yes.
> > > >
> > > > > As discussed
> > > > > before, it doesn't solve all the problems, we still need to do per
> > > > > driver audit.
> > > > >
> > > > > Thanks
> > > >
> > > > Maybe, but we don't need to tie things to device_ready then.
> > > > We can do
> > > >
> > > > - disable irqs
> > > > - device ready
> > > > - setup everything
> > > > - enable irqs
> > > >
> > > >
> > > > and this works for most things, the only issue is
> > > > this deadlocks if "setup everything" waits for interrupts.
> > > >
> > > >
> > > > With the current approach there's really no good time:
> > > > 1.- setup everything
> > > > - device ready
> > > >
> > > > can cause kicks before device is ready
> > > >
> > > > 2.- device ready
> > > > - setup everything
> > > >
> > > > can cause callbacks before setup.
> > > >
> > > > So I prefer the 1. and fix the hardening in the core.
> > >
> > > So my question is:
> > >
> > > 1) do similar hardening like config interrupt
> > > or
> > > 2) per transport notification work (e.g for PCI core IRQ work)
> > >
> > > 1) seems easier and universal, but we pay little overhead which could
> > > be eliminated by the config option.
> >
> > I doubt 1 is easy and I am not even sure core IRQ changes will help.
> 
> Core IRQ won't help in 1), the changes are limited in the virtio.
> 
> > My concern with adding overhead is that I'm not sure these are not just
> > wasted CPU cycles.  We spent a bunch of time on irq hardening and so far
> > we are still at the "all drivers need to be fixed" stage.
> 
> It's not the fault of hardening but the drivers. The hardening just
> expose those bugs.

Heh. Yea sure. But things work fine for people. What is the chance
your review found and fixed all driver bugs? After two attempts
I don't feel like hoping audit will fix all bugs.


> >
> > The reason config was kind of easy is that config interrupt is rarely
> > vital for device function so arbitrarily deferring that does not lead to
> > deadlocks - what you are trying to do with VQ interrupts is
> > fundamentally different. Things are especially bad if we just drop
> > an interrupt but deferring can lead to problems too.
> 
> I'm not sure I see the difference, disable_irq() stuffs also delay the
> interrupt processing until enable_irq().


Absolutely. I am not at all sure disable_irq fixes all problems.

> >
> > Consider as an example
> >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > if you just defer vq interrupts you get deadlocks.
> >
> >
> 
> I don't see a deadlock here, maybe you can show more detail on this?

What I mean is this: if we revert the above commit, things still
work (out of spec, but still). If we revert and defer interrupts until
device ready then ndo_open that triggers before device ready deadlocks.


> >
> > So, thinking about all this, how about a simple per vq flag meaning
> > "this vq was kicked since reset"?
> 
> And ignore the notification if vq is not kicked? It sounds like the
> callback needs to be synchronized with the kick.

Note we only need to synchronize it when it changes, which is
only during initialization and reset.


> >
> > If driver does not kick then it's not ready to get callbacks, right?
> >
> > Sounds quite clean, but we need to think through memory ordering
> > concerns - I guess it's only when we change the value so
> >         if (!vq->kicked) {
> >                 vq->kicked = true;
> >                 mb();
> >         }
> >
> > will do the trick, right?
> 
> There's no much difference with the existing approach:
> 
> 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> 
> Both require careful auditing of all the existing drivers to make sure
> no kick before DRIVER_OK.

Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
to hardening and in absence of config interrupts is generally easily
fixed just by sticking virtio_device_ready early in initialization.
With the current approach one has to *also* not do virtio_device_ready
too early - and it's really tricky.

With the proposal I think that we don't need to fix all drivers and
in my eyes that is a huge advantage because frankly I'm fine with
more work on strict spec compliance taking more than expected
but I would like the hardening work to finally be done.
I am not sure the amount of effort expended on the hardening here is
proportionate to the benefit it provides.



> >
> > need to think about the reset path - it already synchronizes callbacks
> > and already can lose interrupts so we just need to clear vq->kicked
> > before that, right?
> 
> Probably.
> 
> >
> >
> > > 2) seems require more work in the IRQ core and it can not work for all
> > > transports (e.g vDPA would be kind of difficult)
> > >
> > > Thanks
> >
> > Hmm I don't really get why would it be difficult.
> > VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> > have interrupt masking support.
> 
> Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
> the auxiliary bus. And that is the way another vendor will go.
> 
> Thanks

A bunch of callbacks will do it I guess.

> >
> >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > ---
> > > > > > > Changes since V2:
> > > > > > > - Tweak the Kconfig help
> > > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > > ---
> > > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > >                       vcdev->err = -EIO;
> > > > > > >       }
> > > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > > -     /* Interrupts are disabled here */
> > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > +     /*
> > > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > > +      * disabled here.
> > > > > > > +      */
> > > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > > +#endif
> > > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > > >               vring_interrupt(0, vq);
> > > > > > >       }
> > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > > +#endif
> > > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > > --- a/drivers/virtio/Kconfig
> > > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > > >
> > > > > > >  if VIRTIO_MENU
> > > > > > >
> > > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > > +        bool "Harden virtio notification"
> > > > > > > +        help
> > > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > > +
> > > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > > +          notifications is enforced; depending on the subset of
> > > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > > +
> > > > > > > +          If unsure, say N.
> > > > > > > +
> > > > > > >  config VIRTIO_PCI
> > > > > > >       tristate "PCI driver for virtio devices"
> > > > > > >       depends on PCI
> > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > >   * */
> > > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > > >  {
> > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > >       /*
> > > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > > >        * interrupt for this line arriving after
> > > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > > >        */
> > > > > > >       virtio_break_device(dev);
> > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > +#endif
> > > > > > >
> > > > > > >       dev->config->reset(dev);
> > > > > > >  }
> > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > > >       vq->we_own_ring = true;
> > > > > > >       vq->notify = notify;
> > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > >       vq->broken = true;
> > > > > > > +#else
> > > > > > > +     vq->broken = false;
> > > > > > > +#endif
> > > > > > >       vq->last_used_idx = 0;
> > > > > > >       vq->event_triggered = false;
> > > > > > >       vq->num_added = 0;
> > > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > >       }
> > > > > > >
> > > > > > >       if (unlikely(vq->broken)) {
> > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > > >               return IRQ_NONE;
> > > > > > > +#else
> > > > > > > +             return IRQ_HANDLED;
> > > > > > > +#endif
> > > > > > >       }
> > > > > > >
> > > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > > >       vq->we_own_ring = false;
> > > > > > >       vq->notify = notify;
> > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > >       vq->broken = true;
> > > > > > > +#else
> > > > > > > +     vq->broken = false;
> > > > > > > +#endif
> > > > > > >       vq->last_used_idx = 0;
> > > > > > >       vq->event_triggered = false;
> > > > > > >       vq->num_added = 0;
> > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > >
> > > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > >
> > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > >       /*
> > > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > >        */
> > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > >       __virtio_unbreak_device(dev);
> > > > > > > +#endif
> > > > > > >       /*
> > > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > > --
> > > > > > > 2.25.1
> > > > > >
> > > >
> >
Jason Wang June 28, 2022, 3:49 a.m. UTC | #11
On Mon, Jun 27, 2022 at 5:52 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Jun 27, 2022 at 04:11:18PM +0800, Jason Wang wrote:
> > On Mon, Jun 27, 2022 at 3:33 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 10:50:17AM +0800, Jason Wang wrote:
> > > > On Fri, Jun 24, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Jun 22, 2022 at 03:09:31PM +0800, Jason Wang wrote:
> > > > > > On Wed, Jun 22, 2022 at 3:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Wed, Jun 22, 2022 at 09:29:40AM +0800, Jason Wang wrote:
> > > > > > > > We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio:
> > > > > > > > harden vring IRQ"). It works with the assumption that the driver or
> > > > > > > > core can properly call virtio_device_ready() at the right
> > > > > > > > place. Unfortunately, this seems to be not true and uncover various
> > > > > > > > bugs of the existing drivers, mainly the issue of using
> > > > > > > > virtio_device_ready() incorrectly.
> > > > > > > >
> > > > > > > > So let's having a Kconfig option and disable it by default. It gives
> > > > > > > > us a breath to fix the drivers and then we can consider to enable it
> > > > > > > > by default.
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > >
> > > > > > >
> > > > > > > OK I will queue, but I think the problem is fundamental.
> > > > > >
> > > > > > If I understand correctly, you want some core IRQ work?
> > > > >
> > > > > Yes.
> > > > >
> > > > > > As discussed
> > > > > > before, it doesn't solve all the problems, we still need to do per
> > > > > > driver audit.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Maybe, but we don't need to tie things to device_ready then.
> > > > > We can do
> > > > >
> > > > > - disable irqs
> > > > > - device ready
> > > > > - setup everything
> > > > > - enable irqs
> > > > >
> > > > >
> > > > > and this works for most things, the only issue is
> > > > > this deadlocks if "setup everything" waits for interrupts.
> > > > >
> > > > >
> > > > > With the current approach there's really no good time:
> > > > > 1.- setup everything
> > > > > - device ready
> > > > >
> > > > > can cause kicks before device is ready
> > > > >
> > > > > 2.- device ready
> > > > > - setup everything
> > > > >
> > > > > can cause callbacks before setup.
> > > > >
> > > > > So I prefer the 1. and fix the hardening in the core.
> > > >
> > > > So my question is:
> > > >
> > > > 1) do similar hardening like config interrupt
> > > > or
> > > > 2) per transport notification work (e.g for PCI core IRQ work)
> > > >
> > > > 1) seems easier and universal, but we pay little overhead which could
> > > > be eliminated by the config option.
> > >
> > > I doubt 1 is easy and I am not even sure core IRQ changes will help.
> >
> > Core IRQ won't help in 1), the changes are limited in the virtio.
> >
> > > My concern with adding overhead is that I'm not sure these are not just
> > > wasted CPU cycles.  We spent a bunch of time on irq hardening and so far
> > > we are still at the "all drivers need to be fixed" stage.
> >
> > It's not the fault of hardening but the drivers. The hardening just
> > expose those bugs.
>
> Heh. Yea sure. But things work fine for people. What is the chance
> your review found and fixed all driver bugs?

I don't/can't audit all bugs but the race between open/close against
ready/reset. It looks to me a good chance to fix them all but if you
think differently, let me know

> After two attempts
> I don't feel like hoping audit will fix all bugs.

I've started the auditing and have 15+ patches in the queue. (only
covers bluetooth, console, pmem, virtio-net and caif). Spotting the
issue is not hard but the testing, It would take at least the time of
one release to finalize I guess.

>
>
> > >
> > > The reason config was kind of easy is that config interrupt is rarely
> > > vital for device function so arbitrarily deferring that does not lead to
> > > deadlocks - what you are trying to do with VQ interrupts is
> > > fundamentally different. Things are especially bad if we just drop
> > > an interrupt but deferring can lead to problems too.
> >
> > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > interrupt processing until enable_irq().
>
>
> Absolutely. I am not at all sure disable_irq fixes all problems.
>
> > >
> > > Consider as an example
> > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > if you just defer vq interrupts you get deadlocks.
> > >
> > >
> >
> > I don't see a deadlock here, maybe you can show more detail on this?
>
> What I mean is this: if we revert the above commit, things still
> work (out of spec, but still). If we revert and defer interrupts until
> device ready then ndo_open that triggers before device ready deadlocks.

Ok, I guess you meant on a hypervisor that is strictly written with spec.

>
>
> > >
> > > So, thinking about all this, how about a simple per vq flag meaning
> > > "this vq was kicked since reset"?
> >
> > And ignore the notification if vq is not kicked? It sounds like the
> > callback needs to be synchronized with the kick.
>
> Note we only need to synchronize it when it changes, which is
> only during initialization and reset.

Yes.

>
>
> > >
> > > If driver does not kick then it's not ready to get callbacks, right?
> > >
> > > Sounds quite clean, but we need to think through memory ordering
> > > concerns - I guess it's only when we change the value so
> > >         if (!vq->kicked) {
> > >                 vq->kicked = true;
> > >                 mb();
> > >         }
> > >
> > > will do the trick, right?
> >
> > There's no much difference with the existing approach:
> >
> > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> >
> > Both require careful auditing of all the existing drivers to make sure
> > no kick before DRIVER_OK.
>
> Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> to hardening

Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.

> and in absence of config interrupts is generally easily
> fixed just by sticking virtio_device_ready early in initialization.

So if the kick is done before the subsystem registration, there's
still a window in the middle (assuming we stick virtio_device_ready()
early):

virtio_device_ready()
virtqueue_kick()
/* the window */
subsystem_registration()

And during remove(), we get another window:

subsysrem_unregistration()
/* the window */
virtio_device_reset()

if we do reset before, we may end up with other issues like deadlock.

So I think it's probably very hard to fix issues only at the virtio
core since we need subsystem specific knowledge to make sure
everything is synchronized.

> With the current approach one has to *also* not do virtio_device_ready
> too early - and it's really tricky.

Not sure how much we differ here, during the probe driver can just
place the virtio_device_ready() after the kick.

>
> With the proposal I think that we don't need to fix all drivers and
> in my eyes that is a huge advantage because frankly I'm fine with
> more work on strict spec compliance taking more than expected
> but I would like the hardening work to finally be done.

Ok, but what I meant is, with your proposal if a buggy drive kicks
before DRIVER_OK, it suppresses the effect of hardening?

> I am not sure the amount of effort expended on the hardening here is
> proportionate to the benefit it provides.

Probably, but we received those bug reports from the confidential
computing guys.

Or can we choose to go another way, let the kconfig option enabled for
TDX/SEV, and then fix the bugs only if it is reported?

>
>
>
> > >
> > > need to think about the reset path - it already synchronizes callbacks
> > > and already can lose interrupts so we just need to clear vq->kicked
> > > before that, right?
> >
> > Probably.
> >
> > >
> > >
> > > > 2) seems require more work in the IRQ core and it can not work for all
> > > > transports (e.g vDPA would be kind of difficult)
> > > >
> > > > Thanks
> > >
> > > Hmm I don't really get why would it be difficult.
> > > VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> > > have interrupt masking support.
> >
> > Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
> > the auxiliary bus. And that is the way another vendor will go.
> >
> > Thanks
>
> A bunch of callbacks will do it I guess.

Possible but looks like a layer violation, I think auxiliary stuff
wants to hide the underlayer architecture. That is why I tend to do it
in the virtio core. And actually, transport is freed to implement
another layer of those synchronization if it wants.

Thanks

>
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > ---
> > > > > > > > Changes since V2:
> > > > > > > > - Tweak the Kconfig help
> > > > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > > > ---
> > > > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > >                       vcdev->err = -EIO;
> > > > > > > >       }
> > > > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > > > -     /* Interrupts are disabled here */
> > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > +     /*
> > > > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > > > +      * disabled here.
> > > > > > > > +      */
> > > > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > > > +#endif
> > > > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > > > >               vring_interrupt(0, vq);
> > > > > > > >       }
> > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > > > +#endif
> > > > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > > > --- a/drivers/virtio/Kconfig
> > > > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > > > >
> > > > > > > >  if VIRTIO_MENU
> > > > > > > >
> > > > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > +        bool "Harden virtio notification"
> > > > > > > > +        help
> > > > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > > > +
> > > > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > > > +          notifications is enforced; depending on the subset of
> > > > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > > > +
> > > > > > > > +          If unsure, say N.
> > > > > > > > +
> > > > > > > >  config VIRTIO_PCI
> > > > > > > >       tristate "PCI driver for virtio devices"
> > > > > > > >       depends on PCI
> > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > >   * */
> > > > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > > > >  {
> > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > >       /*
> > > > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > > > >        * interrupt for this line arriving after
> > > > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > > > >        */
> > > > > > > >       virtio_break_device(dev);
> > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > +#endif
> > > > > > > >
> > > > > > > >       dev->config->reset(dev);
> > > > > > > >  }
> > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > > > >       vq->we_own_ring = true;
> > > > > > > >       vq->notify = notify;
> > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > >       vq->broken = true;
> > > > > > > > +#else
> > > > > > > > +     vq->broken = false;
> > > > > > > > +#endif
> > > > > > > >       vq->last_used_idx = 0;
> > > > > > > >       vq->event_triggered = false;
> > > > > > > >       vq->num_added = 0;
> > > > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       if (unlikely(vq->broken)) {
> > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > >               return IRQ_NONE;
> > > > > > > > +#else
> > > > > > > > +             return IRQ_HANDLED;
> > > > > > > > +#endif
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > > > >       vq->we_own_ring = false;
> > > > > > > >       vq->notify = notify;
> > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > >       vq->broken = true;
> > > > > > > > +#else
> > > > > > > > +     vq->broken = false;
> > > > > > > > +#endif
> > > > > > > >       vq->last_used_idx = 0;
> > > > > > > >       vq->event_triggered = false;
> > > > > > > >       vq->num_added = 0;
> > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > >
> > > > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > >
> > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > >       /*
> > > > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > >        */
> > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > >       __virtio_unbreak_device(dev);
> > > > > > > > +#endif
> > > > > > > >       /*
> > > > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > > > --
> > > > > > > > 2.25.1
> > > > > > >
> > > > >
> > >
>
Michael S. Tsirkin June 28, 2022, 4:59 a.m. UTC | #12
On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > Heh. Yea sure. But things work fine for people. What is the chance
> > your review found and fixed all driver bugs?
> 
> I don't/can't audit all bugs but the race between open/close against
> ready/reset. It looks to me a good chance to fix them all but if you
> think differently, let me know
> 
> > After two attempts
> > I don't feel like hoping audit will fix all bugs.
> 
> I've started the auditing and have 15+ patches in the queue. (only
> covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> issue is not hard but the testing, It would take at least the time of
> one release to finalize I guess.

Absolutely. So I am looking for a way to implement hardening that does
not break existing drivers.


> >
> >
> > > >
> > > > The reason config was kind of easy is that config interrupt is rarely
> > > > vital for device function so arbitrarily deferring that does not lead to
> > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > fundamentally different. Things are especially bad if we just drop
> > > > an interrupt but deferring can lead to problems too.
> > >
> > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > interrupt processing until enable_irq().
> >
> >
> > Absolutely. I am not at all sure disable_irq fixes all problems.
> >
> > > >
> > > > Consider as an example
> > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > if you just defer vq interrupts you get deadlocks.
> > > >
> > > >
> > >
> > > I don't see a deadlock here, maybe you can show more detail on this?
> >
> > What I mean is this: if we revert the above commit, things still
> > work (out of spec, but still). If we revert and defer interrupts until
> > device ready then ndo_open that triggers before device ready deadlocks.
> 
> Ok, I guess you meant on a hypervisor that is strictly written with spec.

I mean on hypervisor that starts processing queues after getting a kick
even without DRIVER_OK.

> >
> >
> > > >
> > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > "this vq was kicked since reset"?
> > >
> > > And ignore the notification if vq is not kicked? It sounds like the
> > > callback needs to be synchronized with the kick.
> >
> > Note we only need to synchronize it when it changes, which is
> > only during initialization and reset.
> 
> Yes.
> 
> >
> >
> > > >
> > > > If driver does not kick then it's not ready to get callbacks, right?
> > > >
> > > > Sounds quite clean, but we need to think through memory ordering
> > > > concerns - I guess it's only when we change the value so
> > > >         if (!vq->kicked) {
> > > >                 vq->kicked = true;
> > > >                 mb();
> > > >         }
> > > >
> > > > will do the trick, right?
> > >
> > > There's no much difference with the existing approach:
> > >
> > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > >
> > > Both require careful auditing of all the existing drivers to make sure
> > > no kick before DRIVER_OK.
> >
> > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > to hardening
> 
> Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.

I don't see how - my proposal ignores DRIVER_OK issues.

> > and in absence of config interrupts is generally easily
> > fixed just by sticking virtio_device_ready early in initialization.
> 
> So if the kick is done before the subsystem registration, there's
> still a window in the middle (assuming we stick virtio_device_ready()
> early):
> 
> virtio_device_ready()
> virtqueue_kick()
> /* the window */
> subsystem_registration()

Absolutely, however, I do not think we really have many such drivers
since this has been known as a wrong thing to do since the beginning.
Want to try to find any? I couldn't ... except maybe bluetooth
but that's just maintainer nacking fixes saying he'll fix it
his way ...

> And during remove(), we get another window:
> 
> subsysrem_unregistration()
> /* the window */
> virtio_device_reset()

Same here.

> if we do reset before, we may end up with other issues like deadlock.
> 
> So I think it's probably very hard to fix issues only at the virtio
> core since we need subsystem specific knowledge to make sure
> everything is synchronized.

How many drivers do you see with the issue above?
As compared to yours which has 16 patches just in your queue.

> > With the current approach one has to *also* not do virtio_device_ready
> > too early - and it's really tricky.
> 
> Not sure how much we differ here, during the probe driver can just
> place the virtio_device_ready() after the kick.

Not so easy. For example, consider virtio net without your
locking change. kick is part of a command vq operation
which does not directly have anything to do with probe.
Same for many other devices - once you register,
linux can send commands immediately.
Yes we can invent per device flags and locking rules to
try and suppress commands until device_ready. I am not
all that happy with this idea to be frank ...


> >
> > With the proposal I think that we don't need to fix all drivers and
> > in my eyes that is a huge advantage because frankly I'm fine with
> > more work on strict spec compliance taking more than expected
> > but I would like the hardening work to finally be done.
> 
> Ok, but what I meant is, with your proposal if a buggy drive kicks
> before DRIVER_OK, it suppresses the effect of hardening?

I don't see how. DRIVER_OK is for device's benefit, guest does not need
it.

> > I am not sure the amount of effort expended on the hardening here is
> > proportionate to the benefit it provides.
> 
> Probably, but we received those bug reports from the confidential
> computing guys.
> 
> Or can we choose to go another way, let the kconfig option enabled for
> TDX/SEV, and then fix the bugs only if it is reported?

Nah that's really fragmenting out userbase.

> >
> >
> >
> > > >
> > > > need to think about the reset path - it already synchronizes callbacks
> > > > and already can lose interrupts so we just need to clear vq->kicked
> > > > before that, right?
> > >
> > > Probably.
> > >
> > > >
> > > >
> > > > > 2) seems require more work in the IRQ core and it can not work for all
> > > > > transports (e.g vDPA would be kind of difficult)
> > > > >
> > > > > Thanks
> > > >
> > > > Hmm I don't really get why would it be difficult.
> > > > VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> > > > have interrupt masking support.
> > >
> > > Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
> > > the auxiliary bus. And that is the way another vendor will go.
> > >
> > > Thanks
> >
> > A bunch of callbacks will do it I guess.
> 
> Possible but looks like a layer violation, I think auxiliary stuff
> wants to hide the underlayer architecture. That is why I tend to do it
> in the virtio core. And actually, transport is freed to implement
> another layer of those synchronization if it wants.
> 
> Thanks

As usual a level of indirection will help with layering.

> >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > ---
> > > > > > > > > Changes since V2:
> > > > > > > > > - Tweak the Kconfig help
> > > > > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > > > > ---
> > > > > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > >                       vcdev->err = -EIO;
> > > > > > > > >       }
> > > > > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > > > > -     /* Interrupts are disabled here */
> > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > +     /*
> > > > > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > > > > +      * disabled here.
> > > > > > > > > +      */
> > > > > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > > > > +#endif
> > > > > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > > > > >               vring_interrupt(0, vq);
> > > > > > > > >       }
> > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > > > > +#endif
> > > > > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > > > > --- a/drivers/virtio/Kconfig
> > > > > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > > > > >
> > > > > > > > >  if VIRTIO_MENU
> > > > > > > > >
> > > > > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > +        bool "Harden virtio notification"
> > > > > > > > > +        help
> > > > > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > > > > +
> > > > > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > > > > +          notifications is enforced; depending on the subset of
> > > > > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > > > > +
> > > > > > > > > +          If unsure, say N.
> > > > > > > > > +
> > > > > > > > >  config VIRTIO_PCI
> > > > > > > > >       tristate "PCI driver for virtio devices"
> > > > > > > > >       depends on PCI
> > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > >   * */
> > > > > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > >  {
> > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > >       /*
> > > > > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > > > > >        * interrupt for this line arriving after
> > > > > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > >        */
> > > > > > > > >       virtio_break_device(dev);
> > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > +#endif
> > > > > > > > >
> > > > > > > > >       dev->config->reset(dev);
> > > > > > > > >  }
> > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > > > > >       vq->we_own_ring = true;
> > > > > > > > >       vq->notify = notify;
> > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > >       vq->broken = true;
> > > > > > > > > +#else
> > > > > > > > > +     vq->broken = false;
> > > > > > > > > +#endif
> > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > >       vq->event_triggered = false;
> > > > > > > > >       vq->num_added = 0;
> > > > > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > >       if (unlikely(vq->broken)) {
> > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > >               return IRQ_NONE;
> > > > > > > > > +#else
> > > > > > > > > +             return IRQ_HANDLED;
> > > > > > > > > +#endif
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > > > > >       vq->we_own_ring = false;
> > > > > > > > >       vq->notify = notify;
> > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > >       vq->broken = true;
> > > > > > > > > +#else
> > > > > > > > > +     vq->broken = false;
> > > > > > > > > +#endif
> > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > >       vq->event_triggered = false;
> > > > > > > > >       vq->num_added = 0;
> > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > >
> > > > > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > >
> > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > >       /*
> > > > > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > >        */
> > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > >       __virtio_unbreak_device(dev);
> > > > > > > > > +#endif
> > > > > > > > >       /*
> > > > > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > > > > --
> > > > > > > > > 2.25.1
> > > > > > > >
> > > > > >
> > > >
> >
Jason Wang June 28, 2022, 6:17 a.m. UTC | #13
On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > Heh. Yea sure. But things work fine for people. What is the chance
> > > your review found and fixed all driver bugs?
> >
> > I don't/can't audit all bugs but the race between open/close against
> > ready/reset. It looks to me a good chance to fix them all but if you
> > think differently, let me know
> >
> > > After two attempts
> > > I don't feel like hoping audit will fix all bugs.
> >
> > I've started the auditing and have 15+ patches in the queue. (only
> > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > issue is not hard but the testing, It would take at least the time of
> > one release to finalize I guess.
>
> Absolutely. So I am looking for a way to implement hardening that does
> not break existing drivers.

I totally agree with you to seek a way without bothering the drivers.
Just wonder if this is possbile.

>
>
> > >
> > >
> > > > >
> > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > fundamentally different. Things are especially bad if we just drop
> > > > > an interrupt but deferring can lead to problems too.
> > > >
> > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > interrupt processing until enable_irq().
> > >
> > >
> > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > >
> > > > >
> > > > > Consider as an example
> > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > if you just defer vq interrupts you get deadlocks.
> > > > >
> > > > >
> > > >
> > > > I don't see a deadlock here, maybe you can show more detail on this?
> > >
> > > What I mean is this: if we revert the above commit, things still
> > > work (out of spec, but still). If we revert and defer interrupts until
> > > device ready then ndo_open that triggers before device ready deadlocks.
> >
> > Ok, I guess you meant on a hypervisor that is strictly written with spec.
>
> I mean on hypervisor that starts processing queues after getting a kick
> even without DRIVER_OK.

Oh right.

>
> > >
> > >
> > > > >
> > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > "this vq was kicked since reset"?
> > > >
> > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > callback needs to be synchronized with the kick.
> > >
> > > Note we only need to synchronize it when it changes, which is
> > > only during initialization and reset.
> >
> > Yes.
> >
> > >
> > >
> > > > >
> > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > >
> > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > concerns - I guess it's only when we change the value so
> > > > >         if (!vq->kicked) {
> > > > >                 vq->kicked = true;
> > > > >                 mb();
> > > > >         }
> > > > >
> > > > > will do the trick, right?
> > > >
> > > > There's no much difference with the existing approach:
> > > >
> > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > >
> > > > Both require careful auditing of all the existing drivers to make sure
> > > > no kick before DRIVER_OK.
> > >
> > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > to hardening
> >
> > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
>
> I don't see how - my proposal ignores DRIVER_OK issues.

Yes, what I meant is, in your proposal, the first kick after rest is a
hint that the driver is ok (but actually it could not).

>
> > > and in absence of config interrupts is generally easily
> > > fixed just by sticking virtio_device_ready early in initialization.
> >
> > So if the kick is done before the subsystem registration, there's
> > still a window in the middle (assuming we stick virtio_device_ready()
> > early):
> >
> > virtio_device_ready()
> > virtqueue_kick()
> > /* the window */
> > subsystem_registration()
>
> Absolutely, however, I do not think we really have many such drivers
> since this has been known as a wrong thing to do since the beginning.
> Want to try to find any?

Yes, let me try and update.

>I couldn't ... except maybe bluetooth
> but that's just maintainer nacking fixes saying he'll fix it
> his way ...
>
> > And during remove(), we get another window:
> >
> > subsysrem_unregistration()
> > /* the window */
> > virtio_device_reset()
>
> Same here.
>
> > if we do reset before, we may end up with other issues like deadlock.
> >
> > So I think it's probably very hard to fix issues only at the virtio
> > core since we need subsystem specific knowledge to make sure
> > everything is synchronized.
>
> How many drivers do you see with the issue above?

E.g for virtio-net, except for the ndo_open()/driver_ok races I
spotted another two.

> As compared to yours which has 16 patches just in your queue.
>
> > > With the current approach one has to *also* not do virtio_device_ready
> > > too early - and it's really tricky.
> >
> > Not sure how much we differ here, during the probe driver can just
> > place the virtio_device_ready() after the kick.
>
> Not so easy. For example, consider virtio net without your
> locking change. kick is part of a command vq operation
> which does not directly have anything to do with probe.
> Same for many other devices - once you register,
> linux can send commands immediately.
> Yes we can invent per device flags and locking rules to
> try and suppress commands until device_ready. I am not
> all that happy with this idea to be frank ...

Actuall, I meant we need to depend on the subsystem facilities to
handle this (e.g the rtnl_lock() ndo_XYZ) instead of inventing per
driver flags. I believe virtio is not the only driver that suffers
from those races.

>
>
> > >
> > > With the proposal I think that we don't need to fix all drivers and
> > > in my eyes that is a huge advantage because frankly I'm fine with
> > > more work on strict spec compliance taking more than expected
> > > but I would like the hardening work to finally be done.
> >
> > Ok, but what I meant is, with your proposal if a buggy drive kicks
> > before DRIVER_OK, it suppresses the effect of hardening?
>
> I don't see how. DRIVER_OK is for device's benefit, guest does not need
> it.
>
> > > I am not sure the amount of effort expended on the hardening here is
> > > proportionate to the benefit it provides.
> >
> > Probably, but we received those bug reports from the confidential
> > computing guys.
> >
> > Or can we choose to go another way, let the kconfig option enabled for
> > TDX/SEV, and then fix the bugs only if it is reported?
>
> Nah that's really fragmenting out userbase.

True.

>
> > >
> > >
> > >
> > > > >
> > > > > need to think about the reset path - it already synchronizes callbacks
> > > > > and already can lose interrupts so we just need to clear vq->kicked
> > > > > before that, right?
> > > >
> > > > Probably.
> > > >
> > > > >
> > > > >
> > > > > > 2) seems require more work in the IRQ core and it can not work for all
> > > > > > transports (e.g vDPA would be kind of difficult)
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Hmm I don't really get why would it be difficult.
> > > > > VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> > > > > have interrupt masking support.
> > > >
> > > > Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
> > > > the auxiliary bus. And that is the way another vendor will go.
> > > >
> > > > Thanks
> > >
> > > A bunch of callbacks will do it I guess.
> >
> > Possible but looks like a layer violation, I think auxiliary stuff
> > wants to hide the underlayer architecture. That is why I tend to do it
> > in the virtio core. And actually, transport is freed to implement
> > another layer of those synchronization if it wants.
> >
> > Thanks
>
> As usual a level of indirection will help with layering.

Probably.

Thanks

>
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > > Changes since V2:
> > > > > > > > > > - Tweak the Kconfig help
> > > > > > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > > > > > ---
> > > > > > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > >                       vcdev->err = -EIO;
> > > > > > > > > >       }
> > > > > > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > > > > > -     /* Interrupts are disabled here */
> > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > +     /*
> > > > > > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > > > > > +      * disabled here.
> > > > > > > > > > +      */
> > > > > > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > > > > > +#endif
> > > > > > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > > > > > >               vring_interrupt(0, vq);
> > > > > > > > > >       }
> > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > > > > > +#endif
> > > > > > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > > > > > --- a/drivers/virtio/Kconfig
> > > > > > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > > > > > >
> > > > > > > > > >  if VIRTIO_MENU
> > > > > > > > > >
> > > > > > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > +        bool "Harden virtio notification"
> > > > > > > > > > +        help
> > > > > > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > > > > > +
> > > > > > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > > > > > +          notifications is enforced; depending on the subset of
> > > > > > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > > > > > +
> > > > > > > > > > +          If unsure, say N.
> > > > > > > > > > +
> > > > > > > > > >  config VIRTIO_PCI
> > > > > > > > > >       tristate "PCI driver for virtio devices"
> > > > > > > > > >       depends on PCI
> > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > >   * */
> > > > > > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > >  {
> > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > >       /*
> > > > > > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > > > > > >        * interrupt for this line arriving after
> > > > > > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > >        */
> > > > > > > > > >       virtio_break_device(dev);
> > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > > +#endif
> > > > > > > > > >
> > > > > > > > > >       dev->config->reset(dev);
> > > > > > > > > >  }
> > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > > > > > >       vq->we_own_ring = true;
> > > > > > > > > >       vq->notify = notify;
> > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > >       vq->broken = true;
> > > > > > > > > > +#else
> > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > +#endif
> > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > >       if (unlikely(vq->broken)) {
> > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > >               return IRQ_NONE;
> > > > > > > > > > +#else
> > > > > > > > > > +             return IRQ_HANDLED;
> > > > > > > > > > +#endif
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > > > > > >       vq->we_own_ring = false;
> > > > > > > > > >       vq->notify = notify;
> > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > >       vq->broken = true;
> > > > > > > > > > +#else
> > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > +#endif
> > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > >
> > > > > > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > >
> > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > >       /*
> > > > > > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > >        */
> > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > >       __virtio_unbreak_device(dev);
> > > > > > > > > > +#endif
> > > > > > > > > >       /*
> > > > > > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > > > > > --
> > > > > > > > > > 2.25.1
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>
Michael S. Tsirkin June 28, 2022, 6:24 a.m. UTC | #14
On Tue, Jun 28, 2022 at 02:17:50PM +0800, Jason Wang wrote:
> On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > your review found and fixed all driver bugs?
> > >
> > > I don't/can't audit all bugs but the race between open/close against
> > > ready/reset. It looks to me a good chance to fix them all but if you
> > > think differently, let me know
> > >
> > > > After two attempts
> > > > I don't feel like hoping audit will fix all bugs.
> > >
> > > I've started the auditing and have 15+ patches in the queue. (only
> > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > issue is not hard but the testing, It would take at least the time of
> > > one release to finalize I guess.
> >
> > Absolutely. So I am looking for a way to implement hardening that does
> > not break existing drivers.
> 
> I totally agree with you to seek a way without bothering the drivers.
> Just wonder if this is possbile.
> 
> >
> >
> > > >
> > > >
> > > > > >
> > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > an interrupt but deferring can lead to problems too.
> > > > >
> > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > interrupt processing until enable_irq().
> > > >
> > > >
> > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > >
> > > > > >
> > > > > > Consider as an example
> > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > >
> > > > > >
> > > > >
> > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > >
> > > > What I mean is this: if we revert the above commit, things still
> > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > device ready then ndo_open that triggers before device ready deadlocks.
> > >
> > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> >
> > I mean on hypervisor that starts processing queues after getting a kick
> > even without DRIVER_OK.
> 
> Oh right.
> 
> >
> > > >
> > > >
> > > > > >
> > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > "this vq was kicked since reset"?
> > > > >
> > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > callback needs to be synchronized with the kick.
> > > >
> > > > Note we only need to synchronize it when it changes, which is
> > > > only during initialization and reset.
> > >
> > > Yes.
> > >
> > > >
> > > >
> > > > > >
> > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > >
> > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > concerns - I guess it's only when we change the value so
> > > > > >         if (!vq->kicked) {
> > > > > >                 vq->kicked = true;
> > > > > >                 mb();
> > > > > >         }
> > > > > >
> > > > > > will do the trick, right?
> > > > >
> > > > > There's no much difference with the existing approach:
> > > > >
> > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > >
> > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > no kick before DRIVER_OK.
> > > >
> > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > to hardening
> > >
> > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> >
> > I don't see how - my proposal ignores DRIVER_OK issues.
> 
> Yes, what I meant is, in your proposal, the first kick after rest is a
> hint that the driver is ok (but actually it could not).

Question is are there drivers which kick before they are ready
to handle callbacks?

> >
> > > > and in absence of config interrupts is generally easily
> > > > fixed just by sticking virtio_device_ready early in initialization.
> > >
> > > So if the kick is done before the subsystem registration, there's
> > > still a window in the middle (assuming we stick virtio_device_ready()
> > > early):
> > >
> > > virtio_device_ready()
> > > virtqueue_kick()
> > > /* the window */
> > > subsystem_registration()
> >
> > Absolutely, however, I do not think we really have many such drivers
> > since this has been known as a wrong thing to do since the beginning.
> > Want to try to find any?
> 
> Yes, let me try and update.
> 
> >I couldn't ... except maybe bluetooth
> > but that's just maintainer nacking fixes saying he'll fix it
> > his way ...
> >
> > > And during remove(), we get another window:
> > >
> > > subsysrem_unregistration()
> > > /* the window */
> > > virtio_device_reset()
> >
> > Same here.
> >
> > > if we do reset before, we may end up with other issues like deadlock.
> > >
> > > So I think it's probably very hard to fix issues only at the virtio
> > > core since we need subsystem specific knowledge to make sure
> > > everything is synchronized.
> >
> > How many drivers do you see with the issue above?
> 
> E.g for virtio-net, except for the ndo_open()/driver_ok races I
> spotted another two.

two issues causing kick before DRIVER_OK? or two issues
causing kick before driver can handle callbacks?


> > As compared to yours which has 16 patches just in your queue.
> >
> > > > With the current approach one has to *also* not do virtio_device_ready
> > > > too early - and it's really tricky.
> > >
> > > Not sure how much we differ here, during the probe driver can just
> > > place the virtio_device_ready() after the kick.
> >
> > Not so easy. For example, consider virtio net without your
> > locking change. kick is part of a command vq operation
> > which does not directly have anything to do with probe.
> > Same for many other devices - once you register,
> > linux can send commands immediately.
> > Yes we can invent per device flags and locking rules to
> > try and suppress commands until device_ready. I am not
> > all that happy with this idea to be frank ...
> 
> Actuall, I meant we need to depend on the subsystem facilities to
> handle this (e.g the rtnl_lock() ndo_XYZ) instead of inventing per
> driver flags.

Yep. And that makes it very messy to handle. My hope is
that for years we got callbacks right after adding buffers
so the races have been ironed out by now.


> I believe virtio is not the only driver that suffers
> from those races.
> >
> >
> > > >
> > > > With the proposal I think that we don't need to fix all drivers and
> > > > in my eyes that is a huge advantage because frankly I'm fine with
> > > > more work on strict spec compliance taking more than expected
> > > > but I would like the hardening work to finally be done.
> > >
> > > Ok, but what I meant is, with your proposal if a buggy drive kicks
> > > before DRIVER_OK, it suppresses the effect of hardening?
> >
> > I don't see how. DRIVER_OK is for device's benefit, guest does not need
> > it.
> >
> > > > I am not sure the amount of effort expended on the hardening here is
> > > > proportionate to the benefit it provides.
> > >
> > > Probably, but we received those bug reports from the confidential
> > > computing guys.
> > >
> > > Or can we choose to go another way, let the kconfig option enabled for
> > > TDX/SEV, and then fix the bugs only if it is reported?
> >
> > Nah that's really fragmenting out userbase.
> 
> True.
> 
> >
> > > >
> > > >
> > > >
> > > > > >
> > > > > > need to think about the reset path - it already synchronizes callbacks
> > > > > > and already can lose interrupts so we just need to clear vq->kicked
> > > > > > before that, right?
> > > > >
> > > > > Probably.
> > > > >
> > > > > >
> > > > > >
> > > > > > > 2) seems require more work in the IRQ core and it can not work for all
> > > > > > > transports (e.g vDPA would be kind of difficult)
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > > Hmm I don't really get why would it be difficult.
> > > > > > VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> > > > > > have interrupt masking support.
> > > > >
> > > > > Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
> > > > > the auxiliary bus. And that is the way another vendor will go.
> > > > >
> > > > > Thanks
> > > >
> > > > A bunch of callbacks will do it I guess.
> > >
> > > Possible but looks like a layer violation, I think auxiliary stuff
> > > wants to hide the underlayer architecture. That is why I tend to do it
> > > in the virtio core. And actually, transport is freed to implement
> > > another layer of those synchronization if it wants.
> > >
> > > Thanks
> >
> > As usual a level of indirection will help with layering.
> 
> Probably.
> 
> Thanks
> 
> >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > > Changes since V2:
> > > > > > > > > > > - Tweak the Kconfig help
> > > > > > > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > > > > > > ---
> > > > > > > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > > > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > > > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > > > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > > > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > > > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > > >                       vcdev->err = -EIO;
> > > > > > > > > > >       }
> > > > > > > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > > > > > > -     /* Interrupts are disabled here */
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > +     /*
> > > > > > > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > > > > > > +      * disabled here.
> > > > > > > > > > > +      */
> > > > > > > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > > > > > > +#endif
> > > > > > > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > > > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > > > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > > > > > > >               vring_interrupt(0, vq);
> > > > > > > > > > >       }
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > > > > > > +#endif
> > > > > > > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > > > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > > > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > > > > > > --- a/drivers/virtio/Kconfig
> > > > > > > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > > > > > > >
> > > > > > > > > > >  if VIRTIO_MENU
> > > > > > > > > > >
> > > > > > > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > +        bool "Harden virtio notification"
> > > > > > > > > > > +        help
> > > > > > > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > > > > > > +
> > > > > > > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > > > > > > +          notifications is enforced; depending on the subset of
> > > > > > > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > > > > > > +
> > > > > > > > > > > +          If unsure, say N.
> > > > > > > > > > > +
> > > > > > > > > > >  config VIRTIO_PCI
> > > > > > > > > > >       tristate "PCI driver for virtio devices"
> > > > > > > > > > >       depends on PCI
> > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > >   * */
> > > > > > > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > >  {
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       /*
> > > > > > > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > > > > > > >        * interrupt for this line arriving after
> > > > > > > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > >        */
> > > > > > > > > > >       virtio_break_device(dev);
> > > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > > > +#endif
> > > > > > > > > > >
> > > > > > > > > > >       dev->config->reset(dev);
> > > > > > > > > > >  }
> > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > > > > > > >       vq->we_own_ring = true;
> > > > > > > > > > >       vq->notify = notify;
> > > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       vq->broken = true;
> > > > > > > > > > > +#else
> > > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > > +#endif
> > > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > >       if (unlikely(vq->broken)) {
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > > > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > >               return IRQ_NONE;
> > > > > > > > > > > +#else
> > > > > > > > > > > +             return IRQ_HANDLED;
> > > > > > > > > > > +#endif
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > > > > > > >       vq->we_own_ring = false;
> > > > > > > > > > >       vq->notify = notify;
> > > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       vq->broken = true;
> > > > > > > > > > > +#else
> > > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > > +#endif
> > > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > >
> > > > > > > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > >
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       /*
> > > > > > > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > > > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > >        */
> > > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > > >       __virtio_unbreak_device(dev);
> > > > > > > > > > > +#endif
> > > > > > > > > > >       /*
> > > > > > > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > > > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > > > > > > --
> > > > > > > > > > > 2.25.1
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >
Jason Wang June 28, 2022, 6:32 a.m. UTC | #15
On Tue, Jun 28, 2022 at 2:24 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 02:17:50PM +0800, Jason Wang wrote:
> > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > your review found and fixed all driver bugs?
> > > >
> > > > I don't/can't audit all bugs but the race between open/close against
> > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > think differently, let me know
> > > >
> > > > > After two attempts
> > > > > I don't feel like hoping audit will fix all bugs.
> > > >
> > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > issue is not hard but the testing, It would take at least the time of
> > > > one release to finalize I guess.
> > >
> > > Absolutely. So I am looking for a way to implement hardening that does
> > > not break existing drivers.
> >
> > I totally agree with you to seek a way without bothering the drivers.
> > Just wonder if this is possbile.
> >
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > an interrupt but deferring can lead to problems too.
> > > > > >
> > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > interrupt processing until enable_irq().
> > > > >
> > > > >
> > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > >
> > > > > > >
> > > > > > > Consider as an example
> > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > >
> > > > > What I mean is this: if we revert the above commit, things still
> > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > >
> > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > >
> > > I mean on hypervisor that starts processing queues after getting a kick
> > > even without DRIVER_OK.
> >
> > Oh right.
> >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > "this vq was kicked since reset"?
> > > > > >
> > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > callback needs to be synchronized with the kick.
> > > > >
> > > > > Note we only need to synchronize it when it changes, which is
> > > > > only during initialization and reset.
> > > >
> > > > Yes.
> > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > >
> > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > concerns - I guess it's only when we change the value so
> > > > > > >         if (!vq->kicked) {
> > > > > > >                 vq->kicked = true;
> > > > > > >                 mb();
> > > > > > >         }
> > > > > > >
> > > > > > > will do the trick, right?
> > > > > >
> > > > > > There's no much difference with the existing approach:
> > > > > >
> > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > >
> > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > no kick before DRIVER_OK.
> > > > >
> > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > to hardening
> > > >
> > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > >
> > > I don't see how - my proposal ignores DRIVER_OK issues.
> >
> > Yes, what I meant is, in your proposal, the first kick after rest is a
> > hint that the driver is ok (but actually it could not).
>
> Question is are there drivers which kick before they are ready
> to handle callbacks?

Let me try to have a look at all the drivers to answer this.

>
> > >
> > > > > and in absence of config interrupts is generally easily
> > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > >
> > > > So if the kick is done before the subsystem registration, there's
> > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > early):
> > > >
> > > > virtio_device_ready()
> > > > virtqueue_kick()
> > > > /* the window */
> > > > subsystem_registration()
> > >
> > > Absolutely, however, I do not think we really have many such drivers
> > > since this has been known as a wrong thing to do since the beginning.
> > > Want to try to find any?
> >
> > Yes, let me try and update.
> >
> > >I couldn't ... except maybe bluetooth
> > > but that's just maintainer nacking fixes saying he'll fix it
> > > his way ...
> > >
> > > > And during remove(), we get another window:
> > > >
> > > > subsysrem_unregistration()
> > > > /* the window */
> > > > virtio_device_reset()
> > >
> > > Same here.
> > >
> > > > if we do reset before, we may end up with other issues like deadlock.
> > > >
> > > > So I think it's probably very hard to fix issues only at the virtio
> > > > core since we need subsystem specific knowledge to make sure
> > > > everything is synchronized.
> > >
> > > How many drivers do you see with the issue above?
> >
> > E.g for virtio-net, except for the ndo_open()/driver_ok races I
> > spotted another two.
>
> two issues causing kick before DRIVER_OK? or two issues
> causing kick before driver can handle callbacks?

1) A window between netdev_unregisger() and virtio_device_reset(), tx
callbacks may came and try to wake up the TX queue
2) refill_work can still be scheduled by the rx callback after
cancel_delayed_work_sync()

>
>
> > > As compared to yours which has 16 patches just in your queue.
> > >
> > > > > With the current approach one has to *also* not do virtio_device_ready
> > > > > too early - and it's really tricky.
> > > >
> > > > Not sure how much we differ here, during the probe driver can just
> > > > place the virtio_device_ready() after the kick.
> > >
> > > Not so easy. For example, consider virtio net without your
> > > locking change. kick is part of a command vq operation
> > > which does not directly have anything to do with probe.
> > > Same for many other devices - once you register,
> > > linux can send commands immediately.
> > > Yes we can invent per device flags and locking rules to
> > > try and suppress commands until device_ready. I am not
> > > all that happy with this idea to be frank ...
> >
> > Actuall, I meant we need to depend on the subsystem facilities to
> > handle this (e.g the rtnl_lock() ndo_XYZ) instead of inventing per
> > driver flags.
>
> Yep. And that makes it very messy to handle.

But this is how other drivers use to handle those races.

>My hope is
> that for years we got callbacks right after adding buffers
> so the races have been ironed out by now.

This sounds just like what you propose here or is it something other?

Thanks

>
>
> > I believe virtio is not the only driver that suffers
> > from those races.
> > >
> > >
> > > > >
> > > > > With the proposal I think that we don't need to fix all drivers and
> > > > > in my eyes that is a huge advantage because frankly I'm fine with
> > > > > more work on strict spec compliance taking more than expected
> > > > > but I would like the hardening work to finally be done.
> > > >
> > > > Ok, but what I meant is, with your proposal if a buggy drive kicks
> > > > before DRIVER_OK, it suppresses the effect of hardening?
> > >
> > > I don't see how. DRIVER_OK is for device's benefit, guest does not need
> > > it.
> > >
> > > > > I am not sure the amount of effort expended on the hardening here is
> > > > > proportionate to the benefit it provides.
> > > >
> > > > Probably, but we received those bug reports from the confidential
> > > > computing guys.
> > > >
> > > > Or can we choose to go another way, let the kconfig option enabled for
> > > > TDX/SEV, and then fix the bugs only if it is reported?
> > >
> > > Nah that's really fragmenting out userbase.
> >
> > True.
> >
> > >
> > > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > need to think about the reset path - it already synchronizes callbacks
> > > > > > > and already can lose interrupts so we just need to clear vq->kicked
> > > > > > > before that, right?
> > > > > >
> > > > > > Probably.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > 2) seems require more work in the IRQ core and it can not work for all
> > > > > > > > transports (e.g vDPA would be kind of difficult)
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > > Hmm I don't really get why would it be difficult.
> > > > > > > VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> > > > > > > have interrupt masking support.
> > > > > >
> > > > > > Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
> > > > > > the auxiliary bus. And that is the way another vendor will go.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > A bunch of callbacks will do it I guess.
> > > >
> > > > Possible but looks like a layer violation, I think auxiliary stuff
> > > > wants to hide the underlayer architecture. That is why I tend to do it
> > > > in the virtio core. And actually, transport is freed to implement
> > > > another layer of those synchronization if it wants.
> > > >
> > > > Thanks
> > >
> > > As usual a level of indirection will help with layering.
> >
> > Probably.
> >
> > Thanks
> >
> > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > ---
> > > > > > > > > > > > Changes since V2:
> > > > > > > > > > > > - Tweak the Kconfig help
> > > > > > > > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > > > > > > > ---
> > > > > > > > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > > > > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > > > > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > > > > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > > > > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > > > > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > > > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > > > >                       vcdev->err = -EIO;
> > > > > > > > > > > >       }
> > > > > > > > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > > > > > > > -     /* Interrupts are disabled here */
> > > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > > +     /*
> > > > > > > > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > > > > > > > +      * disabled here.
> > > > > > > > > > > > +      */
> > > > > > > > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > > > > > > > +#endif
> > > > > > > > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > > > > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > > > > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > > > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > > > > > > > >               vring_interrupt(0, vq);
> > > > > > > > > > > >       }
> > > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > > > > > > > +#endif
> > > > > > > > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > > > > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > > > > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > > > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > > > > > > > --- a/drivers/virtio/Kconfig
> > > > > > > > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > > > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > > > > > > > >
> > > > > > > > > > > >  if VIRTIO_MENU
> > > > > > > > > > > >
> > > > > > > > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > > +        bool "Harden virtio notification"
> > > > > > > > > > > > +        help
> > > > > > > > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > > > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > > > > > > > +
> > > > > > > > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > > > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > > > > > > > +          notifications is enforced; depending on the subset of
> > > > > > > > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > > > > > > > +
> > > > > > > > > > > > +          If unsure, say N.
> > > > > > > > > > > > +
> > > > > > > > > > > >  config VIRTIO_PCI
> > > > > > > > > > > >       tristate "PCI driver for virtio devices"
> > > > > > > > > > > >       depends on PCI
> > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > >   * */
> > > > > > > > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > >  {
> > > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > >       /*
> > > > > > > > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > > > > > > > >        * interrupt for this line arriving after
> > > > > > > > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > >        */
> > > > > > > > > > > >       virtio_break_device(dev);
> > > > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > > > > +#endif
> > > > > > > > > > > >
> > > > > > > > > > > >       dev->config->reset(dev);
> > > > > > > > > > > >  }
> > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > > > > > > > >       vq->we_own_ring = true;
> > > > > > > > > > > >       vq->notify = notify;
> > > > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > >       vq->broken = true;
> > > > > > > > > > > > +#else
> > > > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > > > +#endif
> > > > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > >       }
> > > > > > > > > > > >
> > > > > > > > > > > >       if (unlikely(vq->broken)) {
> > > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > > > > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > >               return IRQ_NONE;
> > > > > > > > > > > > +#else
> > > > > > > > > > > > +             return IRQ_HANDLED;
> > > > > > > > > > > > +#endif
> > > > > > > > > > > >       }
> > > > > > > > > > > >
> > > > > > > > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > > > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > > > > > > > >       vq->we_own_ring = false;
> > > > > > > > > > > >       vq->notify = notify;
> > > > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > >       vq->broken = true;
> > > > > > > > > > > > +#else
> > > > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > > > +#endif
> > > > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > >
> > > > > > > > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > >
> > > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > >       /*
> > > > > > > > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > > > > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > > > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > >        */
> > > > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > > > >       __virtio_unbreak_device(dev);
> > > > > > > > > > > > +#endif
> > > > > > > > > > > >       /*
> > > > > > > > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > > > > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > > > > > > > --
> > > > > > > > > > > > 2.25.1
> > > > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>
Michael S. Tsirkin June 28, 2022, 6:44 a.m. UTC | #16
On Tue, Jun 28, 2022 at 02:32:19PM +0800, Jason Wang wrote:
> > Question is are there drivers which kick before they are ready
> > to handle callbacks?
> 
> Let me try to have a look at all the drivers to answer this.

One thing to note is that I consider hardening probe and
hardening remove separate features. I think that at this point
for secured guests it is prudent to outright block device
removal - we have been finding races in removal for years.
Note sure there's a flag for that but it's probably not too hard to add
e.g. to pci core.
Jason Wang June 29, 2022, 4:07 a.m. UTC | #17
On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > your review found and fixed all driver bugs?
> > >
> > > I don't/can't audit all bugs but the race between open/close against
> > > ready/reset. It looks to me a good chance to fix them all but if you
> > > think differently, let me know
> > >
> > > > After two attempts
> > > > I don't feel like hoping audit will fix all bugs.
> > >
> > > I've started the auditing and have 15+ patches in the queue. (only
> > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > issue is not hard but the testing, It would take at least the time of
> > > one release to finalize I guess.
> >
> > Absolutely. So I am looking for a way to implement hardening that does
> > not break existing drivers.
>
> I totally agree with you to seek a way without bothering the drivers.
> Just wonder if this is possbile.
>
> >
> >
> > > >
> > > >
> > > > > >
> > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > an interrupt but deferring can lead to problems too.
> > > > >
> > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > interrupt processing until enable_irq().
> > > >
> > > >
> > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > >
> > > > > >
> > > > > > Consider as an example
> > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > >
> > > > > >
> > > > >
> > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > >
> > > > What I mean is this: if we revert the above commit, things still
> > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > device ready then ndo_open that triggers before device ready deadlocks.
> > >
> > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> >
> > I mean on hypervisor that starts processing queues after getting a kick
> > even without DRIVER_OK.
>
> Oh right.
>
> >
> > > >
> > > >
> > > > > >
> > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > "this vq was kicked since reset"?
> > > > >
> > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > callback needs to be synchronized with the kick.
> > > >
> > > > Note we only need to synchronize it when it changes, which is
> > > > only during initialization and reset.
> > >
> > > Yes.
> > >
> > > >
> > > >
> > > > > >
> > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > >
> > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > concerns - I guess it's only when we change the value so
> > > > > >         if (!vq->kicked) {
> > > > > >                 vq->kicked = true;
> > > > > >                 mb();
> > > > > >         }
> > > > > >
> > > > > > will do the trick, right?
> > > > >
> > > > > There's no much difference with the existing approach:
> > > > >
> > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > >
> > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > no kick before DRIVER_OK.
> > > >
> > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > to hardening
> > >
> > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> >
> > I don't see how - my proposal ignores DRIVER_OK issues.
>
> Yes, what I meant is, in your proposal, the first kick after rest is a
> hint that the driver is ok (but actually it could not).
>
> >
> > > > and in absence of config interrupts is generally easily
> > > > fixed just by sticking virtio_device_ready early in initialization.
> > >
> > > So if the kick is done before the subsystem registration, there's
> > > still a window in the middle (assuming we stick virtio_device_ready()
> > > early):
> > >
> > > virtio_device_ready()
> > > virtqueue_kick()
> > > /* the window */
> > > subsystem_registration()
> >
> > Absolutely, however, I do not think we really have many such drivers
> > since this has been known as a wrong thing to do since the beginning.
> > Want to try to find any?
>
> Yes, let me try and update.

This is basically the device that have an RX queue, so I've found the
following drivers:

scmi, mac80211_hwsim, vsock, bt, balloon.

>
> >I couldn't ... except maybe bluetooth
> > but that's just maintainer nacking fixes saying he'll fix it
> > his way ...
> >
> > > And during remove(), we get another window:
> > >
> > > subsysrem_unregistration()
> > > /* the window */
> > > virtio_device_reset()
> >
> > Same here.

Basically for the drivers that set driver_ok before registration, so
we have a lot:

blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
iommu, caif, pmem, input, mem

So I think there's no easy way to harden the notification without
auditing the driver one by one (especially considering the driver may
use bh or workqueue). The problem is the notification hardening
depends on a correct or race-free probe/remove. So we need to fix the
issues in probe/remove then do the hardening on the notification.

Thanks

> >
> > > if we do reset before, we may end up with other issues like deadlock.
> > >
> > > So I think it's probably very hard to fix issues only at the virtio
> > > core since we need subsystem specific knowledge to make sure
> > > everything is synchronized.
> >
> > How many drivers do you see with the issue above?
>
> E.g for virtio-net, except for the ndo_open()/driver_ok races I
> spotted another two.
>
> > As compared to yours which has 16 patches just in your queue.
> >
> > > > With the current approach one has to *also* not do virtio_device_ready
> > > > too early - and it's really tricky.
> > >
> > > Not sure how much we differ here, during the probe driver can just
> > > place the virtio_device_ready() after the kick.
> >
> > Not so easy. For example, consider virtio net without your
> > locking change. kick is part of a command vq operation
> > which does not directly have anything to do with probe.
> > Same for many other devices - once you register,
> > linux can send commands immediately.
> > Yes we can invent per device flags and locking rules to
> > try and suppress commands until device_ready. I am not
> > all that happy with this idea to be frank ...
>
> Actuall, I meant we need to depend on the subsystem facilities to
> handle this (e.g the rtnl_lock() ndo_XYZ) instead of inventing per
> driver flags. I believe virtio is not the only driver that suffers
> from those races.
>
> >
> >
> > > >
> > > > With the proposal I think that we don't need to fix all drivers and
> > > > in my eyes that is a huge advantage because frankly I'm fine with
> > > > more work on strict spec compliance taking more than expected
> > > > but I would like the hardening work to finally be done.
> > >
> > > Ok, but what I meant is, with your proposal if a buggy drive kicks
> > > before DRIVER_OK, it suppresses the effect of hardening?
> >
> > I don't see how. DRIVER_OK is for device's benefit, guest does not need
> > it.
> >
> > > > I am not sure the amount of effort expended on the hardening here is
> > > > proportionate to the benefit it provides.
> > >
> > > Probably, but we received those bug reports from the confidential
> > > computing guys.
> > >
> > > Or can we choose to go another way, let the kconfig option enabled for
> > > TDX/SEV, and then fix the bugs only if it is reported?
> >
> > Nah that's really fragmenting out userbase.
>
> True.
>
> >
> > > >
> > > >
> > > >
> > > > > >
> > > > > > need to think about the reset path - it already synchronizes callbacks
> > > > > > and already can lose interrupts so we just need to clear vq->kicked
> > > > > > before that, right?
> > > > >
> > > > > Probably.
> > > > >
> > > > > >
> > > > > >
> > > > > > > 2) seems require more work in the IRQ core and it can not work for all
> > > > > > > transports (e.g vDPA would be kind of difficult)
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > > Hmm I don't really get why would it be difficult.
> > > > > > VDPA is mostly PCI isn't it? With PCI both level INT#x and edge MSI
> > > > > > have interrupt masking support.
> > > > >
> > > > > Yes, but consider the case of mlx5_vdpa, PCI stuff was hidden under
> > > > > the auxiliary bus. And that is the way another vendor will go.
> > > > >
> > > > > Thanks
> > > >
> > > > A bunch of callbacks will do it I guess.
> > >
> > > Possible but looks like a layer violation, I think auxiliary stuff
> > > wants to hide the underlayer architecture. That is why I tend to do it
> > > in the virtio core. And actually, transport is freed to implement
> > > another layer of those synchronization if it wants.
> > >
> > > Thanks
> >
> > As usual a level of indirection will help with layering.
>
> Probably.
>
> Thanks
>
> >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > > Changes since V2:
> > > > > > > > > > > - Tweak the Kconfig help
> > > > > > > > > > > - Add comment for the read_lock() pairing in virtio_ccw
> > > > > > > > > > > ---
> > > > > > > > > > >  drivers/s390/virtio/virtio_ccw.c |  9 ++++++++-
> > > > > > > > > > >  drivers/virtio/Kconfig           | 13 +++++++++++++
> > > > > > > > > > >  drivers/virtio/virtio.c          |  2 ++
> > > > > > > > > > >  drivers/virtio/virtio_ring.c     | 12 ++++++++++++
> > > > > > > > > > >  include/linux/virtio_config.h    |  2 ++
> > > > > > > > > > >  5 files changed, 37 insertions(+), 1 deletion(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > index 97e51c34e6cf..1f6a358f65f0 100644
> > > > > > > > > > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > > > > > > > > > @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > > >                       vcdev->err = -EIO;
> > > > > > > > > > >       }
> > > > > > > > > > >       virtio_ccw_check_activity(vcdev, activity);
> > > > > > > > > > > -     /* Interrupts are disabled here */
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > +     /*
> > > > > > > > > > > +      * Paried with virtio_ccw_synchronize_cbs() and interrupts are
> > > > > > > > > > > +      * disabled here.
> > > > > > > > > > > +      */
> > > > > > > > > > >       read_lock(&vcdev->irq_lock);
> > > > > > > > > > > +#endif
> > > > > > > > > > >       for_each_set_bit(i, indicators(vcdev),
> > > > > > > > > > >                        sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
> > > > > > > > > > >               /* The bit clear must happen before the vring kick. */
> > > > > > > > > > > @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
> > > > > > > > > > >               vq = virtio_ccw_vq_by_ind(vcdev, i);
> > > > > > > > > > >               vring_interrupt(0, vq);
> > > > > > > > > > >       }
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       read_unlock(&vcdev->irq_lock);
> > > > > > > > > > > +#endif
> > > > > > > > > > >       if (test_bit(0, indicators2(vcdev))) {
> > > > > > > > > > >               virtio_config_changed(&vcdev->vdev);
> > > > > > > > > > >               clear_bit(0, indicators2(vcdev));
> > > > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > > > > > > index b5adf6abd241..c04f370a1e5c 100644
> > > > > > > > > > > --- a/drivers/virtio/Kconfig
> > > > > > > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > > > > > > @@ -35,6 +35,19 @@ menuconfig VIRTIO_MENU
> > > > > > > > > > >
> > > > > > > > > > >  if VIRTIO_MENU
> > > > > > > > > > >
> > > > > > > > > > > +config VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > > +        bool "Harden virtio notification"
> > > > > > > > > > > +        help
> > > > > > > > > > > +          Enable this to harden the device notifications and suppress
> > > > > > > > > > > +          those that happen at a time where notifications are illegal.
> > > > > > > > > > > +
> > > > > > > > > > > +          Experimental: Note that several drivers still have bugs that
> > > > > > > > > > > +          may cause crashes or hangs when correct handling of
> > > > > > > > > > > +          notifications is enforced; depending on the subset of
> > > > > > > > > > > +          drivers and devices you use, this may or may not work.
> > > > > > > > > > > +
> > > > > > > > > > > +          If unsure, say N.
> > > > > > > > > > > +
> > > > > > > > > > >  config VIRTIO_PCI
> > > > > > > > > > >       tristate "PCI driver for virtio devices"
> > > > > > > > > > >       depends on PCI
> > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > index ef04a96942bf..21dc08d2f32d 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > @@ -220,6 +220,7 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > >   * */
> > > > > > > > > > >  void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > >  {
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       /*
> > > > > > > > > > >        * The below virtio_synchronize_cbs() guarantees that any
> > > > > > > > > > >        * interrupt for this line arriving after
> > > > > > > > > > > @@ -228,6 +229,7 @@ void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > >        */
> > > > > > > > > > >       virtio_break_device(dev);
> > > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > > > +#endif
> > > > > > > > > > >
> > > > > > > > > > >       dev->config->reset(dev);
> > > > > > > > > > >  }
> > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > index 13a7348cedff..d9d3b6e201fb 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > @@ -1688,7 +1688,11 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > > > > > > > >       vq->we_own_ring = true;
> > > > > > > > > > >       vq->notify = notify;
> > > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       vq->broken = true;
> > > > > > > > > > > +#else
> > > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > > +#endif
> > > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > > @@ -2135,9 +2139,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > >       if (unlikely(vq->broken)) {
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >               dev_warn_once(&vq->vq.vdev->dev,
> > > > > > > > > > >                             "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > >               return IRQ_NONE;
> > > > > > > > > > > +#else
> > > > > > > > > > > +             return IRQ_HANDLED;
> > > > > > > > > > > +#endif
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > >       /* Just a hint for performance: so it's ok that this can be racy! */
> > > > > > > > > > > @@ -2180,7 +2188,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > > > > > > > > >       vq->we_own_ring = false;
> > > > > > > > > > >       vq->notify = notify;
> > > > > > > > > > >       vq->weak_barriers = weak_barriers;
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       vq->broken = true;
> > > > > > > > > > > +#else
> > > > > > > > > > > +     vq->broken = false;
> > > > > > > > > > > +#endif
> > > > > > > > > > >       vq->last_used_idx = 0;
> > > > > > > > > > >       vq->event_triggered = false;
> > > > > > > > > > >       vq->num_added = 0;
> > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > index 9a36051ceb76..d15c3cdda2d2 100644
> > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > >
> > > > > > > > > > >       WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > >
> > > > > > > > > > > +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
> > > > > > > > > > >       /*
> > > > > > > > > > >        * The virtio_synchronize_cbs() makes sure vring_interrupt()
> > > > > > > > > > >        * will see the driver specific setup if it sees vq->broken
> > > > > > > > > > > @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > >        */
> > > > > > > > > > >       virtio_synchronize_cbs(dev);
> > > > > > > > > > >       __virtio_unbreak_device(dev);
> > > > > > > > > > > +#endif
> > > > > > > > > > >       /*
> > > > > > > > > > >        * The transport should ensure the visibility of vq->broken
> > > > > > > > > > >        * before setting DRIVER_OK. See the comments for the transport
> > > > > > > > > > > --
> > > > > > > > > > > 2.25.1
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >
Michael S. Tsirkin June 29, 2022, 6:31 a.m. UTC | #18
On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > your review found and fixed all driver bugs?
> > > >
> > > > I don't/can't audit all bugs but the race between open/close against
> > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > think differently, let me know
> > > >
> > > > > After two attempts
> > > > > I don't feel like hoping audit will fix all bugs.
> > > >
> > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > issue is not hard but the testing, It would take at least the time of
> > > > one release to finalize I guess.
> > >
> > > Absolutely. So I am looking for a way to implement hardening that does
> > > not break existing drivers.
> >
> > I totally agree with you to seek a way without bothering the drivers.
> > Just wonder if this is possbile.
> >
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > an interrupt but deferring can lead to problems too.
> > > > > >
> > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > interrupt processing until enable_irq().
> > > > >
> > > > >
> > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > >
> > > > > > >
> > > > > > > Consider as an example
> > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > >
> > > > > What I mean is this: if we revert the above commit, things still
> > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > >
> > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > >
> > > I mean on hypervisor that starts processing queues after getting a kick
> > > even without DRIVER_OK.
> >
> > Oh right.
> >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > "this vq was kicked since reset"?
> > > > > >
> > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > callback needs to be synchronized with the kick.
> > > > >
> > > > > Note we only need to synchronize it when it changes, which is
> > > > > only during initialization and reset.
> > > >
> > > > Yes.
> > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > >
> > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > concerns - I guess it's only when we change the value so
> > > > > > >         if (!vq->kicked) {
> > > > > > >                 vq->kicked = true;
> > > > > > >                 mb();
> > > > > > >         }
> > > > > > >
> > > > > > > will do the trick, right?
> > > > > >
> > > > > > There's no much difference with the existing approach:
> > > > > >
> > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > >
> > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > no kick before DRIVER_OK.
> > > > >
> > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > to hardening
> > > >
> > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > >
> > > I don't see how - my proposal ignores DRIVER_OK issues.
> >
> > Yes, what I meant is, in your proposal, the first kick after rest is a
> > hint that the driver is ok (but actually it could not).
> >
> > >
> > > > > and in absence of config interrupts is generally easily
> > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > >
> > > > So if the kick is done before the subsystem registration, there's
> > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > early):
> > > >
> > > > virtio_device_ready()
> > > > virtqueue_kick()
> > > > /* the window */
> > > > subsystem_registration()
> > >
> > > Absolutely, however, I do not think we really have many such drivers
> > > since this has been known as a wrong thing to do since the beginning.
> > > Want to try to find any?
> >
> > Yes, let me try and update.
> 
> This is basically the device that have an RX queue, so I've found the
> following drivers:
> 
> scmi, mac80211_hwsim, vsock, bt, balloon.

Looked and I don't see it yet. Let's consider
./net/vmw_vsock/virtio_transport.c for example. Assuming we block
callbacks until the first kick, what is the issue with probe exactly?


> >
> > >I couldn't ... except maybe bluetooth
> > > but that's just maintainer nacking fixes saying he'll fix it
> > > his way ...
> > >
> > > > And during remove(), we get another window:
> > > >
> > > > subsysrem_unregistration()
> > > > /* the window */
> > > > virtio_device_reset()
> > >
> > > Same here.
> 
> Basically for the drivers that set driver_ok before registration,

I don't see what does driver_ok have to do with it.

> so
> we have a lot:
> 
> blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> iommu, caif, pmem, input, mem
> 
> So I think there's no easy way to harden the notification without
> auditing the driver one by one (especially considering the driver may
> use bh or workqueue). The problem is the notification hardening
> depends on a correct or race-free probe/remove. So we need to fix the
> issues in probe/remove then do the hardening on the notification.
> 
> Thanks

So if drivers kick but are not ready to get callbacks then let's fix
that first of all, these are racy with existing qemu even ignoring
spec compliance.
Jason Wang June 29, 2022, 7:02 a.m. UTC | #19
On Wed, Jun 29, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> > On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > > your review found and fixed all driver bugs?
> > > > >
> > > > > I don't/can't audit all bugs but the race between open/close against
> > > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > > think differently, let me know
> > > > >
> > > > > > After two attempts
> > > > > > I don't feel like hoping audit will fix all bugs.
> > > > >
> > > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > > issue is not hard but the testing, It would take at least the time of
> > > > > one release to finalize I guess.
> > > >
> > > > Absolutely. So I am looking for a way to implement hardening that does
> > > > not break existing drivers.
> > >
> > > I totally agree with you to seek a way without bothering the drivers.
> > > Just wonder if this is possbile.
> > >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > > an interrupt but deferring can lead to problems too.
> > > > > > >
> > > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > > interrupt processing until enable_irq().
> > > > > >
> > > > > >
> > > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > > >
> > > > > > > >
> > > > > > > > Consider as an example
> > > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > > >
> > > > > > What I mean is this: if we revert the above commit, things still
> > > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > > >
> > > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > > >
> > > > I mean on hypervisor that starts processing queues after getting a kick
> > > > even without DRIVER_OK.
> > >
> > > Oh right.
> > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > > "this vq was kicked since reset"?
> > > > > > >
> > > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > > callback needs to be synchronized with the kick.
> > > > > >
> > > > > > Note we only need to synchronize it when it changes, which is
> > > > > > only during initialization and reset.
> > > > >
> > > > > Yes.
> > > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > > >
> > > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > > concerns - I guess it's only when we change the value so
> > > > > > > >         if (!vq->kicked) {
> > > > > > > >                 vq->kicked = true;
> > > > > > > >                 mb();
> > > > > > > >         }
> > > > > > > >
> > > > > > > > will do the trick, right?
> > > > > > >
> > > > > > > There's no much difference with the existing approach:
> > > > > > >
> > > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > > >
> > > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > > no kick before DRIVER_OK.
> > > > > >
> > > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > > to hardening
> > > > >
> > > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > > >
> > > > I don't see how - my proposal ignores DRIVER_OK issues.
> > >
> > > Yes, what I meant is, in your proposal, the first kick after rest is a
> > > hint that the driver is ok (but actually it could not).
> > >
> > > >
> > > > > > and in absence of config interrupts is generally easily
> > > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > > >
> > > > > So if the kick is done before the subsystem registration, there's
> > > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > > early):
> > > > >
> > > > > virtio_device_ready()
> > > > > virtqueue_kick()
> > > > > /* the window */
> > > > > subsystem_registration()
> > > >
> > > > Absolutely, however, I do not think we really have many such drivers
> > > > since this has been known as a wrong thing to do since the beginning.
> > > > Want to try to find any?
> > >
> > > Yes, let me try and update.
> >
> > This is basically the device that have an RX queue, so I've found the
> > following drivers:
> >
> > scmi, mac80211_hwsim, vsock, bt, balloon.
>
> Looked and I don't see it yet. Let's consider
> ./net/vmw_vsock/virtio_transport.c for example. Assuming we block
> callbacks until the first kick, what is the issue with probe exactly?

We need to make sure the callback can survive when it runs before sub
system registration.

>
>
> > >
> > > >I couldn't ... except maybe bluetooth
> > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > his way ...
> > > >
> > > > > And during remove(), we get another window:
> > > > >
> > > > > subsysrem_unregistration()
> > > > > /* the window */
> > > > > virtio_device_reset()
> > > >
> > > > Same here.
> >
> > Basically for the drivers that set driver_ok before registration,
>
> I don't see what does driver_ok have to do with it.

I meant for those driver, in probe they do()

virtio_device_ready()
subsystem_register()

In remove() they do

subsystem_unregister()
virtio_device_reset()

for symmetry

>
> > so
> > we have a lot:
> >
> > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > iommu, caif, pmem, input, mem
> >
> > So I think there's no easy way to harden the notification without
> > auditing the driver one by one (especially considering the driver may
> > use bh or workqueue). The problem is the notification hardening
> > depends on a correct or race-free probe/remove. So we need to fix the
> > issues in probe/remove then do the hardening on the notification.
> >
> > Thanks
>
> So if drivers kick but are not ready to get callbacks then let's fix
> that first of all, these are racy with existing qemu even ignoring
> spec compliance.

Yes, (the patches I've posted so far exist even with a well-behaved device).

Thanks

>
>
> --
> MST
>
Michael S. Tsirkin June 29, 2022, 7:15 a.m. UTC | #20
On Wed, Jun 29, 2022 at 03:02:21PM +0800, Jason Wang wrote:
> On Wed, Jun 29, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> > > On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > > > your review found and fixed all driver bugs?
> > > > > >
> > > > > > I don't/can't audit all bugs but the race between open/close against
> > > > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > > > think differently, let me know
> > > > > >
> > > > > > > After two attempts
> > > > > > > I don't feel like hoping audit will fix all bugs.
> > > > > >
> > > > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > > > issue is not hard but the testing, It would take at least the time of
> > > > > > one release to finalize I guess.
> > > > >
> > > > > Absolutely. So I am looking for a way to implement hardening that does
> > > > > not break existing drivers.
> > > >
> > > > I totally agree with you to seek a way without bothering the drivers.
> > > > Just wonder if this is possbile.
> > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > > > an interrupt but deferring can lead to problems too.
> > > > > > > >
> > > > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > > > interrupt processing until enable_irq().
> > > > > > >
> > > > > > >
> > > > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > > > >
> > > > > > > > >
> > > > > > > > > Consider as an example
> > > > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > > > >
> > > > > > > What I mean is this: if we revert the above commit, things still
> > > > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > > > >
> > > > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > > > >
> > > > > I mean on hypervisor that starts processing queues after getting a kick
> > > > > even without DRIVER_OK.
> > > >
> > > > Oh right.
> > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > > > "this vq was kicked since reset"?
> > > > > > > >
> > > > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > > > callback needs to be synchronized with the kick.
> > > > > > >
> > > > > > > Note we only need to synchronize it when it changes, which is
> > > > > > > only during initialization and reset.
> > > > > >
> > > > > > Yes.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > > > >
> > > > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > > > concerns - I guess it's only when we change the value so
> > > > > > > > >         if (!vq->kicked) {
> > > > > > > > >                 vq->kicked = true;
> > > > > > > > >                 mb();
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > will do the trick, right?
> > > > > > > >
> > > > > > > > There's no much difference with the existing approach:
> > > > > > > >
> > > > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > > > >
> > > > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > > > no kick before DRIVER_OK.
> > > > > > >
> > > > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > > > to hardening
> > > > > >
> > > > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > > > >
> > > > > I don't see how - my proposal ignores DRIVER_OK issues.
> > > >
> > > > Yes, what I meant is, in your proposal, the first kick after rest is a
> > > > hint that the driver is ok (but actually it could not).
> > > >
> > > > >
> > > > > > > and in absence of config interrupts is generally easily
> > > > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > > > >
> > > > > > So if the kick is done before the subsystem registration, there's
> > > > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > > > early):
> > > > > >
> > > > > > virtio_device_ready()
> > > > > > virtqueue_kick()
> > > > > > /* the window */
> > > > > > subsystem_registration()
> > > > >
> > > > > Absolutely, however, I do not think we really have many such drivers
> > > > > since this has been known as a wrong thing to do since the beginning.
> > > > > Want to try to find any?
> > > >
> > > > Yes, let me try and update.
> > >
> > > This is basically the device that have an RX queue, so I've found the
> > > following drivers:
> > >
> > > scmi, mac80211_hwsim, vsock, bt, balloon.
> >
> > Looked and I don't see it yet. Let's consider
> > ./net/vmw_vsock/virtio_transport.c for example. Assuming we block
> > callbacks until the first kick, what is the issue with probe exactly?
> 
> We need to make sure the callback can survive when it runs before sub
> system registration.

With my proposal no - only if we also kick before registration.
So I do not see the issue yet.

Consider ./net/vmw_vsock/virtio_transport.c

kicks: virtio_transport_send_pkt_work,
virtio_vsock_rx_fill, virtio_vsock_event_fill

which of these triggers before we are ready to
handle callbacks?


> >
> >
> > > >
> > > > >I couldn't ... except maybe bluetooth
> > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > his way ...
> > > > >
> > > > > > And during remove(), we get another window:
> > > > > >
> > > > > > subsysrem_unregistration()
> > > > > > /* the window */
> > > > > > virtio_device_reset()
> > > > >
> > > > > Same here.
> > >
> > > Basically for the drivers that set driver_ok before registration,
> >
> > I don't see what does driver_ok have to do with it.
> 
> I meant for those driver, in probe they do()
> 
> virtio_device_ready()
> subsystem_register()
> 
> In remove() they do
> 
> subsystem_unregister()
> virtio_device_reset()
> 
> for symmetry

Let's leave remove alone for now. I am close to 100% sure we have *lots*
of issues around it, but while probe is unavoidable remove can be
avoided by blocking hotplug.


> >
> > > so
> > > we have a lot:
> > >
> > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > iommu, caif, pmem, input, mem
> > >
> > > So I think there's no easy way to harden the notification without
> > > auditing the driver one by one (especially considering the driver may
> > > use bh or workqueue). The problem is the notification hardening
> > > depends on a correct or race-free probe/remove. So we need to fix the
> > > issues in probe/remove then do the hardening on the notification.
> > >
> > > Thanks
> >
> > So if drivers kick but are not ready to get callbacks then let's fix
> > that first of all, these are racy with existing qemu even ignoring
> > spec compliance.
> 
> Yes, (the patches I've posted so far exist even with a well-behaved device).
> 
> Thanks

patches you posted deal with DRIVER_OK spec compliance.
I do not see patches for kicks before callbacks are ready to run.

> >
> >
> > --
> > MST
> >
Jason Wang June 29, 2022, 8:34 a.m. UTC | #21
On Wed, Jun 29, 2022 at 3:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 29, 2022 at 03:02:21PM +0800, Jason Wang wrote:
> > On Wed, Jun 29, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> > > > On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > > > > your review found and fixed all driver bugs?
> > > > > > >
> > > > > > > I don't/can't audit all bugs but the race between open/close against
> > > > > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > > > > think differently, let me know
> > > > > > >
> > > > > > > > After two attempts
> > > > > > > > I don't feel like hoping audit will fix all bugs.
> > > > > > >
> > > > > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > > > > issue is not hard but the testing, It would take at least the time of
> > > > > > > one release to finalize I guess.
> > > > > >
> > > > > > Absolutely. So I am looking for a way to implement hardening that does
> > > > > > not break existing drivers.
> > > > >
> > > > > I totally agree with you to seek a way without bothering the drivers.
> > > > > Just wonder if this is possbile.
> > > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > > > > an interrupt but deferring can lead to problems too.
> > > > > > > > >
> > > > > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > > > > interrupt processing until enable_irq().
> > > > > > > >
> > > > > > > >
> > > > > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Consider as an example
> > > > > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > > > > >
> > > > > > > > What I mean is this: if we revert the above commit, things still
> > > > > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > > > > >
> > > > > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > > > > >
> > > > > > I mean on hypervisor that starts processing queues after getting a kick
> > > > > > even without DRIVER_OK.
> > > > >
> > > > > Oh right.
> > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > > > > "this vq was kicked since reset"?
> > > > > > > > >
> > > > > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > > > > callback needs to be synchronized with the kick.
> > > > > > > >
> > > > > > > > Note we only need to synchronize it when it changes, which is
> > > > > > > > only during initialization and reset.
> > > > > > >
> > > > > > > Yes.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > > > > >
> > > > > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > > > > concerns - I guess it's only when we change the value so
> > > > > > > > > >         if (!vq->kicked) {
> > > > > > > > > >                 vq->kicked = true;
> > > > > > > > > >                 mb();
> > > > > > > > > >         }
> > > > > > > > > >
> > > > > > > > > > will do the trick, right?
> > > > > > > > >
> > > > > > > > > There's no much difference with the existing approach:
> > > > > > > > >
> > > > > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > > > > >
> > > > > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > > > > no kick before DRIVER_OK.
> > > > > > > >
> > > > > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > > > > to hardening
> > > > > > >
> > > > > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > > > > >
> > > > > > I don't see how - my proposal ignores DRIVER_OK issues.
> > > > >
> > > > > Yes, what I meant is, in your proposal, the first kick after rest is a
> > > > > hint that the driver is ok (but actually it could not).
> > > > >
> > > > > >
> > > > > > > > and in absence of config interrupts is generally easily
> > > > > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > > > > >
> > > > > > > So if the kick is done before the subsystem registration, there's
> > > > > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > > > > early):
> > > > > > >
> > > > > > > virtio_device_ready()
> > > > > > > virtqueue_kick()
> > > > > > > /* the window */
> > > > > > > subsystem_registration()
> > > > > >
> > > > > > Absolutely, however, I do not think we really have many such drivers
> > > > > > since this has been known as a wrong thing to do since the beginning.
> > > > > > Want to try to find any?
> > > > >
> > > > > Yes, let me try and update.
> > > >
> > > > This is basically the device that have an RX queue, so I've found the
> > > > following drivers:
> > > >
> > > > scmi, mac80211_hwsim, vsock, bt, balloon.
> > >
> > > Looked and I don't see it yet. Let's consider
> > > ./net/vmw_vsock/virtio_transport.c for example. Assuming we block
> > > callbacks until the first kick, what is the issue with probe exactly?
> >
> > We need to make sure the callback can survive when it runs before sub
> > system registration.
>
> With my proposal no - only if we also kick before registration.
> So I do not see the issue yet.
>
> Consider ./net/vmw_vsock/virtio_transport.c
>
> kicks: virtio_transport_send_pkt_work,
> virtio_vsock_rx_fill, virtio_vsock_event_fill
>
> which of these triggers before we are ready to
> handle callbacks?

So:

virtio_vsock_vqs_init()
    virtio_device_ready()
    virtio_vsock_rx_fill() /* kick there */
rcu_assign_pointer(the_virtio_vsock, vsock)

It means at least virtio_vsock_rx_done()/virtio_vsock_workqueue needs
to survive. I don't say it has a bug but we do need to audit the code
in this case. The implication is: the virtqueue callback should be
written with no assumption that the driver has registered in the
subsystem. We don't or can't assume all drivers are written in this
way.

>
>
> > >
> > >
> > > > >
> > > > > >I couldn't ... except maybe bluetooth
> > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > his way ...
> > > > > >
> > > > > > > And during remove(), we get another window:
> > > > > > >
> > > > > > > subsysrem_unregistration()
> > > > > > > /* the window */
> > > > > > > virtio_device_reset()
> > > > > >
> > > > > > Same here.
> > > >
> > > > Basically for the drivers that set driver_ok before registration,
> > >
> > > I don't see what does driver_ok have to do with it.
> >
> > I meant for those driver, in probe they do()
> >
> > virtio_device_ready()
> > subsystem_register()
> >
> > In remove() they do
> >
> > subsystem_unregister()
> > virtio_device_reset()
> >
> > for symmetry
>
> Let's leave remove alone for now. I am close to 100% sure we have *lots*
> of issues around it, but while probe is unavoidable remove can be
> avoided by blocking hotplug.

Unbind can trigger this path as well.

>
>
> > >
> > > > so
> > > > we have a lot:
> > > >
> > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > iommu, caif, pmem, input, mem
> > > >
> > > > So I think there's no easy way to harden the notification without
> > > > auditing the driver one by one (especially considering the driver may
> > > > use bh or workqueue). The problem is the notification hardening
> > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > issues in probe/remove then do the hardening on the notification.
> > > >
> > > > Thanks
> > >
> > > So if drivers kick but are not ready to get callbacks then let's fix
> > > that first of all, these are racy with existing qemu even ignoring
> > > spec compliance.
> >
> > Yes, (the patches I've posted so far exist even with a well-behaved device).
> >
> > Thanks
>
> patches you posted deal with DRIVER_OK spec compliance.
> I do not see patches for kicks before callbacks are ready to run.

Yes.

Thanks

>
> > >
> > >
> > > --
> > > MST
> > >
>
Michael S. Tsirkin June 29, 2022, 8:52 a.m. UTC | #22
On Wed, Jun 29, 2022 at 04:34:36PM +0800, Jason Wang wrote:
> On Wed, Jun 29, 2022 at 3:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jun 29, 2022 at 03:02:21PM +0800, Jason Wang wrote:
> > > On Wed, Jun 29, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> > > > > On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > > > > > your review found and fixed all driver bugs?
> > > > > > > >
> > > > > > > > I don't/can't audit all bugs but the race between open/close against
> > > > > > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > > > > > think differently, let me know
> > > > > > > >
> > > > > > > > > After two attempts
> > > > > > > > > I don't feel like hoping audit will fix all bugs.
> > > > > > > >
> > > > > > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > > > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > > > > > issue is not hard but the testing, It would take at least the time of
> > > > > > > > one release to finalize I guess.
> > > > > > >
> > > > > > > Absolutely. So I am looking for a way to implement hardening that does
> > > > > > > not break existing drivers.
> > > > > >
> > > > > > I totally agree with you to seek a way without bothering the drivers.
> > > > > > Just wonder if this is possbile.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > > > > > an interrupt but deferring can lead to problems too.
> > > > > > > > > >
> > > > > > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > > > > > interrupt processing until enable_irq().
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Consider as an example
> > > > > > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > > > > > >
> > > > > > > > > What I mean is this: if we revert the above commit, things still
> > > > > > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > > > > > >
> > > > > > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > > > > > >
> > > > > > > I mean on hypervisor that starts processing queues after getting a kick
> > > > > > > even without DRIVER_OK.
> > > > > >
> > > > > > Oh right.
> > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > > > > > "this vq was kicked since reset"?
> > > > > > > > > >
> > > > > > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > > > > > callback needs to be synchronized with the kick.
> > > > > > > > >
> > > > > > > > > Note we only need to synchronize it when it changes, which is
> > > > > > > > > only during initialization and reset.
> > > > > > > >
> > > > > > > > Yes.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > > > > > >
> > > > > > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > > > > > concerns - I guess it's only when we change the value so
> > > > > > > > > > >         if (!vq->kicked) {
> > > > > > > > > > >                 vq->kicked = true;
> > > > > > > > > > >                 mb();
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > > will do the trick, right?
> > > > > > > > > >
> > > > > > > > > > There's no much difference with the existing approach:
> > > > > > > > > >
> > > > > > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > > > > > >
> > > > > > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > > > > > no kick before DRIVER_OK.
> > > > > > > > >
> > > > > > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > > > > > to hardening
> > > > > > > >
> > > > > > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > > > > > >
> > > > > > > I don't see how - my proposal ignores DRIVER_OK issues.
> > > > > >
> > > > > > Yes, what I meant is, in your proposal, the first kick after rest is a
> > > > > > hint that the driver is ok (but actually it could not).
> > > > > >
> > > > > > >
> > > > > > > > > and in absence of config interrupts is generally easily
> > > > > > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > > > > > >
> > > > > > > > So if the kick is done before the subsystem registration, there's
> > > > > > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > > > > > early):
> > > > > > > >
> > > > > > > > virtio_device_ready()
> > > > > > > > virtqueue_kick()
> > > > > > > > /* the window */
> > > > > > > > subsystem_registration()
> > > > > > >
> > > > > > > Absolutely, however, I do not think we really have many such drivers
> > > > > > > since this has been known as a wrong thing to do since the beginning.
> > > > > > > Want to try to find any?
> > > > > >
> > > > > > Yes, let me try and update.
> > > > >
> > > > > This is basically the device that have an RX queue, so I've found the
> > > > > following drivers:
> > > > >
> > > > > scmi, mac80211_hwsim, vsock, bt, balloon.
> > > >
> > > > Looked and I don't see it yet. Let's consider
> > > > ./net/vmw_vsock/virtio_transport.c for example. Assuming we block
> > > > callbacks until the first kick, what is the issue with probe exactly?
> > >
> > > We need to make sure the callback can survive when it runs before sub
> > > system registration.
> >
> > With my proposal no - only if we also kick before registration.
> > So I do not see the issue yet.
> >
> > Consider ./net/vmw_vsock/virtio_transport.c
> >
> > kicks: virtio_transport_send_pkt_work,
> > virtio_vsock_rx_fill, virtio_vsock_event_fill
> >
> > which of these triggers before we are ready to
> > handle callbacks?
> 
> So:
> 
> virtio_vsock_vqs_init()
>     virtio_device_ready()
>     virtio_vsock_rx_fill() /* kick there */
> rcu_assign_pointer(the_virtio_vsock, vsock)
> 
> It means at least virtio_vsock_rx_done()/virtio_vsock_workqueue needs
> to survive. I don't say it has a bug but we do need to audit the code
> in this case. The implication is: the virtqueue callback should be
> written with no assumption that the driver has registered in the
> subsystem. We don't or can't assume all drivers are written in this
> way.


I thought you said you audited code and found bugs.

My claim is that simply because qemu starts processing
packets immediately upon kick, if bugs like this
existed we would have noticed by now.

In this case the_virtio_vsock is used for xmit things,
callbacks do not seem to use it at all.

> >
> >
> > > >
> > > >
> > > > > >
> > > > > > >I couldn't ... except maybe bluetooth
> > > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > > his way ...
> > > > > > >
> > > > > > > > And during remove(), we get another window:
> > > > > > > >
> > > > > > > > subsysrem_unregistration()
> > > > > > > > /* the window */
> > > > > > > > virtio_device_reset()
> > > > > > >
> > > > > > > Same here.
> > > > >
> > > > > Basically for the drivers that set driver_ok before registration,
> > > >
> > > > I don't see what does driver_ok have to do with it.
> > >
> > > I meant for those driver, in probe they do()
> > >
> > > virtio_device_ready()
> > > subsystem_register()
> > >
> > > In remove() they do
> > >
> > > subsystem_unregister()
> > > virtio_device_reset()
> > >
> > > for symmetry
> >
> > Let's leave remove alone for now. I am close to 100% sure we have *lots*
> > of issues around it, but while probe is unavoidable remove can be
> > avoided by blocking hotplug.
> 
> Unbind can trigger this path as well.
> 
> >
> >
> > > >
> > > > > so
> > > > > we have a lot:
> > > > >
> > > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > > iommu, caif, pmem, input, mem
> > > > >
> > > > > So I think there's no easy way to harden the notification without
> > > > > auditing the driver one by one (especially considering the driver may
> > > > > use bh or workqueue). The problem is the notification hardening
> > > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > > issues in probe/remove then do the hardening on the notification.
> > > > >
> > > > > Thanks
> > > >
> > > > So if drivers kick but are not ready to get callbacks then let's fix
> > > > that first of all, these are racy with existing qemu even ignoring
> > > > spec compliance.
> > >
> > > Yes, (the patches I've posted so far exist even with a well-behaved device).
> > >
> > > Thanks
> >
> > patches you posted deal with DRIVER_OK spec compliance.
> > I do not see patches for kicks before callbacks are ready to run.
> 
> Yes.
> 
> Thanks
> 
> >
> > > >
> > > >
> > > > --
> > > > MST
> > > >
> >
Jason Wang June 30, 2022, 2:01 a.m. UTC | #23
On Wed, Jun 29, 2022 at 4:52 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 29, 2022 at 04:34:36PM +0800, Jason Wang wrote:
> > On Wed, Jun 29, 2022 at 3:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Jun 29, 2022 at 03:02:21PM +0800, Jason Wang wrote:
> > > > On Wed, Jun 29, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> > > > > > On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > > > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > > > > > > your review found and fixed all driver bugs?
> > > > > > > > >
> > > > > > > > > I don't/can't audit all bugs but the race between open/close against
> > > > > > > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > > > > > > think differently, let me know
> > > > > > > > >
> > > > > > > > > > After two attempts
> > > > > > > > > > I don't feel like hoping audit will fix all bugs.
> > > > > > > > >
> > > > > > > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > > > > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > > > > > > issue is not hard but the testing, It would take at least the time of
> > > > > > > > > one release to finalize I guess.
> > > > > > > >
> > > > > > > > Absolutely. So I am looking for a way to implement hardening that does
> > > > > > > > not break existing drivers.
> > > > > > >
> > > > > > > I totally agree with you to seek a way without bothering the drivers.
> > > > > > > Just wonder if this is possbile.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > > > > > > an interrupt but deferring can lead to problems too.
> > > > > > > > > > >
> > > > > > > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > > > > > > interrupt processing until enable_irq().
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Consider as an example
> > > > > > > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > > > > > > >
> > > > > > > > > > What I mean is this: if we revert the above commit, things still
> > > > > > > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > > > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > > > > > > >
> > > > > > > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > > > > > > >
> > > > > > > > I mean on hypervisor that starts processing queues after getting a kick
> > > > > > > > even without DRIVER_OK.
> > > > > > >
> > > > > > > Oh right.
> > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > > > > > > "this vq was kicked since reset"?
> > > > > > > > > > >
> > > > > > > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > > > > > > callback needs to be synchronized with the kick.
> > > > > > > > > >
> > > > > > > > > > Note we only need to synchronize it when it changes, which is
> > > > > > > > > > only during initialization and reset.
> > > > > > > > >
> > > > > > > > > Yes.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > > > > > > >
> > > > > > > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > > > > > > concerns - I guess it's only when we change the value so
> > > > > > > > > > > >         if (!vq->kicked) {
> > > > > > > > > > > >                 vq->kicked = true;
> > > > > > > > > > > >                 mb();
> > > > > > > > > > > >         }
> > > > > > > > > > > >
> > > > > > > > > > > > will do the trick, right?
> > > > > > > > > > >
> > > > > > > > > > > There's no much difference with the existing approach:
> > > > > > > > > > >
> > > > > > > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > > > > > > >
> > > > > > > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > > > > > > no kick before DRIVER_OK.
> > > > > > > > > >
> > > > > > > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > > > > > > to hardening
> > > > > > > > >
> > > > > > > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > > > > > > >
> > > > > > > > I don't see how - my proposal ignores DRIVER_OK issues.
> > > > > > >
> > > > > > > Yes, what I meant is, in your proposal, the first kick after rest is a
> > > > > > > hint that the driver is ok (but actually it could not).
> > > > > > >
> > > > > > > >
> > > > > > > > > > and in absence of config interrupts is generally easily
> > > > > > > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > > > > > > >
> > > > > > > > > So if the kick is done before the subsystem registration, there's
> > > > > > > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > > > > > > early):
> > > > > > > > >
> > > > > > > > > virtio_device_ready()
> > > > > > > > > virtqueue_kick()
> > > > > > > > > /* the window */
> > > > > > > > > subsystem_registration()
> > > > > > > >
> > > > > > > > Absolutely, however, I do not think we really have many such drivers
> > > > > > > > since this has been known as a wrong thing to do since the beginning.
> > > > > > > > Want to try to find any?
> > > > > > >
> > > > > > > Yes, let me try and update.
> > > > > >
> > > > > > This is basically the device that have an RX queue, so I've found the
> > > > > > following drivers:
> > > > > >
> > > > > > scmi, mac80211_hwsim, vsock, bt, balloon.
> > > > >
> > > > > Looked and I don't see it yet. Let's consider
> > > > > ./net/vmw_vsock/virtio_transport.c for example. Assuming we block
> > > > > callbacks until the first kick, what is the issue with probe exactly?
> > > >
> > > > We need to make sure the callback can survive when it runs before sub
> > > > system registration.
> > >
> > > With my proposal no - only if we also kick before registration.
> > > So I do not see the issue yet.
> > >
> > > Consider ./net/vmw_vsock/virtio_transport.c
> > >
> > > kicks: virtio_transport_send_pkt_work,
> > > virtio_vsock_rx_fill, virtio_vsock_event_fill
> > >
> > > which of these triggers before we are ready to
> > > handle callbacks?
> >
> > So:
> >
> > virtio_vsock_vqs_init()
> >     virtio_device_ready()
> >     virtio_vsock_rx_fill() /* kick there */
> > rcu_assign_pointer(the_virtio_vsock, vsock)
> >
> > It means at least virtio_vsock_rx_done()/virtio_vsock_workqueue needs
> > to survive. I don't say it has a bug but we do need to audit the code
> > in this case. The implication is: the virtqueue callback should be
> > written with no assumption that the driver has registered in the
> > subsystem. We don't or can't assume all drivers are written in this
> > way.
>
>
> I thought you said you audited code and found bugs.
>
> My claim is that simply because qemu starts processing
> packets immediately upon kick, if bugs like this
> existed we would have noticed by now.

This is true for a well behaved hypervisor. But what we want to deal
with is the buggy/malicious hypervisors.

>
> In this case the_virtio_vsock is used for xmit things,
> callbacks do not seem to use it at all.

So the hypervisor can trigger the notification just after the kick and
the work function seems to be safe.

One another example for this is in virtcons_probe():

        spin_lock_init(&portdev->ports_lock);
        INIT_LIST_HEAD(&portdev->ports);
        INIT_LIST_HEAD(&portdev->list);

        virtio_device_ready(portdev->vdev);

        INIT_WORK(&portdev->config_work, &config_work_handler);
        INIT_WORK(&portdev->control_work, &control_work_handler);

in control_intr() we had:

static void control_intr(struct virtqueue *vq)
{
        struct ports_device *portdev;

        portdev = vq->vdev->priv;
        schedule_work(&portdev->control_work);
}

So we might crash if the notification is raised just after
virtio_device_ready().

This is not an exact example of when a callback is not ready after
kick, but it demonstrates that the callback could have assumed that
all setup has been done when it is called.

Thanks

>
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > >I couldn't ... except maybe bluetooth
> > > > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > > > his way ...
> > > > > > > >
> > > > > > > > > And during remove(), we get another window:
> > > > > > > > >
> > > > > > > > > subsysrem_unregistration()
> > > > > > > > > /* the window */
> > > > > > > > > virtio_device_reset()
> > > > > > > >
> > > > > > > > Same here.
> > > > > >
> > > > > > Basically for the drivers that set driver_ok before registration,
> > > > >
> > > > > I don't see what does driver_ok have to do with it.
> > > >
> > > > I meant for those driver, in probe they do()
> > > >
> > > > virtio_device_ready()
> > > > subsystem_register()
> > > >
> > > > In remove() they do
> > > >
> > > > subsystem_unregister()
> > > > virtio_device_reset()
> > > >
> > > > for symmetry
> > >
> > > Let's leave remove alone for now. I am close to 100% sure we have *lots*
> > > of issues around it, but while probe is unavoidable remove can be
> > > avoided by blocking hotplug.
> >
> > Unbind can trigger this path as well.
> >
> > >
> > >
> > > > >
> > > > > > so
> > > > > > we have a lot:
> > > > > >
> > > > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > > > iommu, caif, pmem, input, mem
> > > > > >
> > > > > > So I think there's no easy way to harden the notification without
> > > > > > auditing the driver one by one (especially considering the driver may
> > > > > > use bh or workqueue). The problem is the notification hardening
> > > > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > > > issues in probe/remove then do the hardening on the notification.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > So if drivers kick but are not ready to get callbacks then let's fix
> > > > > that first of all, these are racy with existing qemu even ignoring
> > > > > spec compliance.
> > > >
> > > > Yes, (the patches I've posted so far exist even with a well-behaved device).
> > > >
> > > > Thanks
> > >
> > > patches you posted deal with DRIVER_OK spec compliance.
> > > I do not see patches for kicks before callbacks are ready to run.
> >
> > Yes.
> >
> > Thanks
> >
> > >
> > > > >
> > > > >
> > > > > --
> > > > > MST
> > > > >
> > >
>
Michael S. Tsirkin June 30, 2022, 8:38 a.m. UTC | #24
On Thu, Jun 30, 2022 at 10:01:16AM +0800, Jason Wang wrote:
> On Wed, Jun 29, 2022 at 4:52 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jun 29, 2022 at 04:34:36PM +0800, Jason Wang wrote:
> > > On Wed, Jun 29, 2022 at 3:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Jun 29, 2022 at 03:02:21PM +0800, Jason Wang wrote:
> > > > > On Wed, Jun 29, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> > > > > > > On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > > > > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > > > > > > > your review found and fixed all driver bugs?
> > > > > > > > > >
> > > > > > > > > > I don't/can't audit all bugs but the race between open/close against
> > > > > > > > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > > > > > > > think differently, let me know
> > > > > > > > > >
> > > > > > > > > > > After two attempts
> > > > > > > > > > > I don't feel like hoping audit will fix all bugs.
> > > > > > > > > >
> > > > > > > > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > > > > > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > > > > > > > issue is not hard but the testing, It would take at least the time of
> > > > > > > > > > one release to finalize I guess.
> > > > > > > > >
> > > > > > > > > Absolutely. So I am looking for a way to implement hardening that does
> > > > > > > > > not break existing drivers.
> > > > > > > >
> > > > > > > > I totally agree with you to seek a way without bothering the drivers.
> > > > > > > > Just wonder if this is possbile.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > > > > > > > an interrupt but deferring can lead to problems too.
> > > > > > > > > > > >
> > > > > > > > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > > > > > > > interrupt processing until enable_irq().
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Consider as an example
> > > > > > > > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > > > > > > > >
> > > > > > > > > > > What I mean is this: if we revert the above commit, things still
> > > > > > > > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > > > > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > > > > > > > >
> > > > > > > > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > > > > > > > >
> > > > > > > > > I mean on hypervisor that starts processing queues after getting a kick
> > > > > > > > > even without DRIVER_OK.
> > > > > > > >
> > > > > > > > Oh right.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > > > > > > > "this vq was kicked since reset"?
> > > > > > > > > > > >
> > > > > > > > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > > > > > > > callback needs to be synchronized with the kick.
> > > > > > > > > > >
> > > > > > > > > > > Note we only need to synchronize it when it changes, which is
> > > > > > > > > > > only during initialization and reset.
> > > > > > > > > >
> > > > > > > > > > Yes.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > > > > > > > >
> > > > > > > > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > > > > > > > concerns - I guess it's only when we change the value so
> > > > > > > > > > > > >         if (!vq->kicked) {
> > > > > > > > > > > > >                 vq->kicked = true;
> > > > > > > > > > > > >                 mb();
> > > > > > > > > > > > >         }
> > > > > > > > > > > > >
> > > > > > > > > > > > > will do the trick, right?
> > > > > > > > > > > >
> > > > > > > > > > > > There's no much difference with the existing approach:
> > > > > > > > > > > >
> > > > > > > > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > > > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > > > > > > > >
> > > > > > > > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > > > > > > > no kick before DRIVER_OK.
> > > > > > > > > > >
> > > > > > > > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > > > > > > > to hardening
> > > > > > > > > >
> > > > > > > > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > > > > > > > >
> > > > > > > > > I don't see how - my proposal ignores DRIVER_OK issues.
> > > > > > > >
> > > > > > > > Yes, what I meant is, in your proposal, the first kick after rest is a
> > > > > > > > hint that the driver is ok (but actually it could not).
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > > and in absence of config interrupts is generally easily
> > > > > > > > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > > > > > > > >
> > > > > > > > > > So if the kick is done before the subsystem registration, there's
> > > > > > > > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > > > > > > > early):
> > > > > > > > > >
> > > > > > > > > > virtio_device_ready()
> > > > > > > > > > virtqueue_kick()
> > > > > > > > > > /* the window */
> > > > > > > > > > subsystem_registration()
> > > > > > > > >
> > > > > > > > > Absolutely, however, I do not think we really have many such drivers
> > > > > > > > > since this has been known as a wrong thing to do since the beginning.
> > > > > > > > > Want to try to find any?
> > > > > > > >
> > > > > > > > Yes, let me try and update.
> > > > > > >
> > > > > > > This is basically the device that have an RX queue, so I've found the
> > > > > > > following drivers:
> > > > > > >
> > > > > > > scmi, mac80211_hwsim, vsock, bt, balloon.
> > > > > >
> > > > > > Looked and I don't see it yet. Let's consider
> > > > > > ./net/vmw_vsock/virtio_transport.c for example. Assuming we block
> > > > > > callbacks until the first kick, what is the issue with probe exactly?
> > > > >
> > > > > We need to make sure the callback can survive when it runs before sub
> > > > > system registration.
> > > >
> > > > With my proposal no - only if we also kick before registration.
> > > > So I do not see the issue yet.
> > > >
> > > > Consider ./net/vmw_vsock/virtio_transport.c
> > > >
> > > > kicks: virtio_transport_send_pkt_work,
> > > > virtio_vsock_rx_fill, virtio_vsock_event_fill
> > > >
> > > > which of these triggers before we are ready to
> > > > handle callbacks?
> > >
> > > So:
> > >
> > > virtio_vsock_vqs_init()
> > >     virtio_device_ready()
> > >     virtio_vsock_rx_fill() /* kick there */
> > > rcu_assign_pointer(the_virtio_vsock, vsock)
> > >
> > > It means at least virtio_vsock_rx_done()/virtio_vsock_workqueue needs
> > > to survive. I don't say it has a bug but we do need to audit the code
> > > in this case. The implication is: the virtqueue callback should be
> > > written with no assumption that the driver has registered in the
> > > subsystem. We don't or can't assume all drivers are written in this
> > > way.
> >
> >
> > I thought you said you audited code and found bugs.
> >
> > My claim is that simply because qemu starts processing
> > packets immediately upon kick, if bugs like this
> > existed we would have noticed by now.
> 
> This is true for a well behaved hypervisor. But what we want to deal
> with is the buggy/malicious hypervisors.




> >
> > In this case the_virtio_vsock is used for xmit things,
> > callbacks do not seem to use it at all.
> 
> So the hypervisor can trigger the notification just after the kick and
> the work function seems to be safe.
> 
> One another example for this is in virtcons_probe():
> 
>         spin_lock_init(&portdev->ports_lock);
>         INIT_LIST_HEAD(&portdev->ports);
>         INIT_LIST_HEAD(&portdev->list);
> 
>         virtio_device_ready(portdev->vdev);
> 
>         INIT_WORK(&portdev->config_work, &config_work_handler);
>         INIT_WORK(&portdev->control_work, &control_work_handler);
> 
> in control_intr() we had:
> 
> static void control_intr(struct virtqueue *vq)
> {
>         struct ports_device *portdev;
> 
>         portdev = vq->vdev->priv;
>         schedule_work(&portdev->control_work);
> }
> 
> So we might crash if the notification is raised just after
> virtio_device_ready().

Yes! But this is not my proposal. This is yours.
Your patches block interrupts until virtio_device_ready.

My proposal is to block them until kick.

In this case kick is in fill_queue after INIT_WORK.


> This is not an exact example of when a callback is not ready after
> kick, but it demonstrates that the callback could have assumed that
> all setup has been done when it is called.
> 
> Thanks

So if there are not examples of callbacks not ready after kick
then let us block callbacks until first kick. That is my idea.


> >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > >I couldn't ... except maybe bluetooth
> > > > > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > > > > his way ...
> > > > > > > > >
> > > > > > > > > > And during remove(), we get another window:
> > > > > > > > > >
> > > > > > > > > > subsysrem_unregistration()
> > > > > > > > > > /* the window */
> > > > > > > > > > virtio_device_reset()
> > > > > > > > >
> > > > > > > > > Same here.
> > > > > > >
> > > > > > > Basically for the drivers that set driver_ok before registration,
> > > > > >
> > > > > > I don't see what does driver_ok have to do with it.
> > > > >
> > > > > I meant for those driver, in probe they do()
> > > > >
> > > > > virtio_device_ready()
> > > > > subsystem_register()
> > > > >
> > > > > In remove() they do
> > > > >
> > > > > subsystem_unregister()
> > > > > virtio_device_reset()
> > > > >
> > > > > for symmetry
> > > >
> > > > Let's leave remove alone for now. I am close to 100% sure we have *lots*
> > > > of issues around it, but while probe is unavoidable remove can be
> > > > avoided by blocking hotplug.
> > >
> > > Unbind can trigger this path as well.
> > >
> > > >
> > > >
> > > > > >
> > > > > > > so
> > > > > > > we have a lot:
> > > > > > >
> > > > > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > > > > iommu, caif, pmem, input, mem
> > > > > > >
> > > > > > > So I think there's no easy way to harden the notification without
> > > > > > > auditing the driver one by one (especially considering the driver may
> > > > > > > use bh or workqueue). The problem is the notification hardening
> > > > > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > > > > issues in probe/remove then do the hardening on the notification.
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > > So if drivers kick but are not ready to get callbacks then let's fix
> > > > > > that first of all, these are racy with existing qemu even ignoring
> > > > > > spec compliance.
> > > > >
> > > > > Yes, (the patches I've posted so far exist even with a well-behaved device).
> > > > >
> > > > > Thanks
> > > >
> > > > patches you posted deal with DRIVER_OK spec compliance.
> > > > I do not see patches for kicks before callbacks are ready to run.
> > >
> > > Yes.
> > >
> > > Thanks
> > >
> > > >
> > > > > >
> > > > > >
> > > > > > --
> > > > > > MST
> > > > > >
> > > >
> >
Jason Wang July 4, 2022, 4:23 a.m. UTC | #25
On Thu, Jun 30, 2022 at 4:38 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Jun 30, 2022 at 10:01:16AM +0800, Jason Wang wrote:
> > On Wed, Jun 29, 2022 at 4:52 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Jun 29, 2022 at 04:34:36PM +0800, Jason Wang wrote:
> > > > On Wed, Jun 29, 2022 at 3:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Jun 29, 2022 at 03:02:21PM +0800, Jason Wang wrote:
> > > > > > On Wed, Jun 29, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Wed, Jun 29, 2022 at 12:07:11PM +0800, Jason Wang wrote:
> > > > > > > > On Tue, Jun 28, 2022 at 2:17 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Jun 28, 2022 at 1:00 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Tue, Jun 28, 2022 at 11:49:12AM +0800, Jason Wang wrote:
> > > > > > > > > > > > Heh. Yea sure. But things work fine for people. What is the chance
> > > > > > > > > > > > your review found and fixed all driver bugs?
> > > > > > > > > > >
> > > > > > > > > > > I don't/can't audit all bugs but the race between open/close against
> > > > > > > > > > > ready/reset. It looks to me a good chance to fix them all but if you
> > > > > > > > > > > think differently, let me know
> > > > > > > > > > >
> > > > > > > > > > > > After two attempts
> > > > > > > > > > > > I don't feel like hoping audit will fix all bugs.
> > > > > > > > > > >
> > > > > > > > > > > I've started the auditing and have 15+ patches in the queue. (only
> > > > > > > > > > > covers bluetooth, console, pmem, virtio-net and caif). Spotting the
> > > > > > > > > > > issue is not hard but the testing, It would take at least the time of
> > > > > > > > > > > one release to finalize I guess.
> > > > > > > > > >
> > > > > > > > > > Absolutely. So I am looking for a way to implement hardening that does
> > > > > > > > > > not break existing drivers.
> > > > > > > > >
> > > > > > > > > I totally agree with you to seek a way without bothering the drivers.
> > > > > > > > > Just wonder if this is possbile.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > The reason config was kind of easy is that config interrupt is rarely
> > > > > > > > > > > > > > vital for device function so arbitrarily deferring that does not lead to
> > > > > > > > > > > > > > deadlocks - what you are trying to do with VQ interrupts is
> > > > > > > > > > > > > > fundamentally different. Things are especially bad if we just drop
> > > > > > > > > > > > > > an interrupt but deferring can lead to problems too.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I'm not sure I see the difference, disable_irq() stuffs also delay the
> > > > > > > > > > > > > interrupt processing until enable_irq().
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Absolutely. I am not at all sure disable_irq fixes all problems.
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Consider as an example
> > > > > > > > > > > > > >     virtio-net: fix race between ndo_open() and virtio_device_ready()
> > > > > > > > > > > > > > if you just defer vq interrupts you get deadlocks.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > I don't see a deadlock here, maybe you can show more detail on this?
> > > > > > > > > > > >
> > > > > > > > > > > > What I mean is this: if we revert the above commit, things still
> > > > > > > > > > > > work (out of spec, but still). If we revert and defer interrupts until
> > > > > > > > > > > > device ready then ndo_open that triggers before device ready deadlocks.
> > > > > > > > > > >
> > > > > > > > > > > Ok, I guess you meant on a hypervisor that is strictly written with spec.
> > > > > > > > > >
> > > > > > > > > > I mean on hypervisor that starts processing queues after getting a kick
> > > > > > > > > > even without DRIVER_OK.
> > > > > > > > >
> > > > > > > > > Oh right.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So, thinking about all this, how about a simple per vq flag meaning
> > > > > > > > > > > > > > "this vq was kicked since reset"?
> > > > > > > > > > > > >
> > > > > > > > > > > > > And ignore the notification if vq is not kicked? It sounds like the
> > > > > > > > > > > > > callback needs to be synchronized with the kick.
> > > > > > > > > > > >
> > > > > > > > > > > > Note we only need to synchronize it when it changes, which is
> > > > > > > > > > > > only during initialization and reset.
> > > > > > > > > > >
> > > > > > > > > > > Yes.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > If driver does not kick then it's not ready to get callbacks, right?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Sounds quite clean, but we need to think through memory ordering
> > > > > > > > > > > > > > concerns - I guess it's only when we change the value so
> > > > > > > > > > > > > >         if (!vq->kicked) {
> > > > > > > > > > > > > >                 vq->kicked = true;
> > > > > > > > > > > > > >                 mb();
> > > > > > > > > > > > > >         }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > will do the trick, right?
> > > > > > > > > > > > >
> > > > > > > > > > > > > There's no much difference with the existing approach:
> > > > > > > > > > > > >
> > > > > > > > > > > > > 1) your proposal implicitly makes callbacks ready in virtqueue_kick()
> > > > > > > > > > > > > 2) my proposal explicitly makes callbacks ready via virtio_device_ready()
> > > > > > > > > > > > >
> > > > > > > > > > > > > Both require careful auditing of all the existing drivers to make sure
> > > > > > > > > > > > > no kick before DRIVER_OK.
> > > > > > > > > > > >
> > > > > > > > > > > > Jason, kick before DRIVER_OK is out of spec, sure. But it is unrelated
> > > > > > > > > > > > to hardening
> > > > > > > > > > >
> > > > > > > > > > > Yes but with your proposal, it seems to couple kick with DRIVER_OK somehow.
> > > > > > > > > >
> > > > > > > > > > I don't see how - my proposal ignores DRIVER_OK issues.
> > > > > > > > >
> > > > > > > > > Yes, what I meant is, in your proposal, the first kick after rest is a
> > > > > > > > > hint that the driver is ok (but actually it could not).
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > > and in absence of config interrupts is generally easily
> > > > > > > > > > > > fixed just by sticking virtio_device_ready early in initialization.
> > > > > > > > > > >
> > > > > > > > > > > So if the kick is done before the subsystem registration, there's
> > > > > > > > > > > still a window in the middle (assuming we stick virtio_device_ready()
> > > > > > > > > > > early):
> > > > > > > > > > >
> > > > > > > > > > > virtio_device_ready()
> > > > > > > > > > > virtqueue_kick()
> > > > > > > > > > > /* the window */
> > > > > > > > > > > subsystem_registration()
> > > > > > > > > >
> > > > > > > > > > Absolutely, however, I do not think we really have many such drivers
> > > > > > > > > > since this has been known as a wrong thing to do since the beginning.
> > > > > > > > > > Want to try to find any?
> > > > > > > > >
> > > > > > > > > Yes, let me try and update.
> > > > > > > >
> > > > > > > > This is basically the device that have an RX queue, so I've found the
> > > > > > > > following drivers:
> > > > > > > >
> > > > > > > > scmi, mac80211_hwsim, vsock, bt, balloon.
> > > > > > >
> > > > > > > Looked and I don't see it yet. Let's consider
> > > > > > > ./net/vmw_vsock/virtio_transport.c for example. Assuming we block
> > > > > > > callbacks until the first kick, what is the issue with probe exactly?
> > > > > >
> > > > > > We need to make sure the callback can survive when it runs before sub
> > > > > > system registration.
> > > > >
> > > > > With my proposal no - only if we also kick before registration.
> > > > > So I do not see the issue yet.
> > > > >
> > > > > Consider ./net/vmw_vsock/virtio_transport.c
> > > > >
> > > > > kicks: virtio_transport_send_pkt_work,
> > > > > virtio_vsock_rx_fill, virtio_vsock_event_fill
> > > > >
> > > > > which of these triggers before we are ready to
> > > > > handle callbacks?
> > > >
> > > > So:
> > > >
> > > > virtio_vsock_vqs_init()
> > > >     virtio_device_ready()
> > > >     virtio_vsock_rx_fill() /* kick there */
> > > > rcu_assign_pointer(the_virtio_vsock, vsock)
> > > >
> > > > It means at least virtio_vsock_rx_done()/virtio_vsock_workqueue needs
> > > > to survive. I don't say it has a bug but we do need to audit the code
> > > > in this case. The implication is: the virtqueue callback should be
> > > > written with no assumption that the driver has registered in the
> > > > subsystem. We don't or can't assume all drivers are written in this
> > > > way.
> > >
> > >
> > > I thought you said you audited code and found bugs.
> > >
> > > My claim is that simply because qemu starts processing
> > > packets immediately upon kick, if bugs like this
> > > existed we would have noticed by now.
> >
> > This is true for a well behaved hypervisor. But what we want to deal
> > with is the buggy/malicious hypervisors.
>
>
>
>
> > >
> > > In this case the_virtio_vsock is used for xmit things,
> > > callbacks do not seem to use it at all.
> >
> > So the hypervisor can trigger the notification just after the kick and
> > the work function seems to be safe.
> >
> > One another example for this is in virtcons_probe():
> >
> >         spin_lock_init(&portdev->ports_lock);
> >         INIT_LIST_HEAD(&portdev->ports);
> >         INIT_LIST_HEAD(&portdev->list);
> >
> >         virtio_device_ready(portdev->vdev);
> >
> >         INIT_WORK(&portdev->config_work, &config_work_handler);
> >         INIT_WORK(&portdev->control_work, &control_work_handler);
> >
> > in control_intr() we had:
> >
> > static void control_intr(struct virtqueue *vq)
> > {
> >         struct ports_device *portdev;
> >
> >         portdev = vq->vdev->priv;
> >         schedule_work(&portdev->control_work);
> > }
> >
> > So we might crash if the notification is raised just after
> > virtio_device_ready().
>
> Yes! But this is not my proposal. This is yours.
> Your patches block interrupts until virtio_device_ready.
>
> My proposal is to block them until kick.
>
> In this case kick is in fill_queue after INIT_WORK.

Yes.

>
>
> > This is not an exact example of when a callback is not ready after
> > kick, but it demonstrates that the callback could have assumed that
> > all setup has been done when it is called.
> >
> > Thanks
>
> So if there are not examples of callbacks not ready after kick
> then let us block callbacks until first kick. That is my idea.

Ok, let me try. I need to drain my queue of fixes first.

Thanks

>
>
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > >I couldn't ... except maybe bluetooth
> > > > > > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > > > > > his way ...
> > > > > > > > > >
> > > > > > > > > > > And during remove(), we get another window:
> > > > > > > > > > >
> > > > > > > > > > > subsysrem_unregistration()
> > > > > > > > > > > /* the window */
> > > > > > > > > > > virtio_device_reset()
> > > > > > > > > >
> > > > > > > > > > Same here.
> > > > > > > >
> > > > > > > > Basically for the drivers that set driver_ok before registration,
> > > > > > >
> > > > > > > I don't see what does driver_ok have to do with it.
> > > > > >
> > > > > > I meant for those driver, in probe they do()
> > > > > >
> > > > > > virtio_device_ready()
> > > > > > subsystem_register()
> > > > > >
> > > > > > In remove() they do
> > > > > >
> > > > > > subsystem_unregister()
> > > > > > virtio_device_reset()
> > > > > >
> > > > > > for symmetry
> > > > >
> > > > > Let's leave remove alone for now. I am close to 100% sure we have *lots*
> > > > > of issues around it, but while probe is unavoidable remove can be
> > > > > avoided by blocking hotplug.
> > > >
> > > > Unbind can trigger this path as well.
> > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > > so
> > > > > > > > we have a lot:
> > > > > > > >
> > > > > > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > > > > > iommu, caif, pmem, input, mem
> > > > > > > >
> > > > > > > > So I think there's no easy way to harden the notification without
> > > > > > > > auditing the driver one by one (especially considering the driver may
> > > > > > > > use bh or workqueue). The problem is the notification hardening
> > > > > > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > > > > > issues in probe/remove then do the hardening on the notification.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > > So if drivers kick but are not ready to get callbacks then let's fix
> > > > > > > that first of all, these are racy with existing qemu even ignoring
> > > > > > > spec compliance.
> > > > > >
> > > > > > Yes, (the patches I've posted so far exist even with a well-behaved device).
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > patches you posted deal with DRIVER_OK spec compliance.
> > > > > I do not see patches for kicks before callbacks are ready to run.
> > > >
> > > > Yes.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > MST
> > > > > > >
> > > > >
> > >
>
Michael S. Tsirkin July 4, 2022, 6:22 a.m. UTC | #26
On Mon, Jul 04, 2022 at 12:23:27PM +0800, Jason Wang wrote:
> > So if there are not examples of callbacks not ready after kick
> > then let us block callbacks until first kick. That is my idea.
> 
> Ok, let me try. I need to drain my queue of fixes first.
> 
> Thanks

If we do find issues, another option is blocking callbacks until the
first add. A bit higher overhead as add is a more common operation
but it has even less of a chance to introduce regressions.

> >
> >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >I couldn't ... except maybe bluetooth
> > > > > > > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > > > > > > his way ...
> > > > > > > > > > >
> > > > > > > > > > > > And during remove(), we get another window:
> > > > > > > > > > > >
> > > > > > > > > > > > subsysrem_unregistration()
> > > > > > > > > > > > /* the window */
> > > > > > > > > > > > virtio_device_reset()
> > > > > > > > > > >
> > > > > > > > > > > Same here.
> > > > > > > > >
> > > > > > > > > Basically for the drivers that set driver_ok before registration,
> > > > > > > >
> > > > > > > > I don't see what does driver_ok have to do with it.
> > > > > > >
> > > > > > > I meant for those driver, in probe they do()
> > > > > > >
> > > > > > > virtio_device_ready()
> > > > > > > subsystem_register()
> > > > > > >
> > > > > > > In remove() they do
> > > > > > >
> > > > > > > subsystem_unregister()
> > > > > > > virtio_device_reset()
> > > > > > >
> > > > > > > for symmetry
> > > > > >
> > > > > > Let's leave remove alone for now. I am close to 100% sure we have *lots*
> > > > > > of issues around it, but while probe is unavoidable remove can be
> > > > > > avoided by blocking hotplug.
> > > > >
> > > > > Unbind can trigger this path as well.
> > > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > > so
> > > > > > > > > we have a lot:
> > > > > > > > >
> > > > > > > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > > > > > > iommu, caif, pmem, input, mem
> > > > > > > > >
> > > > > > > > > So I think there's no easy way to harden the notification without
> > > > > > > > > auditing the driver one by one (especially considering the driver may
> > > > > > > > > use bh or workqueue). The problem is the notification hardening
> > > > > > > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > > > > > > issues in probe/remove then do the hardening on the notification.
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > >
> > > > > > > > So if drivers kick but are not ready to get callbacks then let's fix
> > > > > > > > that first of all, these are racy with existing qemu even ignoring
> > > > > > > > spec compliance.
> > > > > > >
> > > > > > > Yes, (the patches I've posted so far exist even with a well-behaved device).
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > > patches you posted deal with DRIVER_OK spec compliance.
> > > > > > I do not see patches for kicks before callbacks are ready to run.
> > > > >
> > > > > Yes.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > --
> > > > > > > > MST
> > > > > > > >
> > > > > >
> > > >
> >
Jason Wang July 4, 2022, 6:40 a.m. UTC | #27
On Mon, Jul 4, 2022 at 2:22 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Jul 04, 2022 at 12:23:27PM +0800, Jason Wang wrote:
> > > So if there are not examples of callbacks not ready after kick
> > > then let us block callbacks until first kick. That is my idea.
> >
> > Ok, let me try. I need to drain my queue of fixes first.
> >
> > Thanks
>
> If we do find issues, another option is blocking callbacks until the
> first add. A bit higher overhead as add is a more common operation
> but it has even less of a chance to introduce regressions.

So I understand that the case of blocking until first kick but if we
block until add it means for drivers:

virtqueue_add()
virtio_device_ready()
virtqueue_kick()

We probably enlarge the window in this case.

Thanks

>
> > >
> > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >I couldn't ... except maybe bluetooth
> > > > > > > > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > > > > > > > his way ...
> > > > > > > > > > > >
> > > > > > > > > > > > > And during remove(), we get another window:
> > > > > > > > > > > > >
> > > > > > > > > > > > > subsysrem_unregistration()
> > > > > > > > > > > > > /* the window */
> > > > > > > > > > > > > virtio_device_reset()
> > > > > > > > > > > >
> > > > > > > > > > > > Same here.
> > > > > > > > > >
> > > > > > > > > > Basically for the drivers that set driver_ok before registration,
> > > > > > > > >
> > > > > > > > > I don't see what does driver_ok have to do with it.
> > > > > > > >
> > > > > > > > I meant for those driver, in probe they do()
> > > > > > > >
> > > > > > > > virtio_device_ready()
> > > > > > > > subsystem_register()
> > > > > > > >
> > > > > > > > In remove() they do
> > > > > > > >
> > > > > > > > subsystem_unregister()
> > > > > > > > virtio_device_reset()
> > > > > > > >
> > > > > > > > for symmetry
> > > > > > >
> > > > > > > Let's leave remove alone for now. I am close to 100% sure we have *lots*
> > > > > > > of issues around it, but while probe is unavoidable remove can be
> > > > > > > avoided by blocking hotplug.
> > > > > >
> > > > > > Unbind can trigger this path as well.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > > so
> > > > > > > > > > we have a lot:
> > > > > > > > > >
> > > > > > > > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > > > > > > > iommu, caif, pmem, input, mem
> > > > > > > > > >
> > > > > > > > > > So I think there's no easy way to harden the notification without
> > > > > > > > > > auditing the driver one by one (especially considering the driver may
> > > > > > > > > > use bh or workqueue). The problem is the notification hardening
> > > > > > > > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > > > > > > > issues in probe/remove then do the hardening on the notification.
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > So if drivers kick but are not ready to get callbacks then let's fix
> > > > > > > > > that first of all, these are racy with existing qemu even ignoring
> > > > > > > > > spec compliance.
> > > > > > > >
> > > > > > > > Yes, (the patches I've posted so far exist even with a well-behaved device).
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > > patches you posted deal with DRIVER_OK spec compliance.
> > > > > > > I do not see patches for kicks before callbacks are ready to run.
> > > > > >
> > > > > > Yes.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > --
> > > > > > > > > MST
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>
Michael S. Tsirkin July 4, 2022, 7 a.m. UTC | #28
On Mon, Jul 04, 2022 at 02:40:16PM +0800, Jason Wang wrote:
> On Mon, Jul 4, 2022 at 2:22 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Jul 04, 2022 at 12:23:27PM +0800, Jason Wang wrote:
> > > > So if there are not examples of callbacks not ready after kick
> > > > then let us block callbacks until first kick. That is my idea.
> > >
> > > Ok, let me try. I need to drain my queue of fixes first.
> > >
> > > Thanks
> >
> > If we do find issues, another option is blocking callbacks until the
> > first add. A bit higher overhead as add is a more common operation
> > but it has even less of a chance to introduce regressions.
> 
> So I understand that the case of blocking until first kick but if we
> block until add it means for drivers:
> 
> virtqueue_add()
> virtio_device_ready()
> virtqueue_kick()
> 
> We probably enlarge the window in this case.
> 
> Thanks

Yes but I don't know whether any drivers call add before they are ready
to get a callback. The main thing with hardening is not to break
drivers. Primum non nocere and all that.


> >
> > > >
> > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >I couldn't ... except maybe bluetooth
> > > > > > > > > > > > > but that's just maintainer nacking fixes saying he'll fix it
> > > > > > > > > > > > > his way ...
> > > > > > > > > > > > >
> > > > > > > > > > > > > > And during remove(), we get another window:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > subsysrem_unregistration()
> > > > > > > > > > > > > > /* the window */
> > > > > > > > > > > > > > virtio_device_reset()
> > > > > > > > > > > > >
> > > > > > > > > > > > > Same here.
> > > > > > > > > > >
> > > > > > > > > > > Basically for the drivers that set driver_ok before registration,
> > > > > > > > > >
> > > > > > > > > > I don't see what does driver_ok have to do with it.
> > > > > > > > >
> > > > > > > > > I meant for those driver, in probe they do()
> > > > > > > > >
> > > > > > > > > virtio_device_ready()
> > > > > > > > > subsystem_register()
> > > > > > > > >
> > > > > > > > > In remove() they do
> > > > > > > > >
> > > > > > > > > subsystem_unregister()
> > > > > > > > > virtio_device_reset()
> > > > > > > > >
> > > > > > > > > for symmetry
> > > > > > > >
> > > > > > > > Let's leave remove alone for now. I am close to 100% sure we have *lots*
> > > > > > > > of issues around it, but while probe is unavoidable remove can be
> > > > > > > > avoided by blocking hotplug.
> > > > > > >
> > > > > > > Unbind can trigger this path as well.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > so
> > > > > > > > > > > we have a lot:
> > > > > > > > > > >
> > > > > > > > > > > blk, net, mac80211_hwsim, scsi, vsock, bt, crypto, gpio, gpu, i2c,
> > > > > > > > > > > iommu, caif, pmem, input, mem
> > > > > > > > > > >
> > > > > > > > > > > So I think there's no easy way to harden the notification without
> > > > > > > > > > > auditing the driver one by one (especially considering the driver may
> > > > > > > > > > > use bh or workqueue). The problem is the notification hardening
> > > > > > > > > > > depends on a correct or race-free probe/remove. So we need to fix the
> > > > > > > > > > > issues in probe/remove then do the hardening on the notification.
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > So if drivers kick but are not ready to get callbacks then let's fix
> > > > > > > > > > that first of all, these are racy with existing qemu even ignoring
> > > > > > > > > > spec compliance.
> > > > > > > > >
> > > > > > > > > Yes, (the patches I've posted so far exist even with a well-behaved device).
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > >
> > > > > > > > patches you posted deal with DRIVER_OK spec compliance.
> > > > > > > > I do not see patches for kicks before callbacks are ready to run.
> > > > > > >
> > > > > > > Yes.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > --
> > > > > > > > > > MST
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >
diff mbox series

Patch

diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 97e51c34e6cf..1f6a358f65f0 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -1136,8 +1136,13 @@  static void virtio_ccw_int_handler(struct ccw_device *cdev,
 			vcdev->err = -EIO;
 	}
 	virtio_ccw_check_activity(vcdev, activity);
-	/* Interrupts are disabled here */
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
+	/*
+	 * Paried with virtio_ccw_synchronize_cbs() and interrupts are
+	 * disabled here.
+	 */
 	read_lock(&vcdev->irq_lock);
+#endif
 	for_each_set_bit(i, indicators(vcdev),
 			 sizeof(*indicators(vcdev)) * BITS_PER_BYTE) {
 		/* The bit clear must happen before the vring kick. */
@@ -1146,7 +1151,9 @@  static void virtio_ccw_int_handler(struct ccw_device *cdev,
 		vq = virtio_ccw_vq_by_ind(vcdev, i);
 		vring_interrupt(0, vq);
 	}
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
 	read_unlock(&vcdev->irq_lock);
+#endif
 	if (test_bit(0, indicators2(vcdev))) {
 		virtio_config_changed(&vcdev->vdev);
 		clear_bit(0, indicators2(vcdev));
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index b5adf6abd241..c04f370a1e5c 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -35,6 +35,19 @@  menuconfig VIRTIO_MENU
 
 if VIRTIO_MENU
 
+config VIRTIO_HARDEN_NOTIFICATION
+        bool "Harden virtio notification"
+        help
+          Enable this to harden the device notifications and suppress
+          those that happen at a time where notifications are illegal.
+
+          Experimental: Note that several drivers still have bugs that
+          may cause crashes or hangs when correct handling of
+          notifications is enforced; depending on the subset of
+          drivers and devices you use, this may or may not work.
+
+          If unsure, say N.
+
 config VIRTIO_PCI
 	tristate "PCI driver for virtio devices"
 	depends on PCI
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index ef04a96942bf..21dc08d2f32d 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -220,6 +220,7 @@  static int virtio_features_ok(struct virtio_device *dev)
  * */
 void virtio_reset_device(struct virtio_device *dev)
 {
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
 	/*
 	 * The below virtio_synchronize_cbs() guarantees that any
 	 * interrupt for this line arriving after
@@ -228,6 +229,7 @@  void virtio_reset_device(struct virtio_device *dev)
 	 */
 	virtio_break_device(dev);
 	virtio_synchronize_cbs(dev);
+#endif
 
 	dev->config->reset(dev);
 }
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 13a7348cedff..d9d3b6e201fb 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1688,7 +1688,11 @@  static struct virtqueue *vring_create_virtqueue_packed(
 	vq->we_own_ring = true;
 	vq->notify = notify;
 	vq->weak_barriers = weak_barriers;
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
 	vq->broken = true;
+#else
+	vq->broken = false;
+#endif
 	vq->last_used_idx = 0;
 	vq->event_triggered = false;
 	vq->num_added = 0;
@@ -2135,9 +2139,13 @@  irqreturn_t vring_interrupt(int irq, void *_vq)
 	}
 
 	if (unlikely(vq->broken)) {
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
 		dev_warn_once(&vq->vq.vdev->dev,
 			      "virtio vring IRQ raised before DRIVER_OK");
 		return IRQ_NONE;
+#else
+		return IRQ_HANDLED;
+#endif
 	}
 
 	/* Just a hint for performance: so it's ok that this can be racy! */
@@ -2180,7 +2188,11 @@  struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->we_own_ring = false;
 	vq->notify = notify;
 	vq->weak_barriers = weak_barriers;
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
 	vq->broken = true;
+#else
+	vq->broken = false;
+#endif
 	vq->last_used_idx = 0;
 	vq->event_triggered = false;
 	vq->num_added = 0;
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 9a36051ceb76..d15c3cdda2d2 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -257,6 +257,7 @@  void virtio_device_ready(struct virtio_device *dev)
 
 	WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
 
+#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
 	/*
 	 * The virtio_synchronize_cbs() makes sure vring_interrupt()
 	 * will see the driver specific setup if it sees vq->broken
@@ -264,6 +265,7 @@  void virtio_device_ready(struct virtio_device *dev)
 	 */
 	virtio_synchronize_cbs(dev);
 	__virtio_unbreak_device(dev);
+#endif
 	/*
 	 * The transport should ensure the visibility of vq->broken
 	 * before setting DRIVER_OK. See the comments for the transport