diff mbox

[v3,6/6] vhost-user: support registering external host notifiers

Message ID 20180412151232.17506-7-tiwei.bie@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tiwei Bie April 12, 2018, 3:12 p.m. UTC
This patch introduces VHOST_USER_PROTOCOL_F_HOST_NOTIFIER.
With this feature negotiated, vhost-user backend can register
memory region based host notifiers. And it will allow the guest
driver in the VM to notify the hardware accelerator at the
vhost-user backend directly.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 docs/interop/vhost-user.txt    |  33 +++++++++++
 hw/virtio/vhost-user.c         | 123 +++++++++++++++++++++++++++++++++++++++++
 include/hw/virtio/vhost-user.h |   8 +++
 3 files changed, 164 insertions(+)

Comments

Michael S. Tsirkin April 18, 2018, 4:34 p.m. UTC | #1
On Thu, Apr 12, 2018 at 11:12:32PM +0800, Tiwei Bie wrote:
> This patch introduces VHOST_USER_PROTOCOL_F_HOST_NOTIFIER.
> With this feature negotiated, vhost-user backend can register
> memory region based host notifiers. And it will allow the guest
> driver in the VM to notify the hardware accelerator at the
> vhost-user backend directly.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>

Overall I think we can merge this approach, but I have two main concerns
about this:

1. Testing. Most people do not have the virtio hardware
   so how to make sure this does not bit rot?

   I have an idea: add an option like this to libvhost-user.
   Naturally libvhost-user can not get notified about a write
   to an mmapped area, but it can write a special value there
   (e.g. all-ones?) and then poll it to detect VQ # writes.

   Then include a vhost user bridge test with an option like this.

   I'd like to see a patch doing this.

2. Memory barriers. Right now after updating the avail idx,
   virtio does smp_wmb() and then the MMIO write.
   Normal hardware drivers do wmb() which is an sfence.
   Can a PCI device read bypass index write and see a stale
   index value?
   To make virtio pci do wmb() we would need a new feature bit.
   Alternatively I guess we could maybe look at subsystem vendor/device id.

   I'd like to see a patch doing one of these things.

Thanks!

> ---
>  docs/interop/vhost-user.txt    |  33 +++++++++++
>  hw/virtio/vhost-user.c         | 123 +++++++++++++++++++++++++++++++++++++++++
>  include/hw/virtio/vhost-user.h |   8 +++
>  3 files changed, 164 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> index 534caab18a..9e57b36b20 100644
> --- a/docs/interop/vhost-user.txt
> +++ b/docs/interop/vhost-user.txt
> @@ -132,6 +132,16 @@ Depending on the request type, payload can be:
>     Payload: Size bytes array holding the contents of the virtio
>         device's configuration space
>  
> + * Vring area description
> +   -----------------------
> +   | u64 | size | offset |
> +   -----------------------
> +
> +   u64: a 64-bit integer contains vring index and flags
> +   Size: a 64-bit size of this area
> +   Offset: a 64-bit offset of this area from the start of the
> +       supplied file descriptor
> +
>  In QEMU the vhost-user message is implemented with the following struct:
>  
>  typedef struct VhostUserMsg {
> @@ -146,6 +156,7 @@ typedef struct VhostUserMsg {
>          VhostUserLog log;
>          struct vhost_iotlb_msg iotlb;
>          VhostUserConfig config;
> +        VhostUserVringArea area;
>      };
>  } QEMU_PACKED VhostUserMsg;
>  
> @@ -380,6 +391,7 @@ Protocol features
>  #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
>  #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
>  #define VHOST_USER_PROTOCOL_F_CONFIG         9
> +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  10
>  
>  Master message types
>  --------------------
> @@ -777,6 +789,27 @@ Slave message types
>       the VHOST_USER_NEED_REPLY flag, master must respond with zero when
>       operation is successfully completed, or non-zero otherwise.
>  
> + * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG
> +
> +      Id: 3
> +      Equivalent ioctl: N/A
> +      Slave payload: vring area description
> +      Master payload: N/A
> +
> +      Sets host notifier for a specified queue. The queue index is contained
> +      in the u64 field of the vring area description. The host notifier is
> +      described by the file descriptor (typically it's a VFIO device fd) which
> +      is passed as ancillary data and the size (which is mmap size and should
> +      be the same as host page size) and offset (which is mmap offset) carried
> +      in the vring area description. QEMU can mmap the file descriptor based
> +      on the size and offset to get a memory range. Registering a host notifier
> +      means mapping this memory range to the VM as the specified queue's notify
> +      MMIO region. Slave sends this request to tell QEMU to de-register the
> +      existing notifier if any and register the new notifier if the request is
> +      sent with a file descriptor.
> +      This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
> +      protocol feature has been successfully negotiated.
> +
>  VHOST_USER_PROTOCOL_F_REPLY_ACK:
>  -------------------------------
>  The original vhost-user specification only demands replies for certain
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index 791e0a4763..1cd9c7276b 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -13,6 +13,7 @@
>  #include "hw/virtio/vhost.h"
>  #include "hw/virtio/vhost-user.h"
>  #include "hw/virtio/vhost-backend.h"
> +#include "hw/virtio/virtio.h"
>  #include "hw/virtio/virtio-net.h"
>  #include "chardev/char-fe.h"
>  #include "sysemu/kvm.h"
> @@ -48,6 +49,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
>      VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
> +    VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 10,
>      VHOST_USER_PROTOCOL_F_MAX
>  };
>  
> @@ -92,6 +94,7 @@ typedef enum VhostUserSlaveRequest {
>      VHOST_USER_SLAVE_NONE = 0,
>      VHOST_USER_SLAVE_IOTLB_MSG = 1,
>      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
> +    VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
>      VHOST_USER_SLAVE_MAX
>  }  VhostUserSlaveRequest;
>  
> @@ -136,6 +139,12 @@ static VhostUserConfig c __attribute__ ((unused));
>                                     + sizeof(c.size) \
>                                     + sizeof(c.flags))
>  
> +typedef struct VhostUserVringArea {
> +    uint64_t u64;
> +    uint64_t size;
> +    uint64_t offset;
> +} VhostUserVringArea;
> +
>  typedef struct {
>      VhostUserRequest request;
>  
> @@ -157,6 +166,7 @@ typedef union {
>          struct vhost_iotlb_msg iotlb;
>          VhostUserConfig config;
>          VhostUserCryptoSession session;
> +        VhostUserVringArea area;
>  } VhostUserPayload;
>  
>  typedef struct VhostUserMsg {
> @@ -638,9 +648,37 @@ static int vhost_user_set_vring_num(struct vhost_dev *dev,
>      return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);
>  }
>  
> +static void vhost_user_host_notifier_restore(struct vhost_dev *dev,
> +                                             int queue_idx)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> +    VirtIODevice *vdev = dev->vdev;
> +
> +    if (n->addr && !n->set) {
> +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true);
> +        n->set = true;
> +    }
> +}
> +
> +static void vhost_user_host_notifier_remove(struct vhost_dev *dev,
> +                                            int queue_idx)
> +{
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> +    VirtIODevice *vdev = dev->vdev;
> +
> +    if (n->addr && n->set) {
> +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> +        n->set = false;
> +    }
> +}
> +
>  static int vhost_user_set_vring_base(struct vhost_dev *dev,
>                                       struct vhost_vring_state *ring)
>  {
> +    vhost_user_host_notifier_restore(dev, ring->index);
> +
>      return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
>  }
>  
> @@ -674,6 +712,8 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev,
>          .hdr.size = sizeof(msg.payload.state),
>      };
>  
> +    vhost_user_host_notifier_remove(dev, ring->index);
> +
>      if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
>          return -1;
>      }
> @@ -847,6 +887,76 @@ static int vhost_user_slave_handle_config_change(struct vhost_dev *dev)
>      return ret;
>  }
>  
> +static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
> +                                                       VhostUserVringArea *area,
> +                                                       int fd)
> +{
> +    int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
> +    size_t page_size = qemu_real_host_page_size;
> +    struct vhost_user *u = dev->opaque;
> +    VhostUserState *user = u->user;
> +    VirtIODevice *vdev = dev->vdev;
> +    VhostUserHostNotifier *n;
> +    int ret = 0;
> +    void *addr;
> +    char *name;
> +
> +    if (!virtio_has_feature(dev->protocol_features,
> +                            VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
> +        vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    n = &user->notifier[queue_idx];
> +
> +    if (n->addr) {
> +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> +        object_unparent(OBJECT(&n->mr));
> +        munmap(n->addr, page_size);
> +        n->addr = NULL;
> +    }
> +
> +    if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
> +        goto out;
> +    }
> +
> +    /* Sanity check. */
> +    if (area->size != page_size) {
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +                fd, area->offset);
> +    if (addr == MAP_FAILED) {
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
> +                           user, queue_idx);
> +    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
> +                                      page_size, addr);
> +    g_free(name);
> +
> +    if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
> +        munmap(addr, page_size);
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    n->addr = addr;
> +    n->set = true;
> +
> +out:
> +    /* Always close the fd. */
> +    if (fd != -1) {
> +        close(fd);
> +    }
> +    return ret;
> +}
> +
>  static void slave_read(void *opaque)
>  {
>      struct vhost_dev *dev = opaque;
> @@ -913,6 +1023,10 @@ static void slave_read(void *opaque)
>      case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
>          ret = vhost_user_slave_handle_config_change(dev);
>          break;
> +    case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
> +        ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
> +                                                          fd);
> +        break;
>      default:
>          error_report("Received unexpected msg type.");
>          if (fd != -1) {
> @@ -1641,6 +1755,15 @@ VhostUserState *vhost_user_init(void)
>  
>  void vhost_user_cleanup(VhostUserState *user)
>  {
> +    int i;
> +
> +    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
> +        if (user->notifier[i].addr) {
> +            object_unparent(OBJECT(&user->notifier[i].mr));
> +            munmap(user->notifier[i].addr, qemu_real_host_page_size);
> +            user->notifier[i].addr = NULL;
> +        }
> +    }
>  }
>  
>  const VhostOps user_ops = {
> diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> index eb8bc0d90d..fd660393a0 100644
> --- a/include/hw/virtio/vhost-user.h
> +++ b/include/hw/virtio/vhost-user.h
> @@ -9,9 +9,17 @@
>  #define HW_VIRTIO_VHOST_USER_H
>  
>  #include "chardev/char-fe.h"
> +#include "hw/virtio/virtio.h"
> +
> +typedef struct VhostUserHostNotifier {
> +    MemoryRegion mr;
> +    void *addr;
> +    bool set;
> +} VhostUserHostNotifier;
>  
>  typedef struct VhostUserState {
>      CharBackend *chr;
> +    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
>  } VhostUserState;
>  
>  VhostUserState *vhost_user_init(void);
> -- 
> 2.11.0
Tiwei Bie April 19, 2018, 11:14 a.m. UTC | #2
On Wed, Apr 18, 2018 at 07:34:06PM +0300, Michael S. Tsirkin wrote:
> On Thu, Apr 12, 2018 at 11:12:32PM +0800, Tiwei Bie wrote:
> > This patch introduces VHOST_USER_PROTOCOL_F_HOST_NOTIFIER.
> > With this feature negotiated, vhost-user backend can register
> > memory region based host notifiers. And it will allow the guest
> > driver in the VM to notify the hardware accelerator at the
> > vhost-user backend directly.
> > 
> > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> 
> Overall I think we can merge this approach, but I have two main concerns
> about this:
> 
> 1. Testing. Most people do not have the virtio hardware
>    so how to make sure this does not bit rot?
> 
>    I have an idea: add an option like this to libvhost-user.
>    Naturally libvhost-user can not get notified about a write
>    to an mmapped area, but it can write a special value there
>    (e.g. all-ones?) and then poll it to detect VQ # writes.
> 
>    Then include a vhost user bridge test with an option like this.
> 
>    I'd like to see a patch doing this.

Sure, I'll do it. Thanks for the suggestion!

> 
> 2. Memory barriers. Right now after updating the avail idx,
>    virtio does smp_wmb() and then the MMIO write.
>    Normal hardware drivers do wmb() which is an sfence.
>    Can a PCI device read bypass index write and see a stale
>    index value?

It depends on arch's memory model. Cunming will provide
more details about this later.

>    To make virtio pci do wmb() we would need a new feature bit.
>    Alternatively I guess we could maybe look at subsystem vendor/device id.
> 
>    I'd like to see a patch doing one of these things.

We prefer to add a new feature bit as it's a more robust
way to do this. I'll send out some patches soon.

Thank you very much! :)

Best regards,
Tiwei Bie

> 
> Thanks!
> 
> > ---
> >  docs/interop/vhost-user.txt    |  33 +++++++++++
> >  hw/virtio/vhost-user.c         | 123 +++++++++++++++++++++++++++++++++++++++++
> >  include/hw/virtio/vhost-user.h |   8 +++
> >  3 files changed, 164 insertions(+)
> > 
> > diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> > index 534caab18a..9e57b36b20 100644
> > --- a/docs/interop/vhost-user.txt
> > +++ b/docs/interop/vhost-user.txt
> > @@ -132,6 +132,16 @@ Depending on the request type, payload can be:
> >     Payload: Size bytes array holding the contents of the virtio
> >         device's configuration space
> >  
> > + * Vring area description
> > +   -----------------------
> > +   | u64 | size | offset |
> > +   -----------------------
> > +
> > +   u64: a 64-bit integer contains vring index and flags
> > +   Size: a 64-bit size of this area
> > +   Offset: a 64-bit offset of this area from the start of the
> > +       supplied file descriptor
> > +
> >  In QEMU the vhost-user message is implemented with the following struct:
> >  
> >  typedef struct VhostUserMsg {
> > @@ -146,6 +156,7 @@ typedef struct VhostUserMsg {
> >          VhostUserLog log;
> >          struct vhost_iotlb_msg iotlb;
> >          VhostUserConfig config;
> > +        VhostUserVringArea area;
> >      };
> >  } QEMU_PACKED VhostUserMsg;
> >  
> > @@ -380,6 +391,7 @@ Protocol features
> >  #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
> >  #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
> >  #define VHOST_USER_PROTOCOL_F_CONFIG         9
> > +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  10
> >  
> >  Master message types
> >  --------------------
> > @@ -777,6 +789,27 @@ Slave message types
> >       the VHOST_USER_NEED_REPLY flag, master must respond with zero when
> >       operation is successfully completed, or non-zero otherwise.
> >  
> > + * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG
> > +
> > +      Id: 3
> > +      Equivalent ioctl: N/A
> > +      Slave payload: vring area description
> > +      Master payload: N/A
> > +
> > +      Sets host notifier for a specified queue. The queue index is contained
> > +      in the u64 field of the vring area description. The host notifier is
> > +      described by the file descriptor (typically it's a VFIO device fd) which
> > +      is passed as ancillary data and the size (which is mmap size and should
> > +      be the same as host page size) and offset (which is mmap offset) carried
> > +      in the vring area description. QEMU can mmap the file descriptor based
> > +      on the size and offset to get a memory range. Registering a host notifier
> > +      means mapping this memory range to the VM as the specified queue's notify
> > +      MMIO region. Slave sends this request to tell QEMU to de-register the
> > +      existing notifier if any and register the new notifier if the request is
> > +      sent with a file descriptor.
> > +      This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
> > +      protocol feature has been successfully negotiated.
> > +
> >  VHOST_USER_PROTOCOL_F_REPLY_ACK:
> >  -------------------------------
> >  The original vhost-user specification only demands replies for certain
> > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> > index 791e0a4763..1cd9c7276b 100644
> > --- a/hw/virtio/vhost-user.c
> > +++ b/hw/virtio/vhost-user.c
> > @@ -13,6 +13,7 @@
> >  #include "hw/virtio/vhost.h"
> >  #include "hw/virtio/vhost-user.h"
> >  #include "hw/virtio/vhost-backend.h"
> > +#include "hw/virtio/virtio.h"
> >  #include "hw/virtio/virtio-net.h"
> >  #include "chardev/char-fe.h"
> >  #include "sysemu/kvm.h"
> > @@ -48,6 +49,7 @@ enum VhostUserProtocolFeature {
> >      VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
> >      VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
> >      VHOST_USER_PROTOCOL_F_CONFIG = 9,
> > +    VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 10,
> >      VHOST_USER_PROTOCOL_F_MAX
> >  };
> >  
> > @@ -92,6 +94,7 @@ typedef enum VhostUserSlaveRequest {
> >      VHOST_USER_SLAVE_NONE = 0,
> >      VHOST_USER_SLAVE_IOTLB_MSG = 1,
> >      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
> > +    VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> >      VHOST_USER_SLAVE_MAX
> >  }  VhostUserSlaveRequest;
> >  
> > @@ -136,6 +139,12 @@ static VhostUserConfig c __attribute__ ((unused));
> >                                     + sizeof(c.size) \
> >                                     + sizeof(c.flags))
> >  
> > +typedef struct VhostUserVringArea {
> > +    uint64_t u64;
> > +    uint64_t size;
> > +    uint64_t offset;
> > +} VhostUserVringArea;
> > +
> >  typedef struct {
> >      VhostUserRequest request;
> >  
> > @@ -157,6 +166,7 @@ typedef union {
> >          struct vhost_iotlb_msg iotlb;
> >          VhostUserConfig config;
> >          VhostUserCryptoSession session;
> > +        VhostUserVringArea area;
> >  } VhostUserPayload;
> >  
> >  typedef struct VhostUserMsg {
> > @@ -638,9 +648,37 @@ static int vhost_user_set_vring_num(struct vhost_dev *dev,
> >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);
> >  }
> >  
> > +static void vhost_user_host_notifier_restore(struct vhost_dev *dev,
> > +                                             int queue_idx)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > +    VirtIODevice *vdev = dev->vdev;
> > +
> > +    if (n->addr && !n->set) {
> > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true);
> > +        n->set = true;
> > +    }
> > +}
> > +
> > +static void vhost_user_host_notifier_remove(struct vhost_dev *dev,
> > +                                            int queue_idx)
> > +{
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > +    VirtIODevice *vdev = dev->vdev;
> > +
> > +    if (n->addr && n->set) {
> > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> > +        n->set = false;
> > +    }
> > +}
> > +
> >  static int vhost_user_set_vring_base(struct vhost_dev *dev,
> >                                       struct vhost_vring_state *ring)
> >  {
> > +    vhost_user_host_notifier_restore(dev, ring->index);
> > +
> >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
> >  }
> >  
> > @@ -674,6 +712,8 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev,
> >          .hdr.size = sizeof(msg.payload.state),
> >      };
> >  
> > +    vhost_user_host_notifier_remove(dev, ring->index);
> > +
> >      if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
> >          return -1;
> >      }
> > @@ -847,6 +887,76 @@ static int vhost_user_slave_handle_config_change(struct vhost_dev *dev)
> >      return ret;
> >  }
> >  
> > +static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
> > +                                                       VhostUserVringArea *area,
> > +                                                       int fd)
> > +{
> > +    int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
> > +    size_t page_size = qemu_real_host_page_size;
> > +    struct vhost_user *u = dev->opaque;
> > +    VhostUserState *user = u->user;
> > +    VirtIODevice *vdev = dev->vdev;
> > +    VhostUserHostNotifier *n;
> > +    int ret = 0;
> > +    void *addr;
> > +    char *name;
> > +
> > +    if (!virtio_has_feature(dev->protocol_features,
> > +                            VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
> > +        vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
> > +        ret = -1;
> > +        goto out;
> > +    }
> > +
> > +    n = &user->notifier[queue_idx];
> > +
> > +    if (n->addr) {
> > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> > +        object_unparent(OBJECT(&n->mr));
> > +        munmap(n->addr, page_size);
> > +        n->addr = NULL;
> > +    }
> > +
> > +    if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
> > +        goto out;
> > +    }
> > +
> > +    /* Sanity check. */
> > +    if (area->size != page_size) {
> > +        ret = -1;
> > +        goto out;
> > +    }
> > +
> > +    addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > +                fd, area->offset);
> > +    if (addr == MAP_FAILED) {
> > +        ret = -1;
> > +        goto out;
> > +    }
> > +
> > +    name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
> > +                           user, queue_idx);
> > +    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
> > +                                      page_size, addr);
> > +    g_free(name);
> > +
> > +    if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
> > +        munmap(addr, page_size);
> > +        ret = -1;
> > +        goto out;
> > +    }
> > +
> > +    n->addr = addr;
> > +    n->set = true;
> > +
> > +out:
> > +    /* Always close the fd. */
> > +    if (fd != -1) {
> > +        close(fd);
> > +    }
> > +    return ret;
> > +}
> > +
> >  static void slave_read(void *opaque)
> >  {
> >      struct vhost_dev *dev = opaque;
> > @@ -913,6 +1023,10 @@ static void slave_read(void *opaque)
> >      case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
> >          ret = vhost_user_slave_handle_config_change(dev);
> >          break;
> > +    case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
> > +        ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
> > +                                                          fd);
> > +        break;
> >      default:
> >          error_report("Received unexpected msg type.");
> >          if (fd != -1) {
> > @@ -1641,6 +1755,15 @@ VhostUserState *vhost_user_init(void)
> >  
> >  void vhost_user_cleanup(VhostUserState *user)
> >  {
> > +    int i;
> > +
> > +    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
> > +        if (user->notifier[i].addr) {
> > +            object_unparent(OBJECT(&user->notifier[i].mr));
> > +            munmap(user->notifier[i].addr, qemu_real_host_page_size);
> > +            user->notifier[i].addr = NULL;
> > +        }
> > +    }
> >  }
> >  
> >  const VhostOps user_ops = {
> > diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> > index eb8bc0d90d..fd660393a0 100644
> > --- a/include/hw/virtio/vhost-user.h
> > +++ b/include/hw/virtio/vhost-user.h
> > @@ -9,9 +9,17 @@
> >  #define HW_VIRTIO_VHOST_USER_H
> >  
> >  #include "chardev/char-fe.h"
> > +#include "hw/virtio/virtio.h"
> > +
> > +typedef struct VhostUserHostNotifier {
> > +    MemoryRegion mr;
> > +    void *addr;
> > +    bool set;
> > +} VhostUserHostNotifier;
> >  
> >  typedef struct VhostUserState {
> >      CharBackend *chr;
> > +    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> >  } VhostUserState;
> >  
> >  VhostUserState *vhost_user_init(void);
> > -- 
> > 2.11.0
Liang, Cunming April 19, 2018, 12:43 p.m. UTC | #3
> -----Original Message-----

> From: Bie, Tiwei

> Sent: Thursday, April 19, 2018 7:15 PM

> To: Michael S. Tsirkin <mst@redhat.com>

> Cc: jasowang@redhat.com; alex.williamson@redhat.com; pbonzini@redhat.com;

> stefanha@redhat.com; qemu-devel@nongnu.org; virtio-dev@lists.oasis-

> open.org; Liang, Cunming <cunming.liang@intel.com>; Daly, Dan

> <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong

> <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>

> Subject: Re: [PATCH v3 6/6] vhost-user: support registering external host

> notifiers

> 

> On Wed, Apr 18, 2018 at 07:34:06PM +0300, Michael S. Tsirkin wrote:

> > On Thu, Apr 12, 2018 at 11:12:32PM +0800, Tiwei Bie wrote:

> > > This patch introduces VHOST_USER_PROTOCOL_F_HOST_NOTIFIER.

> > > With this feature negotiated, vhost-user backend can register memory

> > > region based host notifiers. And it will allow the guest driver in

> > > the VM to notify the hardware accelerator at the vhost-user backend

> > > directly.

> > >

> > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>

> >

> > Overall I think we can merge this approach, but I have two main

> > concerns about this:

> >

> > 1. Testing. Most people do not have the virtio hardware

> >    so how to make sure this does not bit rot?

> >

> >    I have an idea: add an option like this to libvhost-user.

> >    Naturally libvhost-user can not get notified about a write

> >    to an mmapped area, but it can write a special value there

> >    (e.g. all-ones?) and then poll it to detect VQ # writes.

> >

> >    Then include a vhost user bridge test with an option like this.

> >

> >    I'd like to see a patch doing this.

> 

> Sure, I'll do it. Thanks for the suggestion!

> 

> >

> > 2. Memory barriers. Right now after updating the avail idx,

> >    virtio does smp_wmb() and then the MMIO write.

> >    Normal hardware drivers do wmb() which is an sfence.

> >    Can a PCI device read bypass index write and see a stale

> >    index value?

A compiler barrier is enough on strongly-ordered memory platform. As it doesn't re-order store, PCI device won't see a stale index value. But a weakly-ordered memory needs sfence.

> 

> It depends on arch's memory model. Cunming will provide more details about

> this later.

> 

> >    To make virtio pci do wmb() we would need a new feature bit.

> >    Alternatively I guess we could maybe look at subsystem vendor/device id.

> >

> >    I'd like to see a patch doing one of these things.

> 

> We prefer to add a new feature bit as it's a more robust way to do this. I'll send

> out some patches soon.

> 

> Thank you very much! :)

> 

> Best regards,

> Tiwei Bie

> 

> >

> > Thanks!

> >

> > > ---

> > >  docs/interop/vhost-user.txt    |  33 +++++++++++

> > >  hw/virtio/vhost-user.c         | 123

> +++++++++++++++++++++++++++++++++++++++++

> > >  include/hw/virtio/vhost-user.h |   8 +++

> > >  3 files changed, 164 insertions(+)

> > >

> > > diff --git a/docs/interop/vhost-user.txt

> > > b/docs/interop/vhost-user.txt index 534caab18a..9e57b36b20 100644

> > > --- a/docs/interop/vhost-user.txt

> > > +++ b/docs/interop/vhost-user.txt

> > > @@ -132,6 +132,16 @@ Depending on the request type, payload can be:

> > >     Payload: Size bytes array holding the contents of the virtio

> > >         device's configuration space

> > >

> > > + * Vring area description

> > > +   -----------------------

> > > +   | u64 | size | offset |

> > > +   -----------------------

> > > +

> > > +   u64: a 64-bit integer contains vring index and flags

> > > +   Size: a 64-bit size of this area

> > > +   Offset: a 64-bit offset of this area from the start of the

> > > +       supplied file descriptor

> > > +

> > >  In QEMU the vhost-user message is implemented with the following struct:

> > >

> > >  typedef struct VhostUserMsg {

> > > @@ -146,6 +156,7 @@ typedef struct VhostUserMsg {

> > >          VhostUserLog log;

> > >          struct vhost_iotlb_msg iotlb;

> > >          VhostUserConfig config;

> > > +        VhostUserVringArea area;

> > >      };

> > >  } QEMU_PACKED VhostUserMsg;

> > >

> > > @@ -380,6 +391,7 @@ Protocol features  #define

> > > VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7

> > >  #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8

> > >  #define VHOST_USER_PROTOCOL_F_CONFIG         9

> > > +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  10

> > >

> > >  Master message types

> > >  --------------------

> > > @@ -777,6 +789,27 @@ Slave message types

> > >       the VHOST_USER_NEED_REPLY flag, master must respond with zero when

> > >       operation is successfully completed, or non-zero otherwise.

> > >

> > > + * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG

> > > +

> > > +      Id: 3

> > > +      Equivalent ioctl: N/A

> > > +      Slave payload: vring area description

> > > +      Master payload: N/A

> > > +

> > > +      Sets host notifier for a specified queue. The queue index is contained

> > > +      in the u64 field of the vring area description. The host notifier is

> > > +      described by the file descriptor (typically it's a VFIO device fd) which

> > > +      is passed as ancillary data and the size (which is mmap size and should

> > > +      be the same as host page size) and offset (which is mmap offset) carried

> > > +      in the vring area description. QEMU can mmap the file descriptor based

> > > +      on the size and offset to get a memory range. Registering a host notifier

> > > +      means mapping this memory range to the VM as the specified queue's

> notify

> > > +      MMIO region. Slave sends this request to tell QEMU to de-register the

> > > +      existing notifier if any and register the new notifier if the request is

> > > +      sent with a file descriptor.

> > > +      This request should be sent only when

> VHOST_USER_PROTOCOL_F_HOST_NOTIFIER

> > > +      protocol feature has been successfully negotiated.

> > > +

> > >  VHOST_USER_PROTOCOL_F_REPLY_ACK:

> > >  -------------------------------

> > >  The original vhost-user specification only demands replies for

> > > certain diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c

> > > index 791e0a4763..1cd9c7276b 100644

> > > --- a/hw/virtio/vhost-user.c

> > > +++ b/hw/virtio/vhost-user.c

> > > @@ -13,6 +13,7 @@

> > >  #include "hw/virtio/vhost.h"

> > >  #include "hw/virtio/vhost-user.h"

> > >  #include "hw/virtio/vhost-backend.h"

> > > +#include "hw/virtio/virtio.h"

> > >  #include "hw/virtio/virtio-net.h"

> > >  #include "chardev/char-fe.h"

> > >  #include "sysemu/kvm.h"

> > > @@ -48,6 +49,7 @@ enum VhostUserProtocolFeature {

> > >      VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,

> > >      VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,

> > >      VHOST_USER_PROTOCOL_F_CONFIG = 9,

> > > +    VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 10,

> > >      VHOST_USER_PROTOCOL_F_MAX

> > >  };

> > >

> > > @@ -92,6 +94,7 @@ typedef enum VhostUserSlaveRequest {

> > >      VHOST_USER_SLAVE_NONE = 0,

> > >      VHOST_USER_SLAVE_IOTLB_MSG = 1,

> > >      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,

> > > +    VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,

> > >      VHOST_USER_SLAVE_MAX

> > >  }  VhostUserSlaveRequest;

> > >

> > > @@ -136,6 +139,12 @@ static VhostUserConfig c __attribute__ ((unused));

> > >                                     + sizeof(c.size) \

> > >                                     + sizeof(c.flags))

> > >

> > > +typedef struct VhostUserVringArea {

> > > +    uint64_t u64;

> > > +    uint64_t size;

> > > +    uint64_t offset;

> > > +} VhostUserVringArea;

> > > +

> > >  typedef struct {

> > >      VhostUserRequest request;

> > >

> > > @@ -157,6 +166,7 @@ typedef union {

> > >          struct vhost_iotlb_msg iotlb;

> > >          VhostUserConfig config;

> > >          VhostUserCryptoSession session;

> > > +        VhostUserVringArea area;

> > >  } VhostUserPayload;

> > >

> > >  typedef struct VhostUserMsg {

> > > @@ -638,9 +648,37 @@ static int vhost_user_set_vring_num(struct

> vhost_dev *dev,

> > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);  }

> > >

> > > +static void vhost_user_host_notifier_restore(struct vhost_dev *dev,

> > > +                                             int queue_idx) {

> > > +    struct vhost_user *u = dev->opaque;

> > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];

> > > +    VirtIODevice *vdev = dev->vdev;

> > > +

> > > +    if (n->addr && !n->set) {

> > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true);

> > > +        n->set = true;

> > > +    }

> > > +}

> > > +

> > > +static void vhost_user_host_notifier_remove(struct vhost_dev *dev,

> > > +                                            int queue_idx) {

> > > +    struct vhost_user *u = dev->opaque;

> > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];

> > > +    VirtIODevice *vdev = dev->vdev;

> > > +

> > > +    if (n->addr && n->set) {

> > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);

> > > +        n->set = false;

> > > +    }

> > > +}

> > > +

> > >  static int vhost_user_set_vring_base(struct vhost_dev *dev,

> > >                                       struct vhost_vring_state

> > > *ring)  {

> > > +    vhost_user_host_notifier_restore(dev, ring->index);

> > > +

> > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);

> > > }

> > >

> > > @@ -674,6 +712,8 @@ static int vhost_user_get_vring_base(struct

> vhost_dev *dev,

> > >          .hdr.size = sizeof(msg.payload.state),

> > >      };

> > >

> > > +    vhost_user_host_notifier_remove(dev, ring->index);

> > > +

> > >      if (vhost_user_write(dev, &msg, NULL, 0) < 0) {

> > >          return -1;

> > >      }

> > > @@ -847,6 +887,76 @@ static int

> vhost_user_slave_handle_config_change(struct vhost_dev *dev)

> > >      return ret;

> > >  }

> > >

> > > +static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev

> *dev,

> > > +                                                       VhostUserVringArea *area,

> > > +                                                       int fd) {

> > > +    int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;

> > > +    size_t page_size = qemu_real_host_page_size;

> > > +    struct vhost_user *u = dev->opaque;

> > > +    VhostUserState *user = u->user;

> > > +    VirtIODevice *vdev = dev->vdev;

> > > +    VhostUserHostNotifier *n;

> > > +    int ret = 0;

> > > +    void *addr;

> > > +    char *name;

> > > +

> > > +    if (!virtio_has_feature(dev->protocol_features,

> > > +                            VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||

> > > +        vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {

> > > +        ret = -1;

> > > +        goto out;

> > > +    }

> > > +

> > > +    n = &user->notifier[queue_idx];

> > > +

> > > +    if (n->addr) {

> > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);

> > > +        object_unparent(OBJECT(&n->mr));

> > > +        munmap(n->addr, page_size);

> > > +        n->addr = NULL;

> > > +    }

> > > +

> > > +    if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {

> > > +        goto out;

> > > +    }

> > > +

> > > +    /* Sanity check. */

> > > +    if (area->size != page_size) {

> > > +        ret = -1;

> > > +        goto out;

> > > +    }

> > > +

> > > +    addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,

> > > +                fd, area->offset);

> > > +    if (addr == MAP_FAILED) {

> > > +        ret = -1;

> > > +        goto out;

> > > +    }

> > > +

> > > +    name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",

> > > +                           user, queue_idx);

> > > +    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,

> > > +                                      page_size, addr);

> > > +    g_free(name);

> > > +

> > > +    if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {

> > > +        munmap(addr, page_size);

> > > +        ret = -1;

> > > +        goto out;

> > > +    }

> > > +

> > > +    n->addr = addr;

> > > +    n->set = true;

> > > +

> > > +out:

> > > +    /* Always close the fd. */

> > > +    if (fd != -1) {

> > > +        close(fd);

> > > +    }

> > > +    return ret;

> > > +}

> > > +

> > >  static void slave_read(void *opaque)  {

> > >      struct vhost_dev *dev = opaque; @@ -913,6 +1023,10 @@ static

> > > void slave_read(void *opaque)

> > >      case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :

> > >          ret = vhost_user_slave_handle_config_change(dev);

> > >          break;

> > > +    case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:

> > > +        ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,

> > > +                                                          fd);

> > > +        break;

> > >      default:

> > >          error_report("Received unexpected msg type.");

> > >          if (fd != -1) {

> > > @@ -1641,6 +1755,15 @@ VhostUserState *vhost_user_init(void)

> > >

> > >  void vhost_user_cleanup(VhostUserState *user)  {

> > > +    int i;

> > > +

> > > +    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {

> > > +        if (user->notifier[i].addr) {

> > > +            object_unparent(OBJECT(&user->notifier[i].mr));

> > > +            munmap(user->notifier[i].addr, qemu_real_host_page_size);

> > > +            user->notifier[i].addr = NULL;

> > > +        }

> > > +    }

> > >  }

> > >

> > >  const VhostOps user_ops = {

> > > diff --git a/include/hw/virtio/vhost-user.h

> > > b/include/hw/virtio/vhost-user.h index eb8bc0d90d..fd660393a0 100644

> > > --- a/include/hw/virtio/vhost-user.h

> > > +++ b/include/hw/virtio/vhost-user.h

> > > @@ -9,9 +9,17 @@

> > >  #define HW_VIRTIO_VHOST_USER_H

> > >

> > >  #include "chardev/char-fe.h"

> > > +#include "hw/virtio/virtio.h"

> > > +

> > > +typedef struct VhostUserHostNotifier {

> > > +    MemoryRegion mr;

> > > +    void *addr;

> > > +    bool set;

> > > +} VhostUserHostNotifier;

> > >

> > >  typedef struct VhostUserState {

> > >      CharBackend *chr;

> > > +    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];

> > >  } VhostUserState;

> > >

> > >  VhostUserState *vhost_user_init(void);

> > > --

> > > 2.11.0
Paolo Bonzini April 19, 2018, 1:02 p.m. UTC | #4
On 19/04/2018 14:43, Liang, Cunming wrote:
>> 2. Memory barriers. Right now after updating the avail idx, 
>> virtio does smp_wmb() and then the MMIO write. Normal hardware
>> drivers do wmb() which is an sfence. Can a PCI device read bypass
>> index write and see a stale index value?
>
> A compiler barrier is enough on strongly-ordered memory platform. As
> it doesn't re-order store, PCI device won't see a stale index value.
> But a weakly-ordered memory needs sfence.

That is complicated then.  We need to define a feature bit and (in the
Linux driver) propagate it to vring_create_virtqueue's weak_barrier
argument.  However:

- if we make it 1 when weak barriers are needed, the device also needs
to nack feature negotiation (not allow setting the FEATURES_OK) if the
bit is not set by the driver.  However, that is not enough.  Live
migration assumes that it is okay to migrate a virtual machine from a
source that doesn't support a feature to a destination that supports it.
 In this case, it would assume that it is okay to migrate from software
virtio to hardware virtio.  This is wrong because the destination would
use weak barriers

- if we make it 1 when strong barriers are enough, software virtio
devices needs to be updated to expose the bit.  This works, including
live migration, but updated drivers will now go slower when run against
an old device that doesn't know the feature bit.

Maybe bump the PCI revision, so that only the new revision has the bit?

Thanks,

Paolo
Michael S. Tsirkin April 19, 2018, 3:19 p.m. UTC | #5
On Thu, Apr 19, 2018 at 03:02:40PM +0200, Paolo Bonzini wrote:
> On 19/04/2018 14:43, Liang, Cunming wrote:
> >> 2. Memory barriers. Right now after updating the avail idx, 
> >> virtio does smp_wmb() and then the MMIO write. Normal hardware
> >> drivers do wmb() which is an sfence. Can a PCI device read bypass
> >> index write and see a stale index value?
> >
> > A compiler barrier is enough on strongly-ordered memory platform. As
> > it doesn't re-order store, PCI device won't see a stale index value.
> > But a weakly-ordered memory needs sfence.
> 
> That is complicated then.  We need to define a feature bit and (in the
> Linux driver) propagate it to vring_create_virtqueue's weak_barrier
> argument.  However:
> 
> - if we make it 1 when weak barriers are needed, the device also needs
> to nack feature negotiation (not allow setting the FEATURES_OK) if the
> bit is not set by the driver.
>  However, that is not enough.  Live
> migration assumes that it is okay to migrate a virtual machine from a
> source that doesn't support a feature to a destination that supports it.
>  In this case, it would assume that it is okay to migrate from software
> virtio to hardware virtio.  This is wrong because the destination would
> use weak barriers

You can't migrate between systems with different sets of device features
right now.

> - if we make it 1 when strong barriers are enough, software virtio
> devices needs to be updated to expose the bit.  This works, including
> live migration, but updated drivers will now go slower when run against
> an old device that doesn't know the feature bit.
> 
> Maybe bump the PCI revision, so that only the new revision has the bit?
> 
> Thanks,
> 
> Paolo

As a first step, if you want to migrate to a HW offloaded solution
then you need to enable the feature. It does mean it will go
a bit slower when run with software, so it's only good if
most systems in your cluster do have the HW offload.
I think we can start by getting that working and
think about ways to improve down the road.


That's the usecase we designed FEATURES_OK for though, so I do
think/hope it's enough and we don't need to play with revisions.
Michael S. Tsirkin April 19, 2018, 3:42 p.m. UTC | #6
On Thu, Apr 19, 2018 at 12:43:42PM +0000, Liang, Cunming wrote:
> 
> 
> > -----Original Message-----
> > From: Bie, Tiwei
> > Sent: Thursday, April 19, 2018 7:15 PM
> > To: Michael S. Tsirkin <mst@redhat.com>
> > Cc: jasowang@redhat.com; alex.williamson@redhat.com; pbonzini@redhat.com;
> > stefanha@redhat.com; qemu-devel@nongnu.org; virtio-dev@lists.oasis-
> > open.org; Liang, Cunming <cunming.liang@intel.com>; Daly, Dan
> > <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>
> > Subject: Re: [PATCH v3 6/6] vhost-user: support registering external host
> > notifiers
> > 
> > On Wed, Apr 18, 2018 at 07:34:06PM +0300, Michael S. Tsirkin wrote:
> > > On Thu, Apr 12, 2018 at 11:12:32PM +0800, Tiwei Bie wrote:
> > > > This patch introduces VHOST_USER_PROTOCOL_F_HOST_NOTIFIER.
> > > > With this feature negotiated, vhost-user backend can register memory
> > > > region based host notifiers. And it will allow the guest driver in
> > > > the VM to notify the hardware accelerator at the vhost-user backend
> > > > directly.
> > > >
> > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > >
> > > Overall I think we can merge this approach, but I have two main
> > > concerns about this:
> > >
> > > 1. Testing. Most people do not have the virtio hardware
> > >    so how to make sure this does not bit rot?
> > >
> > >    I have an idea: add an option like this to libvhost-user.
> > >    Naturally libvhost-user can not get notified about a write
> > >    to an mmapped area, but it can write a special value there
> > >    (e.g. all-ones?) and then poll it to detect VQ # writes.
> > >
> > >    Then include a vhost user bridge test with an option like this.
> > >
> > >    I'd like to see a patch doing this.
> > 
> > Sure, I'll do it. Thanks for the suggestion!
> > 
> > >
> > > 2. Memory barriers. Right now after updating the avail idx,
> > >    virtio does smp_wmb() and then the MMIO write.
> > >    Normal hardware drivers do wmb() which is an sfence.
> > >    Can a PCI device read bypass index write and see a stale
> > >    index value?
> A compiler barrier is enough on strongly-ordered memory platform. As it doesn't re-order store, PCI device won't see a stale index value. But a weakly-ordered memory needs sfence.


Oh you are right.

So it's only needed for non-intel platforms or when packets are in WC
memory then. And I don't know whether dpdk ever puts packets in WC memory.

I guess we'll cross this bridge when we get to it.


> > 
> > It depends on arch's memory model. Cunming will provide more details about
> > this later.
> > 
> > >    To make virtio pci do wmb() we would need a new feature bit.
> > >    Alternatively I guess we could maybe look at subsystem vendor/device id.
> > >
> > >    I'd like to see a patch doing one of these things.
> > 
> > We prefer to add a new feature bit as it's a more robust way to do this. I'll send
> > out some patches soon.
> > 
> > Thank you very much! :)
> > 
> > Best regards,
> > Tiwei Bie
> > 
> > >
> > > Thanks!
> > >
> > > > ---
> > > >  docs/interop/vhost-user.txt    |  33 +++++++++++
> > > >  hw/virtio/vhost-user.c         | 123
> > +++++++++++++++++++++++++++++++++++++++++
> > > >  include/hw/virtio/vhost-user.h |   8 +++
> > > >  3 files changed, 164 insertions(+)
> > > >
> > > > diff --git a/docs/interop/vhost-user.txt
> > > > b/docs/interop/vhost-user.txt index 534caab18a..9e57b36b20 100644
> > > > --- a/docs/interop/vhost-user.txt
> > > > +++ b/docs/interop/vhost-user.txt
> > > > @@ -132,6 +132,16 @@ Depending on the request type, payload can be:
> > > >     Payload: Size bytes array holding the contents of the virtio
> > > >         device's configuration space
> > > >
> > > > + * Vring area description
> > > > +   -----------------------
> > > > +   | u64 | size | offset |
> > > > +   -----------------------
> > > > +
> > > > +   u64: a 64-bit integer contains vring index and flags
> > > > +   Size: a 64-bit size of this area
> > > > +   Offset: a 64-bit offset of this area from the start of the
> > > > +       supplied file descriptor
> > > > +
> > > >  In QEMU the vhost-user message is implemented with the following struct:
> > > >
> > > >  typedef struct VhostUserMsg {
> > > > @@ -146,6 +156,7 @@ typedef struct VhostUserMsg {
> > > >          VhostUserLog log;
> > > >          struct vhost_iotlb_msg iotlb;
> > > >          VhostUserConfig config;
> > > > +        VhostUserVringArea area;
> > > >      };
> > > >  } QEMU_PACKED VhostUserMsg;
> > > >
> > > > @@ -380,6 +391,7 @@ Protocol features  #define
> > > > VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
> > > >  #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
> > > >  #define VHOST_USER_PROTOCOL_F_CONFIG         9
> > > > +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  10
> > > >
> > > >  Master message types
> > > >  --------------------
> > > > @@ -777,6 +789,27 @@ Slave message types
> > > >       the VHOST_USER_NEED_REPLY flag, master must respond with zero when
> > > >       operation is successfully completed, or non-zero otherwise.
> > > >
> > > > + * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG
> > > > +
> > > > +      Id: 3
> > > > +      Equivalent ioctl: N/A
> > > > +      Slave payload: vring area description
> > > > +      Master payload: N/A
> > > > +
> > > > +      Sets host notifier for a specified queue. The queue index is contained
> > > > +      in the u64 field of the vring area description. The host notifier is
> > > > +      described by the file descriptor (typically it's a VFIO device fd) which
> > > > +      is passed as ancillary data and the size (which is mmap size and should
> > > > +      be the same as host page size) and offset (which is mmap offset) carried
> > > > +      in the vring area description. QEMU can mmap the file descriptor based
> > > > +      on the size and offset to get a memory range. Registering a host notifier
> > > > +      means mapping this memory range to the VM as the specified queue's
> > notify
> > > > +      MMIO region. Slave sends this request to tell QEMU to de-register the
> > > > +      existing notifier if any and register the new notifier if the request is
> > > > +      sent with a file descriptor.
> > > > +      This request should be sent only when
> > VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
> > > > +      protocol feature has been successfully negotiated.
> > > > +
> > > >  VHOST_USER_PROTOCOL_F_REPLY_ACK:
> > > >  -------------------------------
> > > >  The original vhost-user specification only demands replies for
> > > > certain diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> > > > index 791e0a4763..1cd9c7276b 100644
> > > > --- a/hw/virtio/vhost-user.c
> > > > +++ b/hw/virtio/vhost-user.c
> > > > @@ -13,6 +13,7 @@
> > > >  #include "hw/virtio/vhost.h"
> > > >  #include "hw/virtio/vhost-user.h"
> > > >  #include "hw/virtio/vhost-backend.h"
> > > > +#include "hw/virtio/virtio.h"
> > > >  #include "hw/virtio/virtio-net.h"
> > > >  #include "chardev/char-fe.h"
> > > >  #include "sysemu/kvm.h"
> > > > @@ -48,6 +49,7 @@ enum VhostUserProtocolFeature {
> > > >      VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
> > > >      VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
> > > >      VHOST_USER_PROTOCOL_F_CONFIG = 9,
> > > > +    VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 10,
> > > >      VHOST_USER_PROTOCOL_F_MAX
> > > >  };
> > > >
> > > > @@ -92,6 +94,7 @@ typedef enum VhostUserSlaveRequest {
> > > >      VHOST_USER_SLAVE_NONE = 0,
> > > >      VHOST_USER_SLAVE_IOTLB_MSG = 1,
> > > >      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
> > > > +    VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> > > >      VHOST_USER_SLAVE_MAX
> > > >  }  VhostUserSlaveRequest;
> > > >
> > > > @@ -136,6 +139,12 @@ static VhostUserConfig c __attribute__ ((unused));
> > > >                                     + sizeof(c.size) \
> > > >                                     + sizeof(c.flags))
> > > >
> > > > +typedef struct VhostUserVringArea {
> > > > +    uint64_t u64;
> > > > +    uint64_t size;
> > > > +    uint64_t offset;
> > > > +} VhostUserVringArea;
> > > > +
> > > >  typedef struct {
> > > >      VhostUserRequest request;
> > > >
> > > > @@ -157,6 +166,7 @@ typedef union {
> > > >          struct vhost_iotlb_msg iotlb;
> > > >          VhostUserConfig config;
> > > >          VhostUserCryptoSession session;
> > > > +        VhostUserVringArea area;
> > > >  } VhostUserPayload;
> > > >
> > > >  typedef struct VhostUserMsg {
> > > > @@ -638,9 +648,37 @@ static int vhost_user_set_vring_num(struct
> > vhost_dev *dev,
> > > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);  }
> > > >
> > > > +static void vhost_user_host_notifier_restore(struct vhost_dev *dev,
> > > > +                                             int queue_idx) {
> > > > +    struct vhost_user *u = dev->opaque;
> > > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > > > +    VirtIODevice *vdev = dev->vdev;
> > > > +
> > > > +    if (n->addr && !n->set) {
> > > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true);
> > > > +        n->set = true;
> > > > +    }
> > > > +}
> > > > +
> > > > +static void vhost_user_host_notifier_remove(struct vhost_dev *dev,
> > > > +                                            int queue_idx) {
> > > > +    struct vhost_user *u = dev->opaque;
> > > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > > > +    VirtIODevice *vdev = dev->vdev;
> > > > +
> > > > +    if (n->addr && n->set) {
> > > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> > > > +        n->set = false;
> > > > +    }
> > > > +}
> > > > +
> > > >  static int vhost_user_set_vring_base(struct vhost_dev *dev,
> > > >                                       struct vhost_vring_state
> > > > *ring)  {
> > > > +    vhost_user_host_notifier_restore(dev, ring->index);
> > > > +
> > > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
> > > > }
> > > >
> > > > @@ -674,6 +712,8 @@ static int vhost_user_get_vring_base(struct
> > vhost_dev *dev,
> > > >          .hdr.size = sizeof(msg.payload.state),
> > > >      };
> > > >
> > > > +    vhost_user_host_notifier_remove(dev, ring->index);
> > > > +
> > > >      if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
> > > >          return -1;
> > > >      }
> > > > @@ -847,6 +887,76 @@ static int
> > vhost_user_slave_handle_config_change(struct vhost_dev *dev)
> > > >      return ret;
> > > >  }
> > > >
> > > > +static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev
> > *dev,
> > > > +                                                       VhostUserVringArea *area,
> > > > +                                                       int fd) {
> > > > +    int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
> > > > +    size_t page_size = qemu_real_host_page_size;
> > > > +    struct vhost_user *u = dev->opaque;
> > > > +    VhostUserState *user = u->user;
> > > > +    VirtIODevice *vdev = dev->vdev;
> > > > +    VhostUserHostNotifier *n;
> > > > +    int ret = 0;
> > > > +    void *addr;
> > > > +    char *name;
> > > > +
> > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > +                            VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
> > > > +        vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
> > > > +        ret = -1;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    n = &user->notifier[queue_idx];
> > > > +
> > > > +    if (n->addr) {
> > > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> > > > +        object_unparent(OBJECT(&n->mr));
> > > > +        munmap(n->addr, page_size);
> > > > +        n->addr = NULL;
> > > > +    }
> > > > +
> > > > +    if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    /* Sanity check. */
> > > > +    if (area->size != page_size) {
> > > > +        ret = -1;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > > > +                fd, area->offset);
> > > > +    if (addr == MAP_FAILED) {
> > > > +        ret = -1;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
> > > > +                           user, queue_idx);
> > > > +    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
> > > > +                                      page_size, addr);
> > > > +    g_free(name);
> > > > +
> > > > +    if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
> > > > +        munmap(addr, page_size);
> > > > +        ret = -1;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    n->addr = addr;
> > > > +    n->set = true;
> > > > +
> > > > +out:
> > > > +    /* Always close the fd. */
> > > > +    if (fd != -1) {
> > > > +        close(fd);
> > > > +    }
> > > > +    return ret;
> > > > +}
> > > > +
> > > >  static void slave_read(void *opaque)  {
> > > >      struct vhost_dev *dev = opaque; @@ -913,6 +1023,10 @@ static
> > > > void slave_read(void *opaque)
> > > >      case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
> > > >          ret = vhost_user_slave_handle_config_change(dev);
> > > >          break;
> > > > +    case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
> > > > +        ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
> > > > +                                                          fd);
> > > > +        break;
> > > >      default:
> > > >          error_report("Received unexpected msg type.");
> > > >          if (fd != -1) {
> > > > @@ -1641,6 +1755,15 @@ VhostUserState *vhost_user_init(void)
> > > >
> > > >  void vhost_user_cleanup(VhostUserState *user)  {
> > > > +    int i;
> > > > +
> > > > +    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
> > > > +        if (user->notifier[i].addr) {
> > > > +            object_unparent(OBJECT(&user->notifier[i].mr));
> > > > +            munmap(user->notifier[i].addr, qemu_real_host_page_size);
> > > > +            user->notifier[i].addr = NULL;
> > > > +        }
> > > > +    }
> > > >  }
> > > >
> > > >  const VhostOps user_ops = {
> > > > diff --git a/include/hw/virtio/vhost-user.h
> > > > b/include/hw/virtio/vhost-user.h index eb8bc0d90d..fd660393a0 100644
> > > > --- a/include/hw/virtio/vhost-user.h
> > > > +++ b/include/hw/virtio/vhost-user.h
> > > > @@ -9,9 +9,17 @@
> > > >  #define HW_VIRTIO_VHOST_USER_H
> > > >
> > > >  #include "chardev/char-fe.h"
> > > > +#include "hw/virtio/virtio.h"
> > > > +
> > > > +typedef struct VhostUserHostNotifier {
> > > > +    MemoryRegion mr;
> > > > +    void *addr;
> > > > +    bool set;
> > > > +} VhostUserHostNotifier;
> > > >
> > > >  typedef struct VhostUserState {
> > > >      CharBackend *chr;
> > > > +    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> > > >  } VhostUserState;
> > > >
> > > >  VhostUserState *vhost_user_init(void);
> > > > --
> > > > 2.11.0
Paolo Bonzini April 19, 2018, 3:51 p.m. UTC | #7
On 19/04/2018 17:19, Michael S. Tsirkin wrote:
>> - if we make it 1 when weak barriers are needed, the device also needs
>> to nack feature negotiation (not allow setting the FEATURES_OK) if the
>> bit is not set by the driver.
>>  However, that is not enough.  Live
>> migration assumes that it is okay to migrate a virtual machine from a
>> source that doesn't support a feature to a destination that supports it.
>>  In this case, it would assume that it is okay to migrate from software
>> virtio to hardware virtio.  This is wrong because the destination would
>> use weak barriers
> 
> You can't migrate between systems with different sets of device features
> right now.

Yes, you can, exactly because some features are defined not by the
machine type but rather by the host kernel.  See virtio_net_get_features
in QEMU's hw/virtio/virtio-net.c, and virtio_set_features_nocheck in
QEMU's hw/virtio/virtio.c.

Thanks,

Paolo
Paolo Bonzini April 19, 2018, 3:52 p.m. UTC | #8
On 19/04/2018 17:42, Michael S. Tsirkin wrote:
>> A compiler barrier is enough on strongly-ordered memory platform.
>> As it doesn't re-order store, PCI device won't see a stale index
>> value. But a weakly-ordered memory needs sfence.
> 
> Oh you are right.
> 
> So it's only needed for non-intel platforms or when packets are in
> WC memory then. And I don't know whether dpdk ever puts packets in WC
> memory.
> 
> I guess we'll cross this bridge when we get to it.

Non-TSO architectures seem important...

Paolo
Michael S. Tsirkin April 19, 2018, 3:59 p.m. UTC | #9
On Thu, Apr 19, 2018 at 05:51:51PM +0200, Paolo Bonzini wrote:
> On 19/04/2018 17:19, Michael S. Tsirkin wrote:
> >> - if we make it 1 when weak barriers are needed, the device also needs
> >> to nack feature negotiation (not allow setting the FEATURES_OK) if the
> >> bit is not set by the driver.
> >>  However, that is not enough.  Live
> >> migration assumes that it is okay to migrate a virtual machine from a
> >> source that doesn't support a feature to a destination that supports it.
> >>  In this case, it would assume that it is okay to migrate from software
> >> virtio to hardware virtio.  This is wrong because the destination would
> >> use weak barriers
> > 
> > You can't migrate between systems with different sets of device features
> > right now.
> 
> Yes, you can, exactly because some features are defined not by the
> machine type but rather by the host kernel.  See virtio_net_get_features
> in QEMU's hw/virtio/virtio-net.c, and virtio_set_features_nocheck in
> QEMU's hw/virtio/virtio.c.
> 
> Thanks,
> 
> Paolo

Oh you are right. Well we can just special-case that one :)
Paolo Bonzini April 19, 2018, 4:07 p.m. UTC | #10
On 19/04/2018 17:59, Michael S. Tsirkin wrote:
> On Thu, Apr 19, 2018 at 05:51:51PM +0200, Paolo Bonzini wrote:
>> On 19/04/2018 17:19, Michael S. Tsirkin wrote:
>>>> - if we make it 1 when weak barriers are needed, the device also needs
>>>> to nack feature negotiation (not allow setting the FEATURES_OK) if the
>>>> bit is not set by the driver.
>>>>  However, that is not enough.  Live
>>>> migration assumes that it is okay to migrate a virtual machine from a
>>>> source that doesn't support a feature to a destination that supports it.
>>>>  In this case, it would assume that it is okay to migrate from software
>>>> virtio to hardware virtio.  This is wrong because the destination would
>>>> use weak barriers
>>>
>>> You can't migrate between systems with different sets of device features
>>> right now.
>>
>> Yes, you can, exactly because some features are defined not by the
>> machine type but rather by the host kernel.  See virtio_net_get_features
>> in QEMU's hw/virtio/virtio-net.c, and virtio_set_features_nocheck in
>> QEMU's hw/virtio/virtio.c.
> 
> Oh you are right. Well we can just special-case that one :)

Anything wrong with bumping the PCI revision?

Paolo
Liang, Cunming April 19, 2018, 4:24 p.m. UTC | #11
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:mst@redhat.com]
> Sent: Thursday, April 19, 2018 11:19 PM
> To: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Liang, Cunming <cunming.liang@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> jasowang@redhat.com; alex.williamson@redhat.com; stefanha@redhat.com;
> qemu-devel@nongnu.org; virtio-dev@lists.oasis-open.org; Daly, Dan
> <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>
> Subject: Re: [virtio-dev] RE: [PATCH v3 6/6] vhost-user: support registering
> external host notifiers
> 
> On Thu, Apr 19, 2018 at 03:02:40PM +0200, Paolo Bonzini wrote:
> > On 19/04/2018 14:43, Liang, Cunming wrote:
> > >> 2. Memory barriers. Right now after updating the avail idx, virtio
> > >> does smp_wmb() and then the MMIO write. Normal hardware drivers do
> > >> wmb() which is an sfence. Can a PCI device read bypass index write
> > >> and see a stale index value?
> > >
> > > A compiler barrier is enough on strongly-ordered memory platform. As
> > > it doesn't re-order store, PCI device won't see a stale index value.
> > > But a weakly-ordered memory needs sfence.
> >
> > That is complicated then.  We need to define a feature bit and (in the
> > Linux driver) propagate it to vring_create_virtqueue's weak_barrier
> > argument.  However:
> >
> > - if we make it 1 when weak barriers are needed, the device also needs
> > to nack feature negotiation (not allow setting the FEATURES_OK) if the
> > bit is not set by the driver.
> >  However, that is not enough.  Live
> > migration assumes that it is okay to migrate a virtual machine from a
> > source that doesn't support a feature to a destination that supports it.
> >  In this case, it would assume that it is okay to migrate from
> > software virtio to hardware virtio.  This is wrong because the
> > destination would use weak barriers
> 
> You can't migrate between systems with different sets of device features right
> now.
> 
> > - if we make it 1 when strong barriers are enough, software virtio
> > devices needs to be updated to expose the bit.  This works, including
> > live migration, but updated drivers will now go slower when run
> > against an old device that doesn't know the feature bit.
> >
> > Maybe bump the PCI revision, so that only the new revision has the bit?
> >
> > Thanks,
> >
> > Paolo
> 
> As a first step, if you want to migrate to a HW offloaded solution then you need
> to enable the feature.

> It does mean it will go a bit slower when run with software,
> so it's only good if most systems in your cluster do have the HW offload.
To clarify a bit more, it's suboptimal to always use mandatory barriers for MMIO. Per strongly-order memory, 'weak barriers' (smp_wmb) is pretty good for MMIO. The tradeoff doesn't always happen, software and HW offload can align on the same page.

> I think we can start by getting that working and think about ways to improve
> down the road.
> 
> 
> That's the usecase we designed FEATURES_OK for though, so I do think/hope it's
> enough and we don't need to play with revisions.
> 
> 
> --
> MST
Liang, Cunming April 19, 2018, 4:27 p.m. UTC | #12
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:mst@redhat.com]
> Sent: Thursday, April 19, 2018 11:43 PM
> To: Liang, Cunming <cunming.liang@intel.com>
> Cc: Bie, Tiwei <tiwei.bie@intel.com>; jasowang@redhat.com;
> alex.williamson@redhat.com; pbonzini@redhat.com; stefanha@redhat.com;
> qemu-devel@nongnu.org; virtio-dev@lists.oasis-open.org; Daly, Dan
> <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>
> Subject: Re: [PATCH v3 6/6] vhost-user: support registering external host
> notifiers
> 
> On Thu, Apr 19, 2018 at 12:43:42PM +0000, Liang, Cunming wrote:
> >
> >
> > > -----Original Message-----
> > > From: Bie, Tiwei
> > > Sent: Thursday, April 19, 2018 7:15 PM
> > > To: Michael S. Tsirkin <mst@redhat.com>
> > > Cc: jasowang@redhat.com; alex.williamson@redhat.com;
> > > pbonzini@redhat.com; stefanha@redhat.com; qemu-devel@nongnu.org;
> > > virtio-dev@lists.oasis- open.org; Liang, Cunming
> > > <cunming.liang@intel.com>; Daly, Dan <dan.daly@intel.com>; Tan,
> > > Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong
> > > <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>
> > > Subject: Re: [PATCH v3 6/6] vhost-user: support registering external
> > > host notifiers
> > >
> > > On Wed, Apr 18, 2018 at 07:34:06PM +0300, Michael S. Tsirkin wrote:
> > > > On Thu, Apr 12, 2018 at 11:12:32PM +0800, Tiwei Bie wrote:
> > > > > This patch introduces VHOST_USER_PROTOCOL_F_HOST_NOTIFIER.
> > > > > With this feature negotiated, vhost-user backend can register
> > > > > memory region based host notifiers. And it will allow the guest
> > > > > driver in the VM to notify the hardware accelerator at the
> > > > > vhost-user backend directly.
> > > > >
> > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > >
> > > > Overall I think we can merge this approach, but I have two main
> > > > concerns about this:
> > > >
> > > > 1. Testing. Most people do not have the virtio hardware
> > > >    so how to make sure this does not bit rot?
> > > >
> > > >    I have an idea: add an option like this to libvhost-user.
> > > >    Naturally libvhost-user can not get notified about a write
> > > >    to an mmapped area, but it can write a special value there
> > > >    (e.g. all-ones?) and then poll it to detect VQ # writes.
> > > >
> > > >    Then include a vhost user bridge test with an option like this.
> > > >
> > > >    I'd like to see a patch doing this.
> > >
> > > Sure, I'll do it. Thanks for the suggestion!
> > >
> > > >
> > > > 2. Memory barriers. Right now after updating the avail idx,
> > > >    virtio does smp_wmb() and then the MMIO write.
> > > >    Normal hardware drivers do wmb() which is an sfence.
> > > >    Can a PCI device read bypass index write and see a stale
> > > >    index value?
> > A compiler barrier is enough on strongly-ordered memory platform. As it
> doesn't re-order store, PCI device won't see a stale index value. But a weakly-
> ordered memory needs sfence.
> 
> 
> Oh you are right.
> 
> So it's only needed for non-intel platforms or when packets are in WC memory
> then. And I don't know whether dpdk ever puts packets in WC memory.

No, we haven't use WC memory.

> 
> I guess we'll cross this bridge when we get to it.
> 
> 
> > >
> > > It depends on arch's memory model. Cunming will provide more details
> > > about this later.
> > >
> > > >    To make virtio pci do wmb() we would need a new feature bit.
> > > >    Alternatively I guess we could maybe look at subsystem vendor/device id.
> > > >
> > > >    I'd like to see a patch doing one of these things.
> > >
> > > We prefer to add a new feature bit as it's a more robust way to do
> > > this. I'll send out some patches soon.
> > >
> > > Thank you very much! :)
> > >
> > > Best regards,
> > > Tiwei Bie
> > >
> > > >
> > > > Thanks!
> > > >
> > > > > ---
> > > > >  docs/interop/vhost-user.txt    |  33 +++++++++++
> > > > >  hw/virtio/vhost-user.c         | 123
> > > +++++++++++++++++++++++++++++++++++++++++
> > > > >  include/hw/virtio/vhost-user.h |   8 +++
> > > > >  3 files changed, 164 insertions(+)
> > > > >
> > > > > diff --git a/docs/interop/vhost-user.txt
> > > > > b/docs/interop/vhost-user.txt index 534caab18a..9e57b36b20
> > > > > 100644
> > > > > --- a/docs/interop/vhost-user.txt
> > > > > +++ b/docs/interop/vhost-user.txt
> > > > > @@ -132,6 +132,16 @@ Depending on the request type, payload can be:
> > > > >     Payload: Size bytes array holding the contents of the virtio
> > > > >         device's configuration space
> > > > >
> > > > > + * Vring area description
> > > > > +   -----------------------
> > > > > +   | u64 | size | offset |
> > > > > +   -----------------------
> > > > > +
> > > > > +   u64: a 64-bit integer contains vring index and flags
> > > > > +   Size: a 64-bit size of this area
> > > > > +   Offset: a 64-bit offset of this area from the start of the
> > > > > +       supplied file descriptor
> > > > > +
> > > > >  In QEMU the vhost-user message is implemented with the following
> struct:
> > > > >
> > > > >  typedef struct VhostUserMsg {
> > > > > @@ -146,6 +156,7 @@ typedef struct VhostUserMsg {
> > > > >          VhostUserLog log;
> > > > >          struct vhost_iotlb_msg iotlb;
> > > > >          VhostUserConfig config;
> > > > > +        VhostUserVringArea area;
> > > > >      };
> > > > >  } QEMU_PACKED VhostUserMsg;
> > > > >
> > > > > @@ -380,6 +391,7 @@ Protocol features  #define
> > > > > VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
> > > > >  #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
> > > > >  #define VHOST_USER_PROTOCOL_F_CONFIG         9
> > > > > +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  10
> > > > >
> > > > >  Master message types
> > > > >  --------------------
> > > > > @@ -777,6 +789,27 @@ Slave message types
> > > > >       the VHOST_USER_NEED_REPLY flag, master must respond with zero
> when
> > > > >       operation is successfully completed, or non-zero otherwise.
> > > > >
> > > > > + * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG
> > > > > +
> > > > > +      Id: 3
> > > > > +      Equivalent ioctl: N/A
> > > > > +      Slave payload: vring area description
> > > > > +      Master payload: N/A
> > > > > +
> > > > > +      Sets host notifier for a specified queue. The queue index is contained
> > > > > +      in the u64 field of the vring area description. The host notifier is
> > > > > +      described by the file descriptor (typically it's a VFIO device fd) which
> > > > > +      is passed as ancillary data and the size (which is mmap size and should
> > > > > +      be the same as host page size) and offset (which is mmap offset)
> carried
> > > > > +      in the vring area description. QEMU can mmap the file descriptor
> based
> > > > > +      on the size and offset to get a memory range. Registering a host
> notifier
> > > > > +      means mapping this memory range to the VM as the
> > > > > + specified queue's
> > > notify
> > > > > +      MMIO region. Slave sends this request to tell QEMU to de-register
> the
> > > > > +      existing notifier if any and register the new notifier if the request is
> > > > > +      sent with a file descriptor.
> > > > > +      This request should be sent only when
> > > VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
> > > > > +      protocol feature has been successfully negotiated.
> > > > > +
> > > > >  VHOST_USER_PROTOCOL_F_REPLY_ACK:
> > > > >  -------------------------------  The original vhost-user
> > > > > specification only demands replies for certain diff --git
> > > > > a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index
> > > > > 791e0a4763..1cd9c7276b 100644
> > > > > --- a/hw/virtio/vhost-user.c
> > > > > +++ b/hw/virtio/vhost-user.c
> > > > > @@ -13,6 +13,7 @@
> > > > >  #include "hw/virtio/vhost.h"
> > > > >  #include "hw/virtio/vhost-user.h"
> > > > >  #include "hw/virtio/vhost-backend.h"
> > > > > +#include "hw/virtio/virtio.h"
> > > > >  #include "hw/virtio/virtio-net.h"
> > > > >  #include "chardev/char-fe.h"
> > > > >  #include "sysemu/kvm.h"
> > > > > @@ -48,6 +49,7 @@ enum VhostUserProtocolFeature {
> > > > >      VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
> > > > >      VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
> > > > >      VHOST_USER_PROTOCOL_F_CONFIG = 9,
> > > > > +    VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 10,
> > > > >      VHOST_USER_PROTOCOL_F_MAX
> > > > >  };
> > > > >
> > > > > @@ -92,6 +94,7 @@ typedef enum VhostUserSlaveRequest {
> > > > >      VHOST_USER_SLAVE_NONE = 0,
> > > > >      VHOST_USER_SLAVE_IOTLB_MSG = 1,
> > > > >      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
> > > > > +    VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> > > > >      VHOST_USER_SLAVE_MAX
> > > > >  }  VhostUserSlaveRequest;
> > > > >
> > > > > @@ -136,6 +139,12 @@ static VhostUserConfig c __attribute__
> ((unused));
> > > > >                                     + sizeof(c.size) \
> > > > >                                     + sizeof(c.flags))
> > > > >
> > > > > +typedef struct VhostUserVringArea {
> > > > > +    uint64_t u64;
> > > > > +    uint64_t size;
> > > > > +    uint64_t offset;
> > > > > +} VhostUserVringArea;
> > > > > +
> > > > >  typedef struct {
> > > > >      VhostUserRequest request;
> > > > >
> > > > > @@ -157,6 +166,7 @@ typedef union {
> > > > >          struct vhost_iotlb_msg iotlb;
> > > > >          VhostUserConfig config;
> > > > >          VhostUserCryptoSession session;
> > > > > +        VhostUserVringArea area;
> > > > >  } VhostUserPayload;
> > > > >
> > > > >  typedef struct VhostUserMsg {
> > > > > @@ -638,9 +648,37 @@ static int vhost_user_set_vring_num(struct
> > > vhost_dev *dev,
> > > > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM,
> > > > > ring);  }
> > > > >
> > > > > +static void vhost_user_host_notifier_restore(struct vhost_dev *dev,
> > > > > +                                             int queue_idx) {
> > > > > +    struct vhost_user *u = dev->opaque;
> > > > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > +
> > > > > +    if (n->addr && !n->set) {
> > > > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true);
> > > > > +        n->set = true;
> > > > > +    }
> > > > > +}
> > > > > +
> > > > > +static void vhost_user_host_notifier_remove(struct vhost_dev *dev,
> > > > > +                                            int queue_idx) {
> > > > > +    struct vhost_user *u = dev->opaque;
> > > > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > +
> > > > > +    if (n->addr && n->set) {
> > > > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr,
> false);
> > > > > +        n->set = false;
> > > > > +    }
> > > > > +}
> > > > > +
> > > > >  static int vhost_user_set_vring_base(struct vhost_dev *dev,
> > > > >                                       struct vhost_vring_state
> > > > > *ring)  {
> > > > > +    vhost_user_host_notifier_restore(dev, ring->index);
> > > > > +
> > > > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE,
> > > > > ring); }
> > > > >
> > > > > @@ -674,6 +712,8 @@ static int vhost_user_get_vring_base(struct
> > > vhost_dev *dev,
> > > > >          .hdr.size = sizeof(msg.payload.state),
> > > > >      };
> > > > >
> > > > > +    vhost_user_host_notifier_remove(dev, ring->index);
> > > > > +
> > > > >      if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
> > > > >          return -1;
> > > > >      }
> > > > > @@ -847,6 +887,76 @@ static int
> > > vhost_user_slave_handle_config_change(struct vhost_dev *dev)
> > > > >      return ret;
> > > > >  }
> > > > >
> > > > > +static int vhost_user_slave_handle_vring_host_notifier(struct
> > > > > +vhost_dev
> > > *dev,
> > > > > +                                                       VhostUserVringArea *area,
> > > > > +                                                       int fd) {
> > > > > +    int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
> > > > > +    size_t page_size = qemu_real_host_page_size;
> > > > > +    struct vhost_user *u = dev->opaque;
> > > > > +    VhostUserState *user = u->user;
> > > > > +    VirtIODevice *vdev = dev->vdev;
> > > > > +    VhostUserHostNotifier *n;
> > > > > +    int ret = 0;
> > > > > +    void *addr;
> > > > > +    char *name;
> > > > > +
> > > > > +    if (!virtio_has_feature(dev->protocol_features,
> > > > > +                            VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
> > > > > +        vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
> > > > > +        ret = -1;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    n = &user->notifier[queue_idx];
> > > > > +
> > > > > +    if (n->addr) {
> > > > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr,
> false);
> > > > > +        object_unparent(OBJECT(&n->mr));
> > > > > +        munmap(n->addr, page_size);
> > > > > +        n->addr = NULL;
> > > > > +    }
> > > > > +
> > > > > +    if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    /* Sanity check. */
> > > > > +    if (area->size != page_size) {
> > > > > +        ret = -1;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
> MAP_SHARED,
> > > > > +                fd, area->offset);
> > > > > +    if (addr == MAP_FAILED) {
> > > > > +        ret = -1;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
> > > > > +                           user, queue_idx);
> > > > > +    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
> > > > > +                                      page_size, addr);
> > > > > +    g_free(name);
> > > > > +
> > > > > +    if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr,
> true)) {
> > > > > +        munmap(addr, page_size);
> > > > > +        ret = -1;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    n->addr = addr;
> > > > > +    n->set = true;
> > > > > +
> > > > > +out:
> > > > > +    /* Always close the fd. */
> > > > > +    if (fd != -1) {
> > > > > +        close(fd);
> > > > > +    }
> > > > > +    return ret;
> > > > > +}
> > > > > +
> > > > >  static void slave_read(void *opaque)  {
> > > > >      struct vhost_dev *dev = opaque; @@ -913,6 +1023,10 @@
> > > > > static void slave_read(void *opaque)
> > > > >      case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
> > > > >          ret = vhost_user_slave_handle_config_change(dev);
> > > > >          break;
> > > > > +    case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
> > > > > +        ret = vhost_user_slave_handle_vring_host_notifier(dev,
> &payload.area,
> > > > > +                                                          fd);
> > > > > +        break;
> > > > >      default:
> > > > >          error_report("Received unexpected msg type.");
> > > > >          if (fd != -1) {
> > > > > @@ -1641,6 +1755,15 @@ VhostUserState *vhost_user_init(void)
> > > > >
> > > > >  void vhost_user_cleanup(VhostUserState *user)  {
> > > > > +    int i;
> > > > > +
> > > > > +    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
> > > > > +        if (user->notifier[i].addr) {
> > > > > +            object_unparent(OBJECT(&user->notifier[i].mr));
> > > > > +            munmap(user->notifier[i].addr, qemu_real_host_page_size);
> > > > > +            user->notifier[i].addr = NULL;
> > > > > +        }
> > > > > +    }
> > > > >  }
> > > > >
> > > > >  const VhostOps user_ops = {
> > > > > diff --git a/include/hw/virtio/vhost-user.h
> > > > > b/include/hw/virtio/vhost-user.h index eb8bc0d90d..fd660393a0
> > > > > 100644
> > > > > --- a/include/hw/virtio/vhost-user.h
> > > > > +++ b/include/hw/virtio/vhost-user.h
> > > > > @@ -9,9 +9,17 @@
> > > > >  #define HW_VIRTIO_VHOST_USER_H
> > > > >
> > > > >  #include "chardev/char-fe.h"
> > > > > +#include "hw/virtio/virtio.h"
> > > > > +
> > > > > +typedef struct VhostUserHostNotifier {
> > > > > +    MemoryRegion mr;
> > > > > +    void *addr;
> > > > > +    bool set;
> > > > > +} VhostUserHostNotifier;
> > > > >
> > > > >  typedef struct VhostUserState {
> > > > >      CharBackend *chr;
> > > > > +    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> > > > >  } VhostUserState;
> > > > >
> > > > >  VhostUserState *vhost_user_init(void);
> > > > > --
> > > > > 2.11.0
Michael S. Tsirkin April 19, 2018, 4:34 p.m. UTC | #13
On Thu, Apr 19, 2018 at 05:52:23PM +0200, Paolo Bonzini wrote:
> On 19/04/2018 17:42, Michael S. Tsirkin wrote:
> >> A compiler barrier is enough on strongly-ordered memory platform.
> >> As it doesn't re-order store, PCI device won't see a stale index
> >> value. But a weakly-ordered memory needs sfence.
> > 
> > Oh you are right.
> > 
> > So it's only needed for non-intel platforms or when packets are in
> > WC memory then. And I don't know whether dpdk ever puts packets in WC
> > memory.
> > 
> > I guess we'll cross this bridge when we get to it.
> 
> Non-TSO architectures seem important...
> 
> Paolo

Only if there are virtio offload engines on platforms with these
architectures :) Worth working on for sure but doesn't have to block
this patchset.
Michael S. Tsirkin April 19, 2018, 4:48 p.m. UTC | #14
On Thu, Apr 19, 2018 at 06:07:07PM +0200, Paolo Bonzini wrote:
> On 19/04/2018 17:59, Michael S. Tsirkin wrote:
> > On Thu, Apr 19, 2018 at 05:51:51PM +0200, Paolo Bonzini wrote:
> >> On 19/04/2018 17:19, Michael S. Tsirkin wrote:
> >>>> - if we make it 1 when weak barriers are needed, the device also needs
> >>>> to nack feature negotiation (not allow setting the FEATURES_OK) if the
> >>>> bit is not set by the driver.
> >>>>  However, that is not enough.  Live
> >>>> migration assumes that it is okay to migrate a virtual machine from a
> >>>> source that doesn't support a feature to a destination that supports it.
> >>>>  In this case, it would assume that it is okay to migrate from software
> >>>> virtio to hardware virtio.  This is wrong because the destination would
> >>>> use weak barriers
> >>>
> >>> You can't migrate between systems with different sets of device features
> >>> right now.
> >>
> >> Yes, you can, exactly because some features are defined not by the
> >> machine type but rather by the host kernel.  See virtio_net_get_features
> >> in QEMU's hw/virtio/virtio-net.c, and virtio_set_features_nocheck in
> >> QEMU's hw/virtio/virtio.c.
> > 
> > Oh you are right. Well we can just special-case that one :)
> 
> Anything wrong with bumping the PCI revision?
> 
> Paolo

Nothing except in virtio 1 bumping the revision does not prevent
loading the old driver.
Liang, Cunming April 19, 2018, 4:52 p.m. UTC | #15
> -----Original Message-----

> From: Paolo Bonzini [mailto:pbonzini@redhat.com]

> Sent: Thursday, April 19, 2018 11:52 PM

> To: Michael S. Tsirkin <mst@redhat.com>; Liang, Cunming

> <cunming.liang@intel.com>

> Cc: Bie, Tiwei <tiwei.bie@intel.com>; jasowang@redhat.com;

> alex.williamson@redhat.com; stefanha@redhat.com; qemu-devel@nongnu.org;

> virtio-dev@lists.oasis-open.org; Daly, Dan <dan.daly@intel.com>; Tan, Jianfeng

> <jianfeng.tan@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Wang,

> Xiao W <xiao.w.wang@intel.com>

> Subject: Re: [PATCH v3 6/6] vhost-user: support registering external host

> notifiers

> 

> On 19/04/2018 17:42, Michael S. Tsirkin wrote:

> >> A compiler barrier is enough on strongly-ordered memory platform.

> >> As it doesn't re-order store, PCI device won't see a stale index

> >> value. But a weakly-ordered memory needs sfence.

> >

> > Oh you are right.

> >

> > So it's only needed for non-intel platforms or when packets are in WC

> > memory then. And I don't know whether dpdk ever puts packets in WC

> > memory.

> >

> > I guess we'll cross this bridge when we get to it.

> 

> Non-TSO architectures seem important...


I'm not familiar with Non-TSO, trying to understand the difference according to the feature set. Let's say non-TSO architectures do not set 'weak_barriers'. Then mandatory barrier is used for software. HW offload on that platform would choose different feature set against software? 
If it's not, essentially we're worried about live migration from a TSO to a non-TSO architectures platform?

> 

> Paolo
Michael S. Tsirkin April 19, 2018, 4:55 p.m. UTC | #16
On Thu, Apr 19, 2018 at 04:24:29PM +0000, Liang, Cunming wrote:
> 
> 
> > -----Original Message-----
> > From: Michael S. Tsirkin [mailto:mst@redhat.com]
> > Sent: Thursday, April 19, 2018 11:19 PM
> > To: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Liang, Cunming <cunming.liang@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> > jasowang@redhat.com; alex.williamson@redhat.com; stefanha@redhat.com;
> > qemu-devel@nongnu.org; virtio-dev@lists.oasis-open.org; Daly, Dan
> > <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>
> > Subject: Re: [virtio-dev] RE: [PATCH v3 6/6] vhost-user: support registering
> > external host notifiers
> > 
> > On Thu, Apr 19, 2018 at 03:02:40PM +0200, Paolo Bonzini wrote:
> > > On 19/04/2018 14:43, Liang, Cunming wrote:
> > > >> 2. Memory barriers. Right now after updating the avail idx, virtio
> > > >> does smp_wmb() and then the MMIO write. Normal hardware drivers do
> > > >> wmb() which is an sfence. Can a PCI device read bypass index write
> > > >> and see a stale index value?
> > > >
> > > > A compiler barrier is enough on strongly-ordered memory platform. As
> > > > it doesn't re-order store, PCI device won't see a stale index value.
> > > > But a weakly-ordered memory needs sfence.
> > >
> > > That is complicated then.  We need to define a feature bit and (in the
> > > Linux driver) propagate it to vring_create_virtqueue's weak_barrier
> > > argument.  However:
> > >
> > > - if we make it 1 when weak barriers are needed, the device also needs
> > > to nack feature negotiation (not allow setting the FEATURES_OK) if the
> > > bit is not set by the driver.
> > >  However, that is not enough.  Live
> > > migration assumes that it is okay to migrate a virtual machine from a
> > > source that doesn't support a feature to a destination that supports it.
> > >  In this case, it would assume that it is okay to migrate from
> > > software virtio to hardware virtio.  This is wrong because the
> > > destination would use weak barriers
> > 
> > You can't migrate between systems with different sets of device features right
> > now.
> > 
> > > - if we make it 1 when strong barriers are enough, software virtio
> > > devices needs to be updated to expose the bit.  This works, including
> > > live migration, but updated drivers will now go slower when run
> > > against an old device that doesn't know the feature bit.
> > >
> > > Maybe bump the PCI revision, so that only the new revision has the bit?
> > >
> > > Thanks,
> > >
> > > Paolo
> > 
> > As a first step, if you want to migrate to a HW offloaded solution then you need
> > to enable the feature.
> 
> > It does mean it will go a bit slower when run with software,
> > so it's only good if most systems in your cluster do have the HW offload.
> To clarify a bit more, it's suboptimal to always use mandatory barriers for MMIO. Per strongly-order memory, 'weak barriers' (smp_wmb) is pretty good for MMIO. The tradeoff doesn't always happen, software and HW offload can align on the same page.

I agree to all of the above except where you say smp_wmb.

smp_wmb is for controlling SMP effects on Linux, and I suspect
it will not do the right thing on some non-Intel architectures.

The claim is I think correct for Intel/AMD platforms, and probably
other strongly ordered ones. I suspect it's incorrect for ARM and
power.

Replace smp_wmb with 'asm volatile ("") on Intel' and I'll agree.



> > I think we can start by getting that working and think about ways to improve
> > down the road.
> > 
> > 
> > That's the usecase we designed FEATURES_OK for though, so I do think/hope it's
> > enough and we don't need to play with revisions.
> > 
> > 
> > --
> > MST
Paolo Bonzini April 19, 2018, 4:59 p.m. UTC | #17
On 19/04/2018 18:52, Liang, Cunming wrote:
>>> Oh you are right.
>>> 
>>> So it's only needed for non-intel platforms or when packets are
>>> in WC memory then. And I don't know whether dpdk ever puts
>>> packets in WC memory.
>>> 
>>> I guess we'll cross this bridge when we get to it.
>> Non-TSO architectures seem important...
>
> I'm not familiar with Non-TSO, trying to understand the difference
> according to the feature set. Let's say non-TSO architectures do not
> set 'weak_barriers'. Then mandatory barrier is used for software. HW
> offload on that platform would choose different feature set against
> software? If it's not, essentially we're worried about live migration
> from a TSO to a non-TSO architectures platform?

I'm worried about live migration from software virtio to hardware virtio
on non-TSO architectures.  For example, on ARM you would have a "dmb
ishst" (smp_wmb) for software virtio and a "dsb st" (wmb) or "dmb oshst"
(dma_wmb) for hardware virtio.

For this to work, you would have to set up the VM so that it uses the
heavier barriers from the beginning, even when backed by software virtio.

Paolo
Michael S. Tsirkin April 19, 2018, 5 p.m. UTC | #18
On Thu, Apr 19, 2018 at 04:52:20PM +0000, Liang, Cunming wrote:
> 
> 
> > -----Original Message-----
> > From: Paolo Bonzini [mailto:pbonzini@redhat.com]
> > Sent: Thursday, April 19, 2018 11:52 PM
> > To: Michael S. Tsirkin <mst@redhat.com>; Liang, Cunming
> > <cunming.liang@intel.com>
> > Cc: Bie, Tiwei <tiwei.bie@intel.com>; jasowang@redhat.com;
> > alex.williamson@redhat.com; stefanha@redhat.com; qemu-devel@nongnu.org;
> > virtio-dev@lists.oasis-open.org; Daly, Dan <dan.daly@intel.com>; Tan, Jianfeng
> > <jianfeng.tan@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Wang,
> > Xiao W <xiao.w.wang@intel.com>
> > Subject: Re: [PATCH v3 6/6] vhost-user: support registering external host
> > notifiers
> > 
> > On 19/04/2018 17:42, Michael S. Tsirkin wrote:
> > >> A compiler barrier is enough on strongly-ordered memory platform.
> > >> As it doesn't re-order store, PCI device won't see a stale index
> > >> value. But a weakly-ordered memory needs sfence.
> > >
> > > Oh you are right.
> > >
> > > So it's only needed for non-intel platforms or when packets are in WC
> > > memory then. And I don't know whether dpdk ever puts packets in WC
> > > memory.
> > >
> > > I guess we'll cross this bridge when we get to it.
> > 
> > Non-TSO architectures seem important...
> 
> I'm not familiar with Non-TSO, trying to understand the difference
> according to the feature set. Let's say non-TSO architectures do not
> set 'weak_barriers'. Then mandatory barrier is used for software. HW
> offload on that platform would choose different feature set against
> software? 

Right. We'll need a flag for this feature for starters. It doesn't exist
:) Paolo also points out that we should then add code to disallow
migration between setups with and without the feature.

> If it's not, essentially we're worried about live migration from a TSO to a non-TSO architectures platform?

Probably not.

> > 
> > Paolo
Michael S. Tsirkin April 19, 2018, 5:27 p.m. UTC | #19
On Thu, Apr 19, 2018 at 06:59:39PM +0200, Paolo Bonzini wrote:
> On 19/04/2018 18:52, Liang, Cunming wrote:
> >>> Oh you are right.
> >>> 
> >>> So it's only needed for non-intel platforms or when packets are
> >>> in WC memory then. And I don't know whether dpdk ever puts
> >>> packets in WC memory.
> >>> 
> >>> I guess we'll cross this bridge when we get to it.
> >> Non-TSO architectures seem important...
> >
> > I'm not familiar with Non-TSO, trying to understand the difference
> > according to the feature set. Let's say non-TSO architectures do not
> > set 'weak_barriers'. Then mandatory barrier is used for software. HW
> > offload on that platform would choose different feature set against
> > software? If it's not, essentially we're worried about live migration
> > from a TSO to a non-TSO architectures platform?
> 
> I'm worried about live migration from software virtio to hardware virtio
> on non-TSO architectures.  For example, on ARM you would have a "dmb
> ishst" (smp_wmb) for software virtio and a "dsb st" (wmb) or "dmb oshst"
> (dma_wmb) for hardware virtio.
> 
> For this to work, you would have to set up the VM so that it uses the
> heavier barriers from the beginning, even when backed by software virtio.
> 
> Paolo

Right. Or disallow hardware to software migrations.

But generally the mandatory and even dma barriers in Linux are often an overkill.

See ARM for example: 

#if defined(CONFIG_ARM_DMA_MEM_BUFFERABLE) || defined(CONFIG_SMP)
#define mb()            __arm_heavy_mb()
#define rmb()           dsb()
#define wmb()           __arm_heavy_mb(st)
#define dma_rmb()       dmb(osh)
#define dma_wmb()       dmb(oshst)
#else
#define mb()            barrier()
#define rmb()           barrier()
#define wmb()           barrier()
#define dma_rmb()       barrier()
#define dma_wmb()       barrier()
#endif

That CONFIG_SMP here is clearly wrong but I don't really know what
to set it to. Also, we probably should switch virtio_wmb to dma_XX
barriers.

That's actually easy. Will try to do.
Paolo Bonzini April 19, 2018, 5:35 p.m. UTC | #20
On 19/04/2018 19:27, Michael S. Tsirkin wrote:
> 
> That CONFIG_SMP here is clearly wrong but I don't really know what
> to set it to. Also, we probably should switch virtio_wmb to dma_XX
> barriers.
> 
> That's actually easy. Will try to do.

Should it be dma_wmb() before updating the indices, and wmb() before
writing the "kick virtqueue" register?

Paolo
Michael S. Tsirkin April 19, 2018, 5:39 p.m. UTC | #21
On Thu, Apr 19, 2018 at 07:35:57PM +0200, Paolo Bonzini wrote:
> On 19/04/2018 19:27, Michael S. Tsirkin wrote:
> > 
> > That CONFIG_SMP here is clearly wrong but I don't really know what
> > to set it to. Also, we probably should switch virtio_wmb to dma_XX
> > barriers.
> > 
> > That's actually easy. Will try to do.
> 
> Should it be dma_wmb() before updating the indices, and wmb() before
> writing the "kick virtqueue" register?
> 
> Paolo

if anything kick virtqueue should do wmb internally
within virtio pci.

No one uses strong barriers with virtio pci right now
so we don't do it.
Liang, Cunming April 19, 2018, 11:05 p.m. UTC | #22
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:mst@redhat.com]
> Sent: Friday, April 20, 2018 1:01 AM
> To: Liang, Cunming <cunming.liang@intel.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> jasowang@redhat.com; alex.williamson@redhat.com; stefanha@redhat.com;
> qemu-devel@nongnu.org; virtio-dev@lists.oasis-open.org; Daly, Dan
> <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>
> Subject: Re: [PATCH v3 6/6] vhost-user: support registering external host
> notifiers
> 
> On Thu, Apr 19, 2018 at 04:52:20PM +0000, Liang, Cunming wrote:
> >
> >
> > > -----Original Message-----
> > > From: Paolo Bonzini [mailto:pbonzini@redhat.com]
> > > Sent: Thursday, April 19, 2018 11:52 PM
> > > To: Michael S. Tsirkin <mst@redhat.com>; Liang, Cunming
> > > <cunming.liang@intel.com>
> > > Cc: Bie, Tiwei <tiwei.bie@intel.com>; jasowang@redhat.com;
> > > alex.williamson@redhat.com; stefanha@redhat.com;
> > > qemu-devel@nongnu.org; virtio-dev@lists.oasis-open.org; Daly, Dan
> > > <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang,
> > > Zhihong <zhihong.wang@intel.com>; Wang, Xiao W
> > > <xiao.w.wang@intel.com>
> > > Subject: Re: [PATCH v3 6/6] vhost-user: support registering external
> > > host notifiers
> > >
> > > On 19/04/2018 17:42, Michael S. Tsirkin wrote:
> > > >> A compiler barrier is enough on strongly-ordered memory platform.
> > > >> As it doesn't re-order store, PCI device won't see a stale index
> > > >> value. But a weakly-ordered memory needs sfence.
> > > >
> > > > Oh you are right.
> > > >
> > > > So it's only needed for non-intel platforms or when packets are in
> > > > WC memory then. And I don't know whether dpdk ever puts packets in
> > > > WC memory.
> > > >
> > > > I guess we'll cross this bridge when we get to it.
> > >
> > > Non-TSO architectures seem important...
> >
> > I'm not familiar with Non-TSO, trying to understand the difference
> > according to the feature set. Let's say non-TSO architectures do not
> > set 'weak_barriers'. Then mandatory barrier is used for software. HW
> > offload on that platform would choose different feature set against
> > software?
> 
> Right. We'll need a flag for this feature for starters. It doesn't exist
> :) Paolo also points out that we should then add code to disallow migration
> between setups with and without the feature.
I see. Thanks.

> 
> > If it's not, essentially we're worried about live migration from a TSO to a non-
> TSO architectures platform?
> 
> Probably not.
> 
> > >
> > > Paolo
Liang, Cunming April 20, 2018, 3:01 a.m. UTC | #23
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:mst@redhat.com]
> Sent: Friday, April 20, 2018 12:56 AM
> To: Liang, Cunming <cunming.liang@intel.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> jasowang@redhat.com; alex.williamson@redhat.com; stefanha@redhat.com;
> qemu-devel@nongnu.org; virtio-dev@lists.oasis-open.org; Daly, Dan
> <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>
> Subject: Re: [virtio-dev] RE: [PATCH v3 6/6] vhost-user: support registering
> external host notifiers
> 
> On Thu, Apr 19, 2018 at 04:24:29PM +0000, Liang, Cunming wrote:
> >
> >
> > > -----Original Message-----
> > > From: Michael S. Tsirkin [mailto:mst@redhat.com]
> > > Sent: Thursday, April 19, 2018 11:19 PM
> > > To: Paolo Bonzini <pbonzini@redhat.com>
> > > Cc: Liang, Cunming <cunming.liang@intel.com>; Bie, Tiwei
> > > <tiwei.bie@intel.com>; jasowang@redhat.com;
> > > alex.williamson@redhat.com; stefanha@redhat.com;
> > > qemu-devel@nongnu.org; virtio-dev@lists.oasis-open.org; Daly, Dan
> > > <dan.daly@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Wang,
> > > Zhihong <zhihong.wang@intel.com>; Wang, Xiao W
> > > <xiao.w.wang@intel.com>
> > > Subject: Re: [virtio-dev] RE: [PATCH v3 6/6] vhost-user: support
> > > registering external host notifiers
> > >
> > > On Thu, Apr 19, 2018 at 03:02:40PM +0200, Paolo Bonzini wrote:
> > > > On 19/04/2018 14:43, Liang, Cunming wrote:
> > > > >> 2. Memory barriers. Right now after updating the avail idx,
> > > > >> virtio does smp_wmb() and then the MMIO write. Normal hardware
> > > > >> drivers do
> > > > >> wmb() which is an sfence. Can a PCI device read bypass index
> > > > >> write and see a stale index value?
> > > > >
> > > > > A compiler barrier is enough on strongly-ordered memory
> > > > > platform. As it doesn't re-order store, PCI device won't see a stale index
> value.
> > > > > But a weakly-ordered memory needs sfence.
> > > >
> > > > That is complicated then.  We need to define a feature bit and (in
> > > > the Linux driver) propagate it to vring_create_virtqueue's
> > > > weak_barrier argument.  However:
> > > >
> > > > - if we make it 1 when weak barriers are needed, the device also
> > > > needs to nack feature negotiation (not allow setting the
> > > > FEATURES_OK) if the bit is not set by the driver.
> > > >  However, that is not enough.  Live migration assumes that it is
> > > > okay to migrate a virtual machine from a source that doesn't
> > > > support a feature to a destination that supports it.
> > > >  In this case, it would assume that it is okay to migrate from
> > > > software virtio to hardware virtio.  This is wrong because the
> > > > destination would use weak barriers
> > >
> > > You can't migrate between systems with different sets of device
> > > features right now.
> > >
> > > > - if we make it 1 when strong barriers are enough, software virtio
> > > > devices needs to be updated to expose the bit.  This works,
> > > > including live migration, but updated drivers will now go slower
> > > > when run against an old device that doesn't know the feature bit.
> > > >
> > > > Maybe bump the PCI revision, so that only the new revision has the bit?
> > > >
> > > > Thanks,
> > > >
> > > > Paolo
> > >
> > > As a first step, if you want to migrate to a HW offloaded solution
> > > then you need to enable the feature.
> >
> > > It does mean it will go a bit slower when run with software, so it's
> > > only good if most systems in your cluster do have the HW offload.
> > To clarify a bit more, it's suboptimal to always use mandatory barriers for MMIO.
> Per strongly-order memory, 'weak barriers' (smp_wmb) is pretty good for MMIO.
> The tradeoff doesn't always happen, software and HW offload can align on the
> same page.
> 
> I agree to all of the above except where you say smp_wmb.
> 
> smp_wmb is for controlling SMP effects on Linux, and I suspect it will not do the
> right thing on some non-Intel architectures.
> 
> The claim is I think correct for Intel/AMD platforms, and probably other strongly
> ordered ones. I suspect it's incorrect for ARM and power.
> 
> Replace smp_wmb with 'asm volatile ("") on Intel' and I'll agree.

Yeah, that's more accurate. 

> 
> 
> 
> > > I think we can start by getting that working and think about ways to
> > > improve down the road.
> > >
> > >
> > > That's the usecase we designed FEATURES_OK for though, so I do
> > > think/hope it's enough and we don't need to play with revisions.
> > >
> > >
> > > --
> > > MST
Tiwei Bie May 2, 2018, 10:32 a.m. UTC | #24
On Thu, Apr 19, 2018 at 07:14:39PM +0800, Tiwei Bie wrote:
> On Wed, Apr 18, 2018 at 07:34:06PM +0300, Michael S. Tsirkin wrote:
> > On Thu, Apr 12, 2018 at 11:12:32PM +0800, Tiwei Bie wrote:
> > > This patch introduces VHOST_USER_PROTOCOL_F_HOST_NOTIFIER.
> > > With this feature negotiated, vhost-user backend can register
> > > memory region based host notifiers. And it will allow the guest
> > > driver in the VM to notify the hardware accelerator at the
> > > vhost-user backend directly.
> > > 
> > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > 
> > Overall I think we can merge this approach, but I have two main concerns
> > about this:
> > 
> > 1. Testing. Most people do not have the virtio hardware
> >    so how to make sure this does not bit rot?
> > 
> >    I have an idea: add an option like this to libvhost-user.
> >    Naturally libvhost-user can not get notified about a write
> >    to an mmapped area, but it can write a special value there
> >    (e.g. all-ones?) and then poll it to detect VQ # writes.
> > 
> >    Then include a vhost user bridge test with an option like this.
> > 
> >    I'd like to see a patch doing this.
> 
> Sure, I'll do it. Thanks for the suggestion!
> 
> > 
> > 2. Memory barriers. Right now after updating the avail idx,
> >    virtio does smp_wmb() and then the MMIO write.
> >    Normal hardware drivers do wmb() which is an sfence.
> >    Can a PCI device read bypass index write and see a stale
> >    index value?
> 
> It depends on arch's memory model. Cunming will provide
> more details about this later.
> 
> >    To make virtio pci do wmb() we would need a new feature bit.
> >    Alternatively I guess we could maybe look at subsystem vendor/device id.
> > 
> >    I'd like to see a patch doing one of these things.
> 
> We prefer to add a new feature bit as it's a more robust
> way to do this. I'll send out some patches soon.
> 
> Thank you very much! :)

Hi Michael,

It seems that we have started to merge patches for QEMU 2.13.
Currently, all your concerns about this patch set have been
addressed or WIP. Do you think it's possible to get this patch
set merged first? Or do you have any other concerns? Thanks!

Best regards,
Tiwei Bie


> 
> Best regards,
> Tiwei Bie
> 
> > 
> > Thanks!
> > 
> > > ---
> > >  docs/interop/vhost-user.txt    |  33 +++++++++++
> > >  hw/virtio/vhost-user.c         | 123 +++++++++++++++++++++++++++++++++++++++++
> > >  include/hw/virtio/vhost-user.h |   8 +++
> > >  3 files changed, 164 insertions(+)
> > > 
> > > diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
> > > index 534caab18a..9e57b36b20 100644
> > > --- a/docs/interop/vhost-user.txt
> > > +++ b/docs/interop/vhost-user.txt
> > > @@ -132,6 +132,16 @@ Depending on the request type, payload can be:
> > >     Payload: Size bytes array holding the contents of the virtio
> > >         device's configuration space
> > >  
> > > + * Vring area description
> > > +   -----------------------
> > > +   | u64 | size | offset |
> > > +   -----------------------
> > > +
> > > +   u64: a 64-bit integer contains vring index and flags
> > > +   Size: a 64-bit size of this area
> > > +   Offset: a 64-bit offset of this area from the start of the
> > > +       supplied file descriptor
> > > +
> > >  In QEMU the vhost-user message is implemented with the following struct:
> > >  
> > >  typedef struct VhostUserMsg {
> > > @@ -146,6 +156,7 @@ typedef struct VhostUserMsg {
> > >          VhostUserLog log;
> > >          struct vhost_iotlb_msg iotlb;
> > >          VhostUserConfig config;
> > > +        VhostUserVringArea area;
> > >      };
> > >  } QEMU_PACKED VhostUserMsg;
> > >  
> > > @@ -380,6 +391,7 @@ Protocol features
> > >  #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
> > >  #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
> > >  #define VHOST_USER_PROTOCOL_F_CONFIG         9
> > > +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  10
> > >  
> > >  Master message types
> > >  --------------------
> > > @@ -777,6 +789,27 @@ Slave message types
> > >       the VHOST_USER_NEED_REPLY flag, master must respond with zero when
> > >       operation is successfully completed, or non-zero otherwise.
> > >  
> > > + * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG
> > > +
> > > +      Id: 3
> > > +      Equivalent ioctl: N/A
> > > +      Slave payload: vring area description
> > > +      Master payload: N/A
> > > +
> > > +      Sets host notifier for a specified queue. The queue index is contained
> > > +      in the u64 field of the vring area description. The host notifier is
> > > +      described by the file descriptor (typically it's a VFIO device fd) which
> > > +      is passed as ancillary data and the size (which is mmap size and should
> > > +      be the same as host page size) and offset (which is mmap offset) carried
> > > +      in the vring area description. QEMU can mmap the file descriptor based
> > > +      on the size and offset to get a memory range. Registering a host notifier
> > > +      means mapping this memory range to the VM as the specified queue's notify
> > > +      MMIO region. Slave sends this request to tell QEMU to de-register the
> > > +      existing notifier if any and register the new notifier if the request is
> > > +      sent with a file descriptor.
> > > +      This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
> > > +      protocol feature has been successfully negotiated.
> > > +
> > >  VHOST_USER_PROTOCOL_F_REPLY_ACK:
> > >  -------------------------------
> > >  The original vhost-user specification only demands replies for certain
> > > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> > > index 791e0a4763..1cd9c7276b 100644
> > > --- a/hw/virtio/vhost-user.c
> > > +++ b/hw/virtio/vhost-user.c
> > > @@ -13,6 +13,7 @@
> > >  #include "hw/virtio/vhost.h"
> > >  #include "hw/virtio/vhost-user.h"
> > >  #include "hw/virtio/vhost-backend.h"
> > > +#include "hw/virtio/virtio.h"
> > >  #include "hw/virtio/virtio-net.h"
> > >  #include "chardev/char-fe.h"
> > >  #include "sysemu/kvm.h"
> > > @@ -48,6 +49,7 @@ enum VhostUserProtocolFeature {
> > >      VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
> > >      VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
> > >      VHOST_USER_PROTOCOL_F_CONFIG = 9,
> > > +    VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 10,
> > >      VHOST_USER_PROTOCOL_F_MAX
> > >  };
> > >  
> > > @@ -92,6 +94,7 @@ typedef enum VhostUserSlaveRequest {
> > >      VHOST_USER_SLAVE_NONE = 0,
> > >      VHOST_USER_SLAVE_IOTLB_MSG = 1,
> > >      VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
> > > +    VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
> > >      VHOST_USER_SLAVE_MAX
> > >  }  VhostUserSlaveRequest;
> > >  
> > > @@ -136,6 +139,12 @@ static VhostUserConfig c __attribute__ ((unused));
> > >                                     + sizeof(c.size) \
> > >                                     + sizeof(c.flags))
> > >  
> > > +typedef struct VhostUserVringArea {
> > > +    uint64_t u64;
> > > +    uint64_t size;
> > > +    uint64_t offset;
> > > +} VhostUserVringArea;
> > > +
> > >  typedef struct {
> > >      VhostUserRequest request;
> > >  
> > > @@ -157,6 +166,7 @@ typedef union {
> > >          struct vhost_iotlb_msg iotlb;
> > >          VhostUserConfig config;
> > >          VhostUserCryptoSession session;
> > > +        VhostUserVringArea area;
> > >  } VhostUserPayload;
> > >  
> > >  typedef struct VhostUserMsg {
> > > @@ -638,9 +648,37 @@ static int vhost_user_set_vring_num(struct vhost_dev *dev,
> > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);
> > >  }
> > >  
> > > +static void vhost_user_host_notifier_restore(struct vhost_dev *dev,
> > > +                                             int queue_idx)
> > > +{
> > > +    struct vhost_user *u = dev->opaque;
> > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > > +    VirtIODevice *vdev = dev->vdev;
> > > +
> > > +    if (n->addr && !n->set) {
> > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true);
> > > +        n->set = true;
> > > +    }
> > > +}
> > > +
> > > +static void vhost_user_host_notifier_remove(struct vhost_dev *dev,
> > > +                                            int queue_idx)
> > > +{
> > > +    struct vhost_user *u = dev->opaque;
> > > +    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
> > > +    VirtIODevice *vdev = dev->vdev;
> > > +
> > > +    if (n->addr && n->set) {
> > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> > > +        n->set = false;
> > > +    }
> > > +}
> > > +
> > >  static int vhost_user_set_vring_base(struct vhost_dev *dev,
> > >                                       struct vhost_vring_state *ring)
> > >  {
> > > +    vhost_user_host_notifier_restore(dev, ring->index);
> > > +
> > >      return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
> > >  }
> > >  
> > > @@ -674,6 +712,8 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev,
> > >          .hdr.size = sizeof(msg.payload.state),
> > >      };
> > >  
> > > +    vhost_user_host_notifier_remove(dev, ring->index);
> > > +
> > >      if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
> > >          return -1;
> > >      }
> > > @@ -847,6 +887,76 @@ static int vhost_user_slave_handle_config_change(struct vhost_dev *dev)
> > >      return ret;
> > >  }
> > >  
> > > +static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
> > > +                                                       VhostUserVringArea *area,
> > > +                                                       int fd)
> > > +{
> > > +    int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
> > > +    size_t page_size = qemu_real_host_page_size;
> > > +    struct vhost_user *u = dev->opaque;
> > > +    VhostUserState *user = u->user;
> > > +    VirtIODevice *vdev = dev->vdev;
> > > +    VhostUserHostNotifier *n;
> > > +    int ret = 0;
> > > +    void *addr;
> > > +    char *name;
> > > +
> > > +    if (!virtio_has_feature(dev->protocol_features,
> > > +                            VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
> > > +        vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
> > > +        ret = -1;
> > > +        goto out;
> > > +    }
> > > +
> > > +    n = &user->notifier[queue_idx];
> > > +
> > > +    if (n->addr) {
> > > +        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
> > > +        object_unparent(OBJECT(&n->mr));
> > > +        munmap(n->addr, page_size);
> > > +        n->addr = NULL;
> > > +    }
> > > +
> > > +    if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
> > > +        goto out;
> > > +    }
> > > +
> > > +    /* Sanity check. */
> > > +    if (area->size != page_size) {
> > > +        ret = -1;
> > > +        goto out;
> > > +    }
> > > +
> > > +    addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > > +                fd, area->offset);
> > > +    if (addr == MAP_FAILED) {
> > > +        ret = -1;
> > > +        goto out;
> > > +    }
> > > +
> > > +    name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
> > > +                           user, queue_idx);
> > > +    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
> > > +                                      page_size, addr);
> > > +    g_free(name);
> > > +
> > > +    if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
> > > +        munmap(addr, page_size);
> > > +        ret = -1;
> > > +        goto out;
> > > +    }
> > > +
> > > +    n->addr = addr;
> > > +    n->set = true;
> > > +
> > > +out:
> > > +    /* Always close the fd. */
> > > +    if (fd != -1) {
> > > +        close(fd);
> > > +    }
> > > +    return ret;
> > > +}
> > > +
> > >  static void slave_read(void *opaque)
> > >  {
> > >      struct vhost_dev *dev = opaque;
> > > @@ -913,6 +1023,10 @@ static void slave_read(void *opaque)
> > >      case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
> > >          ret = vhost_user_slave_handle_config_change(dev);
> > >          break;
> > > +    case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
> > > +        ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
> > > +                                                          fd);
> > > +        break;
> > >      default:
> > >          error_report("Received unexpected msg type.");
> > >          if (fd != -1) {
> > > @@ -1641,6 +1755,15 @@ VhostUserState *vhost_user_init(void)
> > >  
> > >  void vhost_user_cleanup(VhostUserState *user)
> > >  {
> > > +    int i;
> > > +
> > > +    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
> > > +        if (user->notifier[i].addr) {
> > > +            object_unparent(OBJECT(&user->notifier[i].mr));
> > > +            munmap(user->notifier[i].addr, qemu_real_host_page_size);
> > > +            user->notifier[i].addr = NULL;
> > > +        }
> > > +    }
> > >  }
> > >  
> > >  const VhostOps user_ops = {
> > > diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> > > index eb8bc0d90d..fd660393a0 100644
> > > --- a/include/hw/virtio/vhost-user.h
> > > +++ b/include/hw/virtio/vhost-user.h
> > > @@ -9,9 +9,17 @@
> > >  #define HW_VIRTIO_VHOST_USER_H
> > >  
> > >  #include "chardev/char-fe.h"
> > > +#include "hw/virtio/virtio.h"
> > > +
> > > +typedef struct VhostUserHostNotifier {
> > > +    MemoryRegion mr;
> > > +    void *addr;
> > > +    bool set;
> > > +} VhostUserHostNotifier;
> > >  
> > >  typedef struct VhostUserState {
> > >      CharBackend *chr;
> > > +    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
> > >  } VhostUserState;
> > >  
> > >  VhostUserState *vhost_user_init(void);
> > > -- 
> > > 2.11.0
diff mbox

Patch

diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index 534caab18a..9e57b36b20 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -132,6 +132,16 @@  Depending on the request type, payload can be:
    Payload: Size bytes array holding the contents of the virtio
        device's configuration space
 
+ * Vring area description
+   -----------------------
+   | u64 | size | offset |
+   -----------------------
+
+   u64: a 64-bit integer contains vring index and flags
+   Size: a 64-bit size of this area
+   Offset: a 64-bit offset of this area from the start of the
+       supplied file descriptor
+
 In QEMU the vhost-user message is implemented with the following struct:
 
 typedef struct VhostUserMsg {
@@ -146,6 +156,7 @@  typedef struct VhostUserMsg {
         VhostUserLog log;
         struct vhost_iotlb_msg iotlb;
         VhostUserConfig config;
+        VhostUserVringArea area;
     };
 } QEMU_PACKED VhostUserMsg;
 
@@ -380,6 +391,7 @@  Protocol features
 #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
 #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
 #define VHOST_USER_PROTOCOL_F_CONFIG         9
+#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  10
 
 Master message types
 --------------------
@@ -777,6 +789,27 @@  Slave message types
      the VHOST_USER_NEED_REPLY flag, master must respond with zero when
      operation is successfully completed, or non-zero otherwise.
 
+ * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG
+
+      Id: 3
+      Equivalent ioctl: N/A
+      Slave payload: vring area description
+      Master payload: N/A
+
+      Sets host notifier for a specified queue. The queue index is contained
+      in the u64 field of the vring area description. The host notifier is
+      described by the file descriptor (typically it's a VFIO device fd) which
+      is passed as ancillary data and the size (which is mmap size and should
+      be the same as host page size) and offset (which is mmap offset) carried
+      in the vring area description. QEMU can mmap the file descriptor based
+      on the size and offset to get a memory range. Registering a host notifier
+      means mapping this memory range to the VM as the specified queue's notify
+      MMIO region. Slave sends this request to tell QEMU to de-register the
+      existing notifier if any and register the new notifier if the request is
+      sent with a file descriptor.
+      This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
+      protocol feature has been successfully negotiated.
+
 VHOST_USER_PROTOCOL_F_REPLY_ACK:
 -------------------------------
 The original vhost-user specification only demands replies for certain
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 791e0a4763..1cd9c7276b 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -13,6 +13,7 @@ 
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost-user.h"
 #include "hw/virtio/vhost-backend.h"
+#include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio-net.h"
 #include "chardev/char-fe.h"
 #include "sysemu/kvm.h"
@@ -48,6 +49,7 @@  enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
     VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
     VHOST_USER_PROTOCOL_F_CONFIG = 9,
+    VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 10,
     VHOST_USER_PROTOCOL_F_MAX
 };
 
@@ -92,6 +94,7 @@  typedef enum VhostUserSlaveRequest {
     VHOST_USER_SLAVE_NONE = 0,
     VHOST_USER_SLAVE_IOTLB_MSG = 1,
     VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
+    VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
     VHOST_USER_SLAVE_MAX
 }  VhostUserSlaveRequest;
 
@@ -136,6 +139,12 @@  static VhostUserConfig c __attribute__ ((unused));
                                    + sizeof(c.size) \
                                    + sizeof(c.flags))
 
+typedef struct VhostUserVringArea {
+    uint64_t u64;
+    uint64_t size;
+    uint64_t offset;
+} VhostUserVringArea;
+
 typedef struct {
     VhostUserRequest request;
 
@@ -157,6 +166,7 @@  typedef union {
         struct vhost_iotlb_msg iotlb;
         VhostUserConfig config;
         VhostUserCryptoSession session;
+        VhostUserVringArea area;
 } VhostUserPayload;
 
 typedef struct VhostUserMsg {
@@ -638,9 +648,37 @@  static int vhost_user_set_vring_num(struct vhost_dev *dev,
     return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);
 }
 
+static void vhost_user_host_notifier_restore(struct vhost_dev *dev,
+                                             int queue_idx)
+{
+    struct vhost_user *u = dev->opaque;
+    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
+    VirtIODevice *vdev = dev->vdev;
+
+    if (n->addr && !n->set) {
+        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true);
+        n->set = true;
+    }
+}
+
+static void vhost_user_host_notifier_remove(struct vhost_dev *dev,
+                                            int queue_idx)
+{
+    struct vhost_user *u = dev->opaque;
+    VhostUserHostNotifier *n = &u->user->notifier[queue_idx];
+    VirtIODevice *vdev = dev->vdev;
+
+    if (n->addr && n->set) {
+        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
+        n->set = false;
+    }
+}
+
 static int vhost_user_set_vring_base(struct vhost_dev *dev,
                                      struct vhost_vring_state *ring)
 {
+    vhost_user_host_notifier_restore(dev, ring->index);
+
     return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
 }
 
@@ -674,6 +712,8 @@  static int vhost_user_get_vring_base(struct vhost_dev *dev,
         .hdr.size = sizeof(msg.payload.state),
     };
 
+    vhost_user_host_notifier_remove(dev, ring->index);
+
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
         return -1;
     }
@@ -847,6 +887,76 @@  static int vhost_user_slave_handle_config_change(struct vhost_dev *dev)
     return ret;
 }
 
+static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
+                                                       VhostUserVringArea *area,
+                                                       int fd)
+{
+    int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
+    size_t page_size = qemu_real_host_page_size;
+    struct vhost_user *u = dev->opaque;
+    VhostUserState *user = u->user;
+    VirtIODevice *vdev = dev->vdev;
+    VhostUserHostNotifier *n;
+    int ret = 0;
+    void *addr;
+    char *name;
+
+    if (!virtio_has_feature(dev->protocol_features,
+                            VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
+        vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
+        ret = -1;
+        goto out;
+    }
+
+    n = &user->notifier[queue_idx];
+
+    if (n->addr) {
+        virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false);
+        object_unparent(OBJECT(&n->mr));
+        munmap(n->addr, page_size);
+        n->addr = NULL;
+    }
+
+    if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
+        goto out;
+    }
+
+    /* Sanity check. */
+    if (area->size != page_size) {
+        ret = -1;
+        goto out;
+    }
+
+    addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                fd, area->offset);
+    if (addr == MAP_FAILED) {
+        ret = -1;
+        goto out;
+    }
+
+    name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
+                           user, queue_idx);
+    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
+                                      page_size, addr);
+    g_free(name);
+
+    if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
+        munmap(addr, page_size);
+        ret = -1;
+        goto out;
+    }
+
+    n->addr = addr;
+    n->set = true;
+
+out:
+    /* Always close the fd. */
+    if (fd != -1) {
+        close(fd);
+    }
+    return ret;
+}
+
 static void slave_read(void *opaque)
 {
     struct vhost_dev *dev = opaque;
@@ -913,6 +1023,10 @@  static void slave_read(void *opaque)
     case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
         ret = vhost_user_slave_handle_config_change(dev);
         break;
+    case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
+        ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
+                                                          fd);
+        break;
     default:
         error_report("Received unexpected msg type.");
         if (fd != -1) {
@@ -1641,6 +1755,15 @@  VhostUserState *vhost_user_init(void)
 
 void vhost_user_cleanup(VhostUserState *user)
 {
+    int i;
+
+    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        if (user->notifier[i].addr) {
+            object_unparent(OBJECT(&user->notifier[i].mr));
+            munmap(user->notifier[i].addr, qemu_real_host_page_size);
+            user->notifier[i].addr = NULL;
+        }
+    }
 }
 
 const VhostOps user_ops = {
diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
index eb8bc0d90d..fd660393a0 100644
--- a/include/hw/virtio/vhost-user.h
+++ b/include/hw/virtio/vhost-user.h
@@ -9,9 +9,17 @@ 
 #define HW_VIRTIO_VHOST_USER_H
 
 #include "chardev/char-fe.h"
+#include "hw/virtio/virtio.h"
+
+typedef struct VhostUserHostNotifier {
+    MemoryRegion mr;
+    void *addr;
+    bool set;
+} VhostUserHostNotifier;
 
 typedef struct VhostUserState {
     CharBackend *chr;
+    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostUserState;
 
 VhostUserState *vhost_user_init(void);