
[RFC,v3,16/19] vfio-user: dma map/unmap operations

Message ID 9317e19ef1b2b73864be268b6715fcf53a0704a4.1636057885.git.john.g.johnson@oracle.com (mailing list archive)
State New, archived
Series vfio-user client

Commit Message

John Johnson Nov. 9, 2021, 12:46 a.m. UTC
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
---
 hw/vfio/pci.h                 |   1 +
 hw/vfio/user-protocol.h       |  32 +++++++
 hw/vfio/user.h                |   1 +
 include/hw/vfio/vfio-common.h |   4 +
 hw/vfio/common.c              |  76 +++++++++++++---
 hw/vfio/pci.c                 |   4 +
 hw/vfio/user.c                | 206 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 309 insertions(+), 15 deletions(-)

Comments

Alex Williamson Nov. 19, 2021, 10:42 p.m. UTC | #1
On Mon,  8 Nov 2021 16:46:44 -0800
John Johnson <john.g.johnson@oracle.com> wrote:

> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> ---
>  hw/vfio/pci.h                 |   1 +
>  hw/vfio/user-protocol.h       |  32 +++++++
>  hw/vfio/user.h                |   1 +
>  include/hw/vfio/vfio-common.h |   4 +
>  hw/vfio/common.c              |  76 +++++++++++++---
>  hw/vfio/pci.c                 |   4 +
>  hw/vfio/user.c                | 206 ++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 309 insertions(+), 15 deletions(-)
> 
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 643ff75..156fee2 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -193,6 +193,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI)
>  struct VFIOUserPCIDevice {
>      VFIOPCIDevice device;
>      char *sock_name;
> +    bool secure_dma;    /* disable shared mem for DMA */

????????????  It's there, it's gone, it's back.

>      bool send_queued;   /* all sends are queued */
>      bool no_post;       /* all regions write are sync */
>  };
> diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
> index 5614efa..ca53fce 100644
> --- a/hw/vfio/user-protocol.h
> +++ b/hw/vfio/user-protocol.h
> @@ -83,6 +83,31 @@ typedef struct {
>  
>  
>  /*
> + * VFIO_USER_DMA_MAP
> + * imported from struct vfio_iommu_type1_dma_map
> + */
> +typedef struct {
> +    VFIOUserHdr hdr;
> +    uint32_t argsz;
> +    uint32_t flags;
> +    uint64_t offset;    /* FD offset */
> +    uint64_t iova;
> +    uint64_t size;
> +} VFIOUserDMAMap;
> +
> +/*
> + * VFIO_USER_DMA_UNMAP
> + * imported from struct vfio_iommu_type1_dma_unmap
> + */
> +typedef struct {
> +    VFIOUserHdr hdr;
> +    uint32_t argsz;
> +    uint32_t flags;
> +    uint64_t iova;
> +    uint64_t size;
> +} VFIOUserDMAUnmap;
> +
> +/*
>   * VFIO_USER_DEVICE_GET_INFO
>   * imported from struct_device_info
>   */
> @@ -146,4 +171,11 @@ typedef struct {
>      char data[];
>  } VFIOUserRegionRW;
>  
> +/*imported from struct vfio_bitmap */
> +typedef struct {
> +    uint64_t pgsize;
> +    uint64_t size;
> +    char data[];
> +} VFIOUserBitmap;
> +
>  #endif /* VFIO_USER_PROTOCOL_H */
> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
> index 8d03e7c..997f748 100644
> --- a/hw/vfio/user.h
> +++ b/hw/vfio/user.h
> @@ -74,6 +74,7 @@ typedef struct VFIOProxy {
>  
>  /* VFIOProxy flags */
>  #define VFIO_PROXY_CLIENT        0x1
> +#define VFIO_PROXY_SECURE        0x2
>  #define VFIO_PROXY_FORCE_QUEUED  0x4
>  #define VFIO_PROXY_NO_POST       0x8
>  
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index c0e7632..dcfae2c 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -90,6 +90,8 @@ typedef struct VFIOContainer {
>      VFIOContIO *io_ops;
>      bool initialized;
>      bool dirty_pages_supported;
> +    bool will_commit;

The entire will_commit concept hidden in the map and unmap operations
from many patches ago should be introduced here, or later.

> +    bool need_map_fd;
>      uint64_t dirty_pgsizes;
>      uint64_t max_dirty_bitmap_size;
>      unsigned long pgsizes;
> @@ -210,6 +212,7 @@ struct VFIOContIO {
>      int (*dirty_bitmap)(VFIOContainer *container,
>                          struct vfio_iommu_type1_dirty_bitmap *bitmap,
>                          struct vfio_iommu_type1_dirty_bitmap_get *range);
> +    void (*wait_commit)(VFIOContainer *container);
>  };
>  
>  #define CONT_DMA_MAP(cont, map, fd, will_commit) \
> @@ -218,6 +221,7 @@ struct VFIOContIO {
>      ((cont)->io_ops->dma_unmap((cont), (unmap), (bitmap), (will_commit)))
>  #define CONT_DIRTY_BITMAP(cont, bitmap, range) \
>      ((cont)->io_ops->dirty_bitmap((cont), (bitmap), (range)))
> +#define CONT_WAIT_COMMIT(cont) ((cont)->io_ops->wait_commit(cont))
>  
>  extern VFIODevIO vfio_dev_io_ioctl;
>  extern VFIOContIO vfio_cont_io_ioctl;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index fdd2702..0840c8f 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -411,6 +411,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>      struct vfio_iommu_type1_dma_unmap *unmap;
>      struct vfio_bitmap *bitmap;
>      uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
> +    bool will_commit = container->will_commit;
>      int ret;
>  
>      unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
> @@ -444,7 +445,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>          goto unmap_exit;
>      }
>  
> -    ret = CONT_DMA_UNMAP(container, unmap, bitmap, false);
> +    ret = CONT_DMA_UNMAP(container, unmap, bitmap, will_commit);
>      if (!ret) {
>          cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
>                  iotlb->translated_addr, pages);
> @@ -471,16 +472,17 @@ static int vfio_dma_unmap(VFIOContainer *container,
>          .iova = iova,
>          .size = size,
>      };
> +    bool will_commit = container->will_commit;
>  
>      if (iotlb && container->dirty_pages_supported &&
>          vfio_devices_all_running_and_saving(container)) {
>          return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
>      }
>  
> -    return CONT_DMA_UNMAP(container, &unmap, NULL, false);
> +    return CONT_DMA_UNMAP(container, &unmap, NULL, will_commit);

We're passing the container, why do we need a separate will_commit arg
for these?
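E.g. since the callbacks already receive the container, they could read the
flag from it themselves.  A rough sketch of that shape (hypothetical, not
code from this series):

    #define CONT_DMA_UNMAP(cont, unmap, bitmap) \
        ((cont)->io_ops->dma_unmap((cont), (unmap), (bitmap)))

    static int vfio_user_io_dma_unmap(VFIOContainer *container,
                                      struct vfio_iommu_type1_dma_unmap *unmap,
                                      struct vfio_bitmap *bitmap)
    {
        /* derive the async behaviour from the container, not an argument */
        bool async = container->will_commit;

        return vfio_user_dma_unmap(container->proxy, unmap, bitmap, async);
    }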


>  }
>  
> -static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> +static int vfio_dma_map(VFIOContainer *container, MemoryRegion *mr, hwaddr iova,
>                          ram_addr_t size, void *vaddr, bool readonly)
>  {
>      struct vfio_iommu_type1_dma_map map = {
> @@ -490,13 +492,23 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>          .iova = iova,
>          .size = size,
>      };
> -    int ret;
> +    int fd, ret;
> +    bool will_commit = container->will_commit;
>  
>      if (!readonly) {
>          map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
>      }
>  
> -    ret = CONT_DMA_MAP(container, &map, -1, false);
> +    if (container->need_map_fd) {
> +        fd = memory_region_get_fd(mr);
> +        if (fd != -1) {
> +            map.vaddr = qemu_ram_block_host_offset(mr->ram_block, vaddr);
> +        }
> +    } else {
> +        fd = -1;
> +    }
> +
> +    ret = CONT_DMA_MAP(container, &map, fd, will_commit);

Why were we even passing a -1 fd previously?  Would it make more sense
to pass the mr and put this in the user variant .map_dma?  We're going
to the trouble to pass the mr down this far here.  If the map callback
handled the above fd and map.vaddr we could also avoid the
need_map_fd flag on the container.
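Concretely, the vfio-user .dma_map callback could take the mr and derive the
fd and offset itself, roughly like this (a sketch only; the reworked
signature is an assumption, not code from this series):

    static int vfio_user_io_dma_map(VFIOContainer *container, MemoryRegion *mr,
                                    struct vfio_iommu_type1_dma_map *map,
                                    bool will_commit)
    {
        int fd = -1;

        if (!(container->proxy->flags & VFIO_PROXY_SECURE)) {
            fd = memory_region_get_fd(mr);
        }
        if (fd != -1) {
            /* send the region fd plus an offset into it, not a host vaddr */
            map->vaddr = qemu_ram_block_host_offset(mr->ram_block,
                                                    (void *)(uintptr_t)map->vaddr);
        } else {
            map->vaddr = 0;
        }
        return vfio_user_dma_map(container->proxy, map, fd, will_commit);
    }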

>  
>      if (ret < 0) {
>          error_report("VFIO_MAP_DMA failed: %s", strerror(-ret));
> @@ -557,7 +569,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>  
>  /* Called with rcu_read_lock held.  */
>  static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                               ram_addr_t *ram_addr, bool *read_only)
> +                               ram_addr_t *ram_addr, bool *read_only,
> +                               MemoryRegion **mrp)
>  {
>      MemoryRegion *mr;
>      hwaddr xlat;
> @@ -638,6 +651,10 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
>          *read_only = !writable || mr->readonly;
>      }
>  
> +    if (mrp != NULL) {
> +        *mrp = mr;
> +    }
> +
>      return true;
>  }
>  
> @@ -645,6 +662,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>  {
>      VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>      VFIOContainer *container = giommu->container;
> +    MemoryRegion *mr;
>      hwaddr iova = iotlb->iova + giommu->iommu_offset;
>      void *vaddr;
>      int ret;
> @@ -663,7 +681,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
>          bool read_only;
>  
> -        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mr)) {
>              goto out;
>          }
>          /*
> @@ -673,14 +691,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>           * of vaddr will always be there, even if the memory object is
>           * destroyed and its backing memory munmap-ed.
>           */
> -        ret = vfio_dma_map(container, iova,
> +        ret = vfio_dma_map(container, mr, iova,
>                             iotlb->addr_mask + 1, vaddr,
>                             read_only);
>          if (ret) {
>              error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> -                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
> +                         "0x%"HWADDR_PRIx", %p)",
>                           container, iova,
> -                         iotlb->addr_mask + 1, vaddr, ret);
> +                         iotlb->addr_mask + 1, vaddr);
>          }
>      } else {
>          ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
> @@ -735,7 +753,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
>                 section->offset_within_address_space;
>          vaddr = memory_region_get_ram_ptr(section->mr) + start;
>  
> -        ret = vfio_dma_map(vrdl->container, iova, next - start,
> +        ret = vfio_dma_map(vrdl->container, section->mr, iova, next - start,
>                             vaddr, section->readonly);
>          if (ret) {
>              /* Rollback */
> @@ -843,6 +861,23 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
>      g_free(vrdl);
>  }
>  
> +static void vfio_listener_begin(MemoryListener *listener)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
> +
> +    /* cannot drop BQL during the transaction, send maps/demaps async */
> +    container->will_commit = true;
> +}
> +
> +static void vfio_listener_commit(MemoryListener *listener)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
> +
> +    /* wait for any async requests sent during the transaction */
> +    CONT_WAIT_COMMIT(container);
> +    container->will_commit = false;
> +}

Not sure I follow the semantics.  When would the map/unmap callbacks get
called when will_commit is false?

Does it really make sense to have macros for ops that we call in one
place?

> +
>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -1035,12 +1070,12 @@ static void vfio_listener_region_add(MemoryListener *listener,
>          }
>      }
>  
> -    ret = vfio_dma_map(container, iova, int128_get64(llsize),
> +    ret = vfio_dma_map(container, section->mr, iova, int128_get64(llsize),
>                         vaddr, section->readonly);
>      if (ret) {
>          error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> -                   "0x%"HWADDR_PRIx", %p) = %d (%m)",
> -                   container, iova, int128_get64(llsize), vaddr, ret);
> +                   "0x%"HWADDR_PRIx", %p)",
> +                   container, iova, int128_get64(llsize), vaddr);
>          if (memory_region_is_ram_device(section->mr)) {
>              /* Allow unexpected mappings not to be fatal for RAM devices */
>              error_report_err(err);
> @@ -1301,7 +1336,7 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      }
>  
>      rcu_read_lock();
> -    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
> +    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, NULL)) {
>          int ret;
>  
>          ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
> @@ -1418,6 +1453,8 @@ static void vfio_listener_log_sync(MemoryListener *listener,
>  }
>  
>  static const MemoryListener vfio_memory_listener = {
> +    .begin = vfio_listener_begin,
> +    .commit = vfio_listener_commit,
>      .region_add = vfio_listener_region_add,
>      .region_del = vfio_listener_region_del,
>      .log_global_start = vfio_listener_log_global_start,
> @@ -1561,6 +1598,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>      region->size = info->size;
>      region->fd_offset = info->offset;
>      region->nr = index;
> +    region->post_wr = false;

Should this be in a different patch?  It looks unrelated.

>      region->remfd = vfio_get_region_info_remfd(vbasedev, index);
>  
>      if (region->size) {
> @@ -2047,6 +2085,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
>      container->dirty_pages_supported = false;
>      container->dma_max_mappings = 0;
>      container->io_ops = &vfio_cont_io_ioctl;
> +    container->need_map_fd = false;
>      QLIST_INIT(&container->giommu_list);
>      QLIST_INIT(&container->hostwin_list);
>      QLIST_INIT(&container->vrdl_list);
> @@ -2230,6 +2269,7 @@ void vfio_connect_proxy(VFIOProxy *proxy, VFIOGroup *group, AddressSpace *as)
>      container->space = space;
>      container->fd = -1;
>      container->io_ops = &vfio_cont_io_sock;
> +    container->need_map_fd = (proxy->flags & VFIO_PROXY_SECURE) == 0;
>      QLIST_INIT(&container->giommu_list);
>      QLIST_INIT(&container->hostwin_list);
>      container->proxy = proxy;
> @@ -2879,8 +2919,14 @@ static int vfio_io_dirty_bitmap(VFIOContainer *container,
>      return ret;
>  }
>  
> +static void vfio_io_wait_commit(VFIOContainer *container)
> +{
> +    /* ioctl()s are synchronous */
> +}
> +

Maybe these should just be "dma_commit" rather than "wait_commit"?  I'd
also tend to suggest "async" rather than "will_commit".

>  VFIOContIO vfio_cont_io_ioctl = {
>      .dma_map = vfio_io_dma_map,
>      .dma_unmap = vfio_io_dma_unmap,
>      .dirty_bitmap = vfio_io_dirty_bitmap,
> +    .wait_commit = vfio_io_wait_commit,
>  };
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index d657b01..ca821da 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3516,6 +3516,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
>      vbasedev->proxy = proxy;
>      vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev);
>  
> +    if (udev->secure_dma) {
> +        proxy->flags |= VFIO_PROXY_SECURE;
> +    }
>      if (udev->send_queued) {
>          proxy->flags |= VFIO_PROXY_FORCE_QUEUED;
>      }
> @@ -3686,6 +3689,7 @@ static void vfio_user_instance_finalize(Object *obj)
>  
>  static Property vfio_user_pci_dev_properties[] = {
>      DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name),
> +    DEFINE_PROP_BOOL("secure-dma", VFIOUserPCIDevice, secure_dma, false),

"secure_dma" looks entirely compartmentalized that it could be a
separate patch.  Thanks,

Alex

>      DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
>      DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
>      DEFINE_PROP_END_OF_LIST(),
> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
> index 70fe7a6..cee08b6 100644
> --- a/hw/vfio/user.c
> +++ b/hw/vfio/user.c
> @@ -52,8 +52,11 @@ static void vfio_user_request(void *opaque);
>  static int vfio_user_send_queued(VFIOProxy *proxy, VFIOUserMsg *msg);
>  static void vfio_user_send_async(VFIOProxy *proxy, VFIOUserHdr *hdr,
>                                   VFIOUserFDs *fds);
> +static void vfio_user_send_nowait(VFIOProxy *proxy, VFIOUserHdr *hdr,
> +                                  VFIOUserFDs *fds, int rsize);
>  static void vfio_user_send_wait(VFIOProxy *proxy, VFIOUserHdr *hdr,
>                                  VFIOUserFDs *fds, int rsize, bool nobql);
> +static void vfio_user_wait_reqs(VFIOProxy *proxy);
>  static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
>                                    uint32_t size, uint32_t flags);
>  
> @@ -562,6 +565,36 @@ static void vfio_user_send_async(VFIOProxy *proxy, VFIOUserHdr *hdr,
>      }
>  }
>  
> +/*
> + * nowait send - vfio_wait_reqs() can wait for it later
> + */
> +static void vfio_user_send_nowait(VFIOProxy *proxy, VFIOUserHdr *hdr,
> +                                  VFIOUserFDs *fds, int rsize)
> +{
> +    VFIOUserMsg *msg;
> +    int ret;
> +
> +    if (hdr->flags & VFIO_USER_NO_REPLY) {
> +        error_printf("vfio_user_send_nowait on async message\n");
> +        return;
> +    }
> +
> +    QEMU_LOCK_GUARD(&proxy->lock);
> +
> +    msg = vfio_user_getmsg(proxy, hdr, fds);
> +    msg->id = hdr->id;
> +    msg->rsize = rsize ? rsize : hdr->size;
> +    msg->type = VFIO_MSG_NOWAIT;
> +
> +    ret = vfio_user_send_queued(proxy, msg);
> +    if (ret < 0) {
> +        vfio_user_recycle(proxy, msg);
> +        return;
> +    }
> +
> +    proxy->last_nowait = msg;
> +}
> +
>  static void vfio_user_send_wait(VFIOProxy *proxy, VFIOUserHdr *hdr,
>                                  VFIOUserFDs *fds, int rsize, bool nobql)
>  {
> @@ -610,6 +643,56 @@ static void vfio_user_send_wait(VFIOProxy *proxy, VFIOUserHdr *hdr,
>      }
>  }
>  
> +static void vfio_user_wait_reqs(VFIOProxy *proxy)
> +{
> +    VFIOUserMsg *msg;
> +    bool iolock = false;
> +
> +    /*
> +     * Any DMA map/unmap requests sent in the middle
> +     * of a memory region transaction were sent nowait.
> +     * Wait for them here.
> +     */
> +    qemu_mutex_lock(&proxy->lock);
> +    if (proxy->last_nowait != NULL) {
> +        iolock = qemu_mutex_iothread_locked();
> +        if (iolock) {
> +            qemu_mutex_unlock_iothread();
> +        }
> +
> +        /*
> +         * Change type to WAIT to wait for reply
> +         */
> +        msg = proxy->last_nowait;
> +        msg->type = VFIO_MSG_WAIT;
> +        while (!msg->complete) {
> +            if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
> +                error_printf("vfio_wait_reqs - timed out\n");
> +                break;
> +            }
> +        }
> +
> +        if (msg->hdr->flags & VFIO_USER_ERROR) {
> +            error_printf("vfio_user_wait_reqs - error reply on async request ");
> +            error_printf("command %x error %s\n", msg->hdr->command,
> +                         strerror(msg->hdr->error_reply));
> +        }
> +
> +        proxy->last_nowait = NULL;
> +        /*
> +         * Change type back to NOWAIT to free
> +         */
> +        msg->type = VFIO_MSG_NOWAIT;
> +        vfio_user_recycle(proxy, msg);
> +    }
> +
> +    /* lock order is BQL->proxy - don't hold proxy when getting BQL */
> +    qemu_mutex_unlock(&proxy->lock);
> +    if (iolock) {
> +        qemu_mutex_lock_iothread();
> +    }
> +}
> +
>  static QLIST_HEAD(, VFIOProxy) vfio_user_sockets =
>      QLIST_HEAD_INITIALIZER(vfio_user_sockets);
>  
> @@ -935,6 +1018,102 @@ int vfio_user_validate_version(VFIODevice *vbasedev, Error **errp)
>      return 0;
>  }
>  
> +static int vfio_user_dma_map(VFIOProxy *proxy,
> +                             struct vfio_iommu_type1_dma_map *map,
> +                             int fd, bool will_commit)
> +{
> +    VFIOUserFDs *fds = NULL;
> +    VFIOUserDMAMap *msgp = g_malloc0(sizeof(*msgp));
> +    int ret;
> +
> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_DMA_MAP, sizeof(*msgp), 0);
> +    msgp->argsz = map->argsz;
> +    msgp->flags = map->flags;
> +    msgp->offset = map->vaddr;
> +    msgp->iova = map->iova;
> +    msgp->size = map->size;
> +
> +    /*
> +     * The will_commit case sends without blocking or dropping BQL.
> +     * They're later waited for in vfio_send_wait_reqs.
> +     */
> +    if (will_commit) {
> +        /* can't use auto variable since we don't block */
> +        if (fd != -1) {
> +            fds = vfio_user_getfds(1);
> +            fds->send_fds = 1;
> +            fds->fds[0] = fd;
> +        }
> +        vfio_user_send_nowait(proxy, &msgp->hdr, fds, 0);
> +        ret = 0;
> +    } else {
> +        VFIOUserFDs local_fds = { 1, 0, &fd };
> +
> +        fds = fd != -1 ? &local_fds : NULL;
> +        vfio_user_send_wait(proxy, &msgp->hdr, fds, 0, will_commit);
> +        ret = (msgp->hdr.flags & VFIO_USER_ERROR) ? -msgp->hdr.error_reply : 0;
> +        g_free(msgp);
> +    }
> +
> +    return ret;
> +}
> +
> +static int vfio_user_dma_unmap(VFIOProxy *proxy,
> +                               struct vfio_iommu_type1_dma_unmap *unmap,
> +                               struct vfio_bitmap *bitmap, bool will_commit)
> +{
> +    struct {
> +        VFIOUserDMAUnmap msg;
> +        VFIOUserBitmap bitmap;
> +    } *msgp = NULL;
> +    int msize, rsize;
> +    bool blocking = !will_commit;
> +
> +    if (bitmap == NULL &&
> +        (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)) {
> +        error_printf("vfio_user_dma_unmap mismatched flags and bitmap\n");
> +        return -EINVAL;
> +    }
> +
> +    /*
> +     * If a dirty bitmap is returned, allocate extra space for it
> +     * and block for reply even in the will_commit case.
> +     * Otherwise, can send the unmap request without waiting.
> +     */
> +    if (bitmap != NULL) {
> +        blocking = true;
> +        msize = sizeof(*msgp);
> +        rsize = msize + bitmap->size;
> +        msgp = g_malloc0(rsize);
> +        msgp->bitmap.pgsize = bitmap->pgsize;
> +        msgp->bitmap.size = bitmap->size;
> +    } else {
> +        msize = rsize = sizeof(VFIOUserDMAUnmap);
> +        msgp = g_malloc0(rsize);
> +    }
> +
> +    vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DMA_UNMAP, msize, 0);
> +    msgp->msg.argsz = unmap->argsz;
> +    msgp->msg.flags = unmap->flags;
> +    msgp->msg.iova = unmap->iova;
> +    msgp->msg.size = unmap->size;
> +
> +    if (blocking) {
> +        vfio_user_send_wait(proxy, &msgp->msg.hdr, NULL, rsize, will_commit);
> +        if (msgp->msg.hdr.flags & VFIO_USER_ERROR) {
> +            return -msgp->msg.hdr.error_reply;
> +        }
> +        if (bitmap != NULL) {
> +            memcpy(bitmap->data, &msgp->bitmap.data, bitmap->size);
> +        }
> +        g_free(msgp);
> +    } else {
> +        vfio_user_send_nowait(proxy, &msgp->msg.hdr, NULL, rsize);
> +    }
> +
> +    return 0;
> +}
> +
>  static int vfio_user_get_info(VFIOProxy *proxy, struct vfio_device_info *info)
>  {
>      VFIOUserDeviceInfo msg;
> @@ -1225,5 +1404,32 @@ VFIODevIO vfio_dev_io_sock = {
>  };
>  
>  
> +static int vfio_user_io_dma_map(VFIOContainer *container,
> +                                struct vfio_iommu_type1_dma_map *map,
> +                                int fd, bool will_commit)
> +{
> +    if (fd != -1) {
> +        return vfio_user_dma_map(container->proxy, map, fd, will_commit);
> +    } else {
> +        map->vaddr = 0;
> +        return vfio_user_dma_map(container->proxy, map, -1, will_commit);
> +    }
> +}
> +
> +static int vfio_user_io_dma_unmap(VFIOContainer *container,
> +                                  struct vfio_iommu_type1_dma_unmap *unmap,
> +                                  struct vfio_bitmap *bitmap, bool will_commit)
> +{
> +    return vfio_user_dma_unmap(container->proxy, unmap, bitmap, will_commit);
> +}
> +
> +static void vfio_user_io_wait_commit(VFIOContainer *container)
> +{
> +    vfio_user_wait_reqs(container->proxy);
> +}
> +
>  VFIOContIO vfio_cont_io_sock = {
> +    .dma_map = vfio_user_io_dma_map,
> +    .dma_unmap = vfio_user_io_dma_unmap,
> +    .wait_commit = vfio_user_io_wait_commit,
>  };

John Johnson Dec. 7, 2021, 7:50 a.m. UTC | #2
> On Nov 19, 2021, at 2:42 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Mon,  8 Nov 2021 16:46:44 -0800
> John Johnson <john.g.johnson@oracle.com> wrote:
> 
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> ---
>> hw/vfio/pci.h                 |   1 +
>> hw/vfio/user-protocol.h       |  32 +++++++
>> hw/vfio/user.h                |   1 +
>> include/hw/vfio/vfio-common.h |   4 +
>> hw/vfio/common.c              |  76 +++++++++++++---
>> hw/vfio/pci.c                 |   4 +
>> hw/vfio/user.c                | 206 ++++++++++++++++++++++++++++++++++++++++++
>> 7 files changed, 309 insertions(+), 15 deletions(-)
>> 
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 643ff75..156fee2 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -193,6 +193,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI)
>> struct VFIOUserPCIDevice {
>>     VFIOPCIDevice device;
>>     char *sock_name;
>> +    bool secure_dma;    /* disable shared mem for DMA */
> 
> ????????????  It's there, it's gone, it's back.
> 

	This was a merge mistake



>> 
>> 
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index c0e7632..dcfae2c 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -90,6 +90,8 @@ typedef struct VFIOContainer {
>>     VFIOContIO *io_ops;
>>     bool initialized;
>>     bool dirty_pages_supported;
>> +    bool will_commit;
> 
> The entire will_commit concept hidden in the map and unmap operations
> from many patches ago should be introduced here, or later.
> 

	ok


>> +    bool need_map_fd;
>>     uint64_t dirty_pgsizes;
>>     uint64_t max_dirty_bitmap_size;
>>     unsigned long pgsizes;
>> @@ -210,6 +212,7 @@ struct VFIOContIO {
>>     int (*dirty_bitmap)(VFIOContainer *container,
>>                         struct vfio_iommu_type1_dirty_bitmap *bitmap,
>>                         struct vfio_iommu_type1_dirty_bitmap_get *range);
>> +    void (*wait_commit)(VFIOContainer *container);
>> };
>> 
>> #define CONT_DMA_MAP(cont, map, fd, will_commit) \
>> @@ -218,6 +221,7 @@ struct VFIOContIO {
>>     ((cont)->io_ops->dma_unmap((cont), (unmap), (bitmap), (will_commit)))
>> #define CONT_DIRTY_BITMAP(cont, bitmap, range) \
>>     ((cont)->io_ops->dirty_bitmap((cont), (bitmap), (range)))
>> +#define CONT_WAIT_COMMIT(cont) ((cont)->io_ops->wait_commit(cont))
>> 
>> extern VFIODevIO vfio_dev_io_ioctl;
>> extern VFIOContIO vfio_cont_io_ioctl;
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index fdd2702..0840c8f 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -411,6 +411,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>>     struct vfio_iommu_type1_dma_unmap *unmap;
>>     struct vfio_bitmap *bitmap;
>>     uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
>> +    bool will_commit = container->will_commit;
>>     int ret;
>> 
>>     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
>> @@ -444,7 +445,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>>         goto unmap_exit;
>>     }
>> 
>> -    ret = CONT_DMA_UNMAP(container, unmap, bitmap, false);
>> +    ret = CONT_DMA_UNMAP(container, unmap, bitmap, will_commit);
>>     if (!ret) {
>>         cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
>>                 iotlb->translated_addr, pages);
>> @@ -471,16 +472,17 @@ static int vfio_dma_unmap(VFIOContainer *container,
>>         .iova = iova,
>>         .size = size,
>>     };
>> +    bool will_commit = container->will_commit;
>> 
>>     if (iotlb && container->dirty_pages_supported &&
>>         vfio_devices_all_running_and_saving(container)) {
>>         return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
>>     }
>> 
>> -    return CONT_DMA_UNMAP(container, &unmap, NULL, false);
>> +    return CONT_DMA_UNMAP(container, &unmap, NULL, will_commit);
> 
> We're passing the container, why do we need a separate will_commit arg
> for these?
> 

	ok

> 
>> }
>> 
>> -static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>> +static int vfio_dma_map(VFIOContainer *container, MemoryRegion *mr, hwaddr iova,
>>                         ram_addr_t size, void *vaddr, bool readonly)
>> {
>>     struct vfio_iommu_type1_dma_map map = {
>> @@ -490,13 +492,23 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>         .iova = iova,
>>         .size = size,
>>     };
>> -    int ret;
>> +    int fd, ret;
>> +    bool will_commit = container->will_commit;
>> 
>>     if (!readonly) {
>>         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
>>     }
>> 
>> -    ret = CONT_DMA_MAP(container, &map, -1, false);
>> +    if (container->need_map_fd) {
>> +        fd = memory_region_get_fd(mr);
>> +        if (fd != -1) {
>> +            map.vaddr = qemu_ram_block_host_offset(mr->ram_block, vaddr);
>> +        }
>> +    } else {
>> +        fd = -1;
>> +    }
>> +
>> +    ret = CONT_DMA_MAP(container, &map, fd, will_commit);
> 
> Why were we even passing a -1 fd previously?  Would it make more sense
> to pass the mr and put this in the user variant .map_dma?  We're going
> to the trouble to pass the mr down this far here.  If the map callback
> handled the above fd and map.vaddr we could also avoid the
> need_map_fd flag on the container.
> 

	ok

>> 
>> 
>> +static void vfio_listener_begin(MemoryListener *listener)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>> +
>> +    /* cannot drop BQL during the transaction, send maps/demaps async */
>> +    container->will_commit = true;
>> +}
>> +
>> +static void vfio_listener_commit(MemoryListener *listener)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>> +
>> +    /* wait for any async requests sent during the transaction */
>> +    CONT_WAIT_COMMIT(container);
>> +    container->will_commit = false;
>> +}
> 
> Not sure I follow the semantics.  When would the map/unmap callbacks get
> called when will_commit is false?
> 

	If map/unmap is called outside a transaction, it can drop BQL and
wait.  For unmap/map inside a transaction, they're sent async, and waited
for after the commit (when it’s safe to drop BQL while we wait).
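	Roughly, the two paths look like this (a call-flow annotation of the
patch above, not new code):

  inside a transaction (BQL held throughout):
    vfio_listener_begin()          container->will_commit = true
    region_add / region_del
      -> vfio_dma_map() / vfio_dma_unmap()
      -> vfio_user_send_nowait()   queued, reply not waited for yet
    vfio_listener_commit()
      -> CONT_WAIT_COMMIT() -> vfio_user_wait_reqs()
         drops BQL, waits for the last outstanding reply
         container->will_commit = false

  outside a transaction:
    vfio_dma_map() / vfio_dma_unmap()
      -> vfio_user_send_wait()     drops BQL itself while blocking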


> Does it really make sense to have macros for ops that we call in one
> place?
> 

	I did it this way so all container ops calls are done the same way.


>> static const MemoryListener vfio_memory_listener = {
>> +    .begin = vfio_listener_begin,
>> +    .commit = vfio_listener_commit,
>>     .region_add = vfio_listener_region_add,
>>     .region_del = vfio_listener_region_del,
>>     .log_global_start = vfio_listener_log_global_start,
>> @@ -1561,6 +1598,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>>     region->size = info->size;
>>     region->fd_offset = info->offset;
>>     region->nr = index;
>> +    region->post_wr = false;
> 
> Should this be in a different patch?  It looks unrelated.
> 

	I think you said as much in the patch that introduced the region write
ops call.


>>     region->remfd = vfio_get_region_info_remfd(vbasedev, index);
>> 
>>     if (region->size) {
>> @@ -2047,6 +2085,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
>>     container->dirty_pages_supported = false;
>>     container->dma_max_mappings = 0;
>>     container->io_ops = &vfio_cont_io_ioctl;
>> +    container->need_map_fd = false;
>>     QLIST_INIT(&container->giommu_list);
>>     QLIST_INIT(&container->hostwin_list);
>>     QLIST_INIT(&container->vrdl_list);
>> @@ -2230,6 +2269,7 @@ void vfio_connect_proxy(VFIOProxy *proxy, VFIOGroup *group, AddressSpace *as)
>>     container->space = space;
>>     container->fd = -1;
>>     container->io_ops = &vfio_cont_io_sock;
>> +    container->need_map_fd = (proxy->flags & VFIO_PROXY_SECURE) == 0;
>>     QLIST_INIT(&container->giommu_list);
>>     QLIST_INIT(&container->hostwin_list);
>>     container->proxy = proxy;
>> @@ -2879,8 +2919,14 @@ static int vfio_io_dirty_bitmap(VFIOContainer *container,
>>     return ret;
>> }
>> 
>> +static void vfio_io_wait_commit(VFIOContainer *container)
>> +{
>> +    /* ioctl()s are synchronous */
>> +}
>> +
> 
> Maybe these should just be "dma_commit" rather than "wait_commit"?  I'd
> also tend to suggest "async" rather than "will_commit".
> 

	ok


>> VFIOContIO vfio_cont_io_ioctl = {
>>     .dma_map = vfio_io_dma_map,
>>     .dma_unmap = vfio_io_dma_unmap,
>>     .dirty_bitmap = vfio_io_dirty_bitmap,
>> +    .wait_commit = vfio_io_wait_commit,
>> };
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index d657b01..ca821da 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3516,6 +3516,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
>>     vbasedev->proxy = proxy;
>>     vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev);
>> 
>> +    if (udev->secure_dma) {
>> +        proxy->flags |= VFIO_PROXY_SECURE;
>> +    }
>>     if (udev->send_queued) {
>>         proxy->flags |= VFIO_PROXY_FORCE_QUEUED;
>>     }
>> @@ -3686,6 +3689,7 @@ static void vfio_user_instance_finalize(Object *obj)
>> 
>> static Property vfio_user_pci_dev_properties[] = {
>>     DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name),
>> +    DEFINE_PROP_BOOL("secure-dma", VFIOUserPCIDevice, secure_dma, false),
> 
> "secure_dma" looks entirely compartmentalized that it could be a
> separate patch.  Thanks,
> 

	ok

		JJ

Patch

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 643ff75..156fee2 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -193,6 +193,7 @@  OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI)
 struct VFIOUserPCIDevice {
     VFIOPCIDevice device;
     char *sock_name;
+    bool secure_dma;    /* disable shared mem for DMA */
     bool send_queued;   /* all sends are queued */
     bool no_post;       /* all regions write are sync */
 };
diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
index 5614efa..ca53fce 100644
--- a/hw/vfio/user-protocol.h
+++ b/hw/vfio/user-protocol.h
@@ -83,6 +83,31 @@  typedef struct {
 
 
 /*
+ * VFIO_USER_DMA_MAP
+ * imported from struct vfio_iommu_type1_dma_map
+ */
+typedef struct {
+    VFIOUserHdr hdr;
+    uint32_t argsz;
+    uint32_t flags;
+    uint64_t offset;    /* FD offset */
+    uint64_t iova;
+    uint64_t size;
+} VFIOUserDMAMap;
+
+/*
+ * VFIO_USER_DMA_UNMAP
+ * imported from struct vfio_iommu_type1_dma_unmap
+ */
+typedef struct {
+    VFIOUserHdr hdr;
+    uint32_t argsz;
+    uint32_t flags;
+    uint64_t iova;
+    uint64_t size;
+} VFIOUserDMAUnmap;
+
+/*
  * VFIO_USER_DEVICE_GET_INFO
  * imported from struct_device_info
  */
@@ -146,4 +171,11 @@  typedef struct {
     char data[];
 } VFIOUserRegionRW;
 
+/*imported from struct vfio_bitmap */
+typedef struct {
+    uint64_t pgsize;
+    uint64_t size;
+    char data[];
+} VFIOUserBitmap;
+
 #endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio/user.h b/hw/vfio/user.h
index 8d03e7c..997f748 100644
--- a/hw/vfio/user.h
+++ b/hw/vfio/user.h
@@ -74,6 +74,7 @@  typedef struct VFIOProxy {
 
 /* VFIOProxy flags */
 #define VFIO_PROXY_CLIENT        0x1
+#define VFIO_PROXY_SECURE        0x2
 #define VFIO_PROXY_FORCE_QUEUED  0x4
 #define VFIO_PROXY_NO_POST       0x8
 
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c0e7632..dcfae2c 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -90,6 +90,8 @@  typedef struct VFIOContainer {
     VFIOContIO *io_ops;
     bool initialized;
     bool dirty_pages_supported;
+    bool will_commit;
+    bool need_map_fd;
     uint64_t dirty_pgsizes;
     uint64_t max_dirty_bitmap_size;
     unsigned long pgsizes;
@@ -210,6 +212,7 @@  struct VFIOContIO {
     int (*dirty_bitmap)(VFIOContainer *container,
                         struct vfio_iommu_type1_dirty_bitmap *bitmap,
                         struct vfio_iommu_type1_dirty_bitmap_get *range);
+    void (*wait_commit)(VFIOContainer *container);
 };
 
 #define CONT_DMA_MAP(cont, map, fd, will_commit) \
@@ -218,6 +221,7 @@  struct VFIOContIO {
     ((cont)->io_ops->dma_unmap((cont), (unmap), (bitmap), (will_commit)))
 #define CONT_DIRTY_BITMAP(cont, bitmap, range) \
     ((cont)->io_ops->dirty_bitmap((cont), (bitmap), (range)))
+#define CONT_WAIT_COMMIT(cont) ((cont)->io_ops->wait_commit(cont))
 
 extern VFIODevIO vfio_dev_io_ioctl;
 extern VFIOContIO vfio_cont_io_ioctl;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index fdd2702..0840c8f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -411,6 +411,7 @@  static int vfio_dma_unmap_bitmap(VFIOContainer *container,
     struct vfio_iommu_type1_dma_unmap *unmap;
     struct vfio_bitmap *bitmap;
     uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
+    bool will_commit = container->will_commit;
     int ret;
 
     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
@@ -444,7 +445,7 @@  static int vfio_dma_unmap_bitmap(VFIOContainer *container,
         goto unmap_exit;
     }
 
-    ret = CONT_DMA_UNMAP(container, unmap, bitmap, false);
+    ret = CONT_DMA_UNMAP(container, unmap, bitmap, will_commit);
     if (!ret) {
         cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
                 iotlb->translated_addr, pages);
@@ -471,16 +472,17 @@  static int vfio_dma_unmap(VFIOContainer *container,
         .iova = iova,
         .size = size,
     };
+    bool will_commit = container->will_commit;
 
     if (iotlb && container->dirty_pages_supported &&
         vfio_devices_all_running_and_saving(container)) {
         return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
     }
 
-    return CONT_DMA_UNMAP(container, &unmap, NULL, false);
+    return CONT_DMA_UNMAP(container, &unmap, NULL, will_commit);
 }
 
-static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
+static int vfio_dma_map(VFIOContainer *container, MemoryRegion *mr, hwaddr iova,
                         ram_addr_t size, void *vaddr, bool readonly)
 {
     struct vfio_iommu_type1_dma_map map = {
@@ -490,13 +492,23 @@  static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         .iova = iova,
         .size = size,
     };
-    int ret;
+    int fd, ret;
+    bool will_commit = container->will_commit;
 
     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
     }
 
-    ret = CONT_DMA_MAP(container, &map, -1, false);
+    if (container->need_map_fd) {
+        fd = memory_region_get_fd(mr);
+        if (fd != -1) {
+            map.vaddr = qemu_ram_block_host_offset(mr->ram_block, vaddr);
+        }
+    } else {
+        fd = -1;
+    }
+
+    ret = CONT_DMA_MAP(container, &map, fd, will_commit);
 
     if (ret < 0) {
         error_report("VFIO_MAP_DMA failed: %s", strerror(-ret));
@@ -557,7 +569,8 @@  static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 
 /* Called with rcu_read_lock held.  */
 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
-                               ram_addr_t *ram_addr, bool *read_only)
+                               ram_addr_t *ram_addr, bool *read_only,
+                               MemoryRegion **mrp)
 {
     MemoryRegion *mr;
     hwaddr xlat;
@@ -638,6 +651,10 @@  static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
         *read_only = !writable || mr->readonly;
     }
 
+    if (mrp != NULL) {
+        *mrp = mr;
+    }
+
     return true;
 }
 
@@ -645,6 +662,7 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 {
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
     VFIOContainer *container = giommu->container;
+    MemoryRegion *mr;
     hwaddr iova = iotlb->iova + giommu->iommu_offset;
     void *vaddr;
     int ret;
@@ -663,7 +681,7 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
         bool read_only;
 
-        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
+        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mr)) {
             goto out;
         }
         /*
@@ -673,14 +691,14 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
          * of vaddr will always be there, even if the memory object is
          * destroyed and its backing memory munmap-ed.
          */
-        ret = vfio_dma_map(container, iova,
+        ret = vfio_dma_map(container, mr, iova,
                            iotlb->addr_mask + 1, vaddr,
                            read_only);
         if (ret) {
             error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
+                         "0x%"HWADDR_PRIx", %p)",
                          container, iova,
-                         iotlb->addr_mask + 1, vaddr, ret);
+                         iotlb->addr_mask + 1, vaddr);
         }
     } else {
         ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
@@ -735,7 +753,7 @@  static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                section->offset_within_address_space;
         vaddr = memory_region_get_ram_ptr(section->mr) + start;
 
-        ret = vfio_dma_map(vrdl->container, iova, next - start,
+        ret = vfio_dma_map(vrdl->container, section->mr, iova, next - start,
                            vaddr, section->readonly);
         if (ret) {
             /* Rollback */
@@ -843,6 +861,23 @@  static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
     g_free(vrdl);
 }
 
+static void vfio_listener_begin(MemoryListener *listener)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+    /* cannot drop BQL during the transaction, send maps/demaps async */
+    container->will_commit = true;
+}
+
+static void vfio_listener_commit(MemoryListener *listener)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+    /* wait for any async requests sent during the transaction */
+    CONT_WAIT_COMMIT(container);
+    container->will_commit = false;
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -1035,12 +1070,12 @@  static void vfio_listener_region_add(MemoryListener *listener,
         }
     }
 
-    ret = vfio_dma_map(container, iova, int128_get64(llsize),
+    ret = vfio_dma_map(container, section->mr, iova, int128_get64(llsize),
                        vaddr, section->readonly);
     if (ret) {
         error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                   "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                   container, iova, int128_get64(llsize), vaddr, ret);
+                   "0x%"HWADDR_PRIx", %p)",
+                   container, iova, int128_get64(llsize), vaddr);
         if (memory_region_is_ram_device(section->mr)) {
             /* Allow unexpected mappings not to be fatal for RAM devices */
             error_report_err(err);
@@ -1301,7 +1336,7 @@  static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     }
 
     rcu_read_lock();
-    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
+    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, NULL)) {
         int ret;
 
         ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
@@ -1418,6 +1453,8 @@  static void vfio_listener_log_sync(MemoryListener *listener,
 }
 
 static const MemoryListener vfio_memory_listener = {
+    .begin = vfio_listener_begin,
+    .commit = vfio_listener_commit,
     .region_add = vfio_listener_region_add,
     .region_del = vfio_listener_region_del,
     .log_global_start = vfio_listener_log_global_start,
@@ -1561,6 +1598,7 @@  int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
     region->size = info->size;
     region->fd_offset = info->offset;
     region->nr = index;
+    region->post_wr = false;
     region->remfd = vfio_get_region_info_remfd(vbasedev, index);
 
     if (region->size) {
@@ -2047,6 +2085,7 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     container->dirty_pages_supported = false;
     container->dma_max_mappings = 0;
     container->io_ops = &vfio_cont_io_ioctl;
+    container->need_map_fd = false;
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
     QLIST_INIT(&container->vrdl_list);
@@ -2230,6 +2269,7 @@  void vfio_connect_proxy(VFIOProxy *proxy, VFIOGroup *group, AddressSpace *as)
     container->space = space;
     container->fd = -1;
     container->io_ops = &vfio_cont_io_sock;
+    container->need_map_fd = (proxy->flags & VFIO_PROXY_SECURE) == 0;
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
     container->proxy = proxy;
@@ -2879,8 +2919,14 @@  static int vfio_io_dirty_bitmap(VFIOContainer *container,
     return ret;
 }
 
+static void vfio_io_wait_commit(VFIOContainer *container)
+{
+    /* ioctl()s are synchronous */
+}
+
 VFIOContIO vfio_cont_io_ioctl = {
     .dma_map = vfio_io_dma_map,
     .dma_unmap = vfio_io_dma_unmap,
     .dirty_bitmap = vfio_io_dirty_bitmap,
+    .wait_commit = vfio_io_wait_commit,
 };
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d657b01..ca821da 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3516,6 +3516,9 @@  static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
     vbasedev->proxy = proxy;
     vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev);
 
+    if (udev->secure_dma) {
+        proxy->flags |= VFIO_PROXY_SECURE;
+    }
     if (udev->send_queued) {
         proxy->flags |= VFIO_PROXY_FORCE_QUEUED;
     }
@@ -3686,6 +3689,7 @@  static void vfio_user_instance_finalize(Object *obj)
 
 static Property vfio_user_pci_dev_properties[] = {
     DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name),
+    DEFINE_PROP_BOOL("secure-dma", VFIOUserPCIDevice, secure_dma, false),
     DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
     DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
     DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index 70fe7a6..cee08b6 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -52,8 +52,11 @@  static void vfio_user_request(void *opaque);
 static int vfio_user_send_queued(VFIOProxy *proxy, VFIOUserMsg *msg);
 static void vfio_user_send_async(VFIOProxy *proxy, VFIOUserHdr *hdr,
                                  VFIOUserFDs *fds);
+static void vfio_user_send_nowait(VFIOProxy *proxy, VFIOUserHdr *hdr,
+                                  VFIOUserFDs *fds, int rsize);
 static void vfio_user_send_wait(VFIOProxy *proxy, VFIOUserHdr *hdr,
                                 VFIOUserFDs *fds, int rsize, bool nobql);
+static void vfio_user_wait_reqs(VFIOProxy *proxy);
 static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
                                   uint32_t size, uint32_t flags);
 
@@ -562,6 +565,36 @@  static void vfio_user_send_async(VFIOProxy *proxy, VFIOUserHdr *hdr,
     }
 }
 
+/*
+ * nowait send - vfio_wait_reqs() can wait for it later
+ */
+static void vfio_user_send_nowait(VFIOProxy *proxy, VFIOUserHdr *hdr,
+                                  VFIOUserFDs *fds, int rsize)
+{
+    VFIOUserMsg *msg;
+    int ret;
+
+    if (hdr->flags & VFIO_USER_NO_REPLY) {
+        error_printf("vfio_user_send_nowait on async message\n");
+        return;
+    }
+
+    QEMU_LOCK_GUARD(&proxy->lock);
+
+    msg = vfio_user_getmsg(proxy, hdr, fds);
+    msg->id = hdr->id;
+    msg->rsize = rsize ? rsize : hdr->size;
+    msg->type = VFIO_MSG_NOWAIT;
+
+    ret = vfio_user_send_queued(proxy, msg);
+    if (ret < 0) {
+        vfio_user_recycle(proxy, msg);
+        return;
+    }
+
+    proxy->last_nowait = msg;
+}
+
 static void vfio_user_send_wait(VFIOProxy *proxy, VFIOUserHdr *hdr,
                                 VFIOUserFDs *fds, int rsize, bool nobql)
 {
@@ -610,6 +643,56 @@  static void vfio_user_send_wait(VFIOProxy *proxy, VFIOUserHdr *hdr,
     }
 }
 
+static void vfio_user_wait_reqs(VFIOProxy *proxy)
+{
+    VFIOUserMsg *msg;
+    bool iolock = false;
+
+    /*
+     * Any DMA map/unmap requests sent in the middle
+     * of a memory region transaction were sent nowait.
+     * Wait for them here.
+     */
+    qemu_mutex_lock(&proxy->lock);
+    if (proxy->last_nowait != NULL) {
+        iolock = qemu_mutex_iothread_locked();
+        if (iolock) {
+            qemu_mutex_unlock_iothread();
+        }
+
+        /*
+         * Change type to WAIT to wait for reply
+         */
+        msg = proxy->last_nowait;
+        msg->type = VFIO_MSG_WAIT;
+        while (!msg->complete) {
+            if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
+                error_printf("vfio_wait_reqs - timed out\n");
+                break;
+            }
+        }
+
+        if (msg->hdr->flags & VFIO_USER_ERROR) {
+            error_printf("vfio_user_wait_reqs - error reply on async request ");
+            error_printf("command %x error %s\n", msg->hdr->command,
+                         strerror(msg->hdr->error_reply));
+        }
+
+        proxy->last_nowait = NULL;
+        /*
+         * Change type back to NOWAIT to free
+         */
+        msg->type = VFIO_MSG_NOWAIT;
+        vfio_user_recycle(proxy, msg);
+    }
+
+    /* lock order is BQL->proxy - don't hold proxy when getting BQL */
+    qemu_mutex_unlock(&proxy->lock);
+    if (iolock) {
+        qemu_mutex_lock_iothread();
+    }
+}
+
 static QLIST_HEAD(, VFIOProxy) vfio_user_sockets =
     QLIST_HEAD_INITIALIZER(vfio_user_sockets);
 
@@ -935,6 +1018,102 @@  int vfio_user_validate_version(VFIODevice *vbasedev, Error **errp)
     return 0;
 }
 
+static int vfio_user_dma_map(VFIOProxy *proxy,
+                             struct vfio_iommu_type1_dma_map *map,
+                             int fd, bool will_commit)
+{
+    VFIOUserFDs *fds = NULL;
+    VFIOUserDMAMap *msgp = g_malloc0(sizeof(*msgp));
+    int ret;
+
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_DMA_MAP, sizeof(*msgp), 0);
+    msgp->argsz = map->argsz;
+    msgp->flags = map->flags;
+    msgp->offset = map->vaddr;
+    msgp->iova = map->iova;
+    msgp->size = map->size;
+
+    /*
+     * The will_commit case sends without blocking or dropping BQL.
+     * They're later waited for in vfio_send_wait_reqs.
+     */
+    if (will_commit) {
+        /* can't use auto variable since we don't block */
+        if (fd != -1) {
+            fds = vfio_user_getfds(1);
+            fds->send_fds = 1;
+            fds->fds[0] = fd;
+        }
+        vfio_user_send_nowait(proxy, &msgp->hdr, fds, 0);
+        ret = 0;
+    } else {
+        VFIOUserFDs local_fds = { 1, 0, &fd };
+
+        fds = fd != -1 ? &local_fds : NULL;
+        vfio_user_send_wait(proxy, &msgp->hdr, fds, 0, will_commit);
+        ret = (msgp->hdr.flags & VFIO_USER_ERROR) ? -msgp->hdr.error_reply : 0;
+        g_free(msgp);
+    }
+
+    return ret;
+}
+
+static int vfio_user_dma_unmap(VFIOProxy *proxy,
+                               struct vfio_iommu_type1_dma_unmap *unmap,
+                               struct vfio_bitmap *bitmap, bool will_commit)
+{
+    struct {
+        VFIOUserDMAUnmap msg;
+        VFIOUserBitmap bitmap;
+    } *msgp = NULL;
+    int msize, rsize;
+    bool blocking = !will_commit;
+
+    if (bitmap == NULL &&
+        (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)) {
+        error_printf("vfio_user_dma_unmap mismatched flags and bitmap\n");
+        return -EINVAL;
+    }
+
+    /*
+     * If a dirty bitmap is returned, allocate extra space for it
+     * and block for reply even in the will_commit case.
+     * Otherwise, can send the unmap request without waiting.
+     */
+    if (bitmap != NULL) {
+        blocking = true;
+        msize = sizeof(*msgp);
+        rsize = msize + bitmap->size;
+        msgp = g_malloc0(rsize);
+        msgp->bitmap.pgsize = bitmap->pgsize;
+        msgp->bitmap.size = bitmap->size;
+    } else {
+        msize = rsize = sizeof(VFIOUserDMAUnmap);
+        msgp = g_malloc0(rsize);
+    }
+
+    vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DMA_UNMAP, msize, 0);
+    msgp->msg.argsz = unmap->argsz;
+    msgp->msg.flags = unmap->flags;
+    msgp->msg.iova = unmap->iova;
+    msgp->msg.size = unmap->size;
+
+    if (blocking) {
+        vfio_user_send_wait(proxy, &msgp->msg.hdr, NULL, rsize, will_commit);
+        if (msgp->msg.hdr.flags & VFIO_USER_ERROR) {
+            return -msgp->msg.hdr.error_reply;
+        }
+        if (bitmap != NULL) {
+            memcpy(bitmap->data, &msgp->bitmap.data, bitmap->size);
+        }
+        g_free(msgp);
+    } else {
+        vfio_user_send_nowait(proxy, &msgp->msg.hdr, NULL, rsize);
+    }
+
+    return 0;
+}
+
 static int vfio_user_get_info(VFIOProxy *proxy, struct vfio_device_info *info)
 {
     VFIOUserDeviceInfo msg;
@@ -1225,5 +1404,32 @@  VFIODevIO vfio_dev_io_sock = {
 };
 
 
+static int vfio_user_io_dma_map(VFIOContainer *container,
+                                struct vfio_iommu_type1_dma_map *map,
+                                int fd, bool will_commit)
+{
+    if (fd != -1) {
+        return vfio_user_dma_map(container->proxy, map, fd, will_commit);
+    } else {
+        map->vaddr = 0;
+        return vfio_user_dma_map(container->proxy, map, -1, will_commit);
+    }
+}
+
+static int vfio_user_io_dma_unmap(VFIOContainer *container,
+                                  struct vfio_iommu_type1_dma_unmap *unmap,
+                                  struct vfio_bitmap *bitmap, bool will_commit)
+{
+    return vfio_user_dma_unmap(container->proxy, unmap, bitmap, will_commit);
+}
+
+static void vfio_user_io_wait_commit(VFIOContainer *container)
+{
+    vfio_user_wait_reqs(container->proxy);
+}
+
 VFIOContIO vfio_cont_io_sock = {
+    .dma_map = vfio_user_io_dma_map,
+    .dma_unmap = vfio_user_io_dma_unmap,
+    .wait_commit = vfio_user_io_wait_commit,
 };