diff mbox series

[v2,12/23] vfio-user: region read/write

Message ID 83ec17255d41c90eb3950364dd853b240398705b.1675228037.git.john.g.johnson@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user client | expand

Commit Message

John Johnson Feb. 2, 2023, 5:55 a.m. UTC
Add support for posted writes on remote devices

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 hw/vfio/user-protocol.h       |  12 +++++
 hw/vfio/user.h                |   1 +
 include/hw/vfio/vfio-common.h |   3 +-
 hw/vfio/common.c              |  23 ++++++---
 hw/vfio/pci.c                 |   5 +-
 hw/vfio/user-pci.c            |   5 ++
 hw/vfio/user.c                | 112 ++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |   1 +
 8 files changed, 154 insertions(+), 8 deletions(-)

Comments

Alex Williamson Feb. 6, 2023, 7:07 p.m. UTC | #1
On Wed,  1 Feb 2023 21:55:48 -0800
John Johnson <john.g.johnson@oracle.com> wrote:

> Add support for posted writes on remote devices
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  hw/vfio/user-protocol.h       |  12 +++++
>  hw/vfio/user.h                |   1 +
>  include/hw/vfio/vfio-common.h |   3 +-
>  hw/vfio/common.c              |  23 ++++++---
>  hw/vfio/pci.c                 |   5 +-
>  hw/vfio/user-pci.c            |   5 ++
>  hw/vfio/user.c                | 112 ++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |   1 +
>  8 files changed, 154 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
> index 6f70a48..6987435 100644
> --- a/hw/vfio/user-protocol.h
> +++ b/hw/vfio/user-protocol.h
> @@ -139,4 +139,16 @@ typedef struct {
>      uint64_t offset;
>  } VFIOUserRegionInfo;
>  
> +/*
> + * VFIO_USER_REGION_READ
> + * VFIO_USER_REGION_WRITE
> + */
> +typedef struct {
> +    VFIOUserHdr hdr;
> +    uint64_t offset;
> +    uint32_t region;
> +    uint32_t count;
> +    char data[];
> +} VFIOUserRegionRW;
> +
>  #endif /* VFIO_USER_PROTOCOL_H */
> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
> index e6485dc..3012a86 100644
> --- a/hw/vfio/user.h
> +++ b/hw/vfio/user.h
> @@ -84,6 +84,7 @@ typedef struct VFIOUserProxy {
>  /* VFIOProxy flags */
>  #define VFIO_PROXY_CLIENT        0x1
>  #define VFIO_PROXY_FORCE_QUEUED  0x4
> +#define VFIO_PROXY_NO_POST       0x8
>  
>  VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp);
>  void vfio_user_disconnect(VFIOUserProxy *proxy);
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 9fb4c80..bbc4b15 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -57,6 +57,7 @@ typedef struct VFIORegion {
>      VFIOMmap *mmaps;
>      uint8_t nr; /* cache the region number for debug */
>      int fd; /* fd to mmap() region */
> +    bool post_wr; /* writes can be posted */
>  } VFIORegion;
>  
>  typedef struct VFIOMigration {
> @@ -180,7 +181,7 @@ struct VFIODeviceIO {
>      int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
>                         void *data);
>      int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> -                        void *data);
> +                        void *data, bool post);
>  };
>  
>  struct VFIOContainerIO {
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index d26b325..de64e53 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          uint32_t dword;
>          uint64_t qword;
>      } buf;
> +    bool post = region->post_wr;
>      int ret;
>  
>      switch (size) {
> @@ -235,12 +236,19 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          break;
>      }
>  
> -    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf);
> +    /* read-after-write hazard if guest can directly access region */
> +    if (region->nr_mmaps) {
> +        post = false;
> +    }
> +    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf,
> +                                     post);
>      if (ret != size) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short write";
> +
>          error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
> -                     ",%d) failed: %m",
> +                     ",%d) failed: %s",
>                       __func__, vbasedev->name, region->nr,
> -                     addr, data, size);
> +                     addr, data, size, err);
>      }
>      trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
>  
> @@ -271,9 +279,11 @@ uint64_t vfio_region_read(void *opaque,
>  
>      ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf);
>      if (ret != size) {
> -        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
>                       __func__, vbasedev->name, region->nr,
> -                     addr, size);
> +                     addr, size, err);
>          return (uint64_t)-1;
>      }
>  
> @@ -1584,6 +1594,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>      region->size = info->size;
>      region->fd_offset = info->offset;
>      region->nr = index;
> +    region->post_wr = false;
>      if (vbasedev->regfds != NULL) {
>          region->fd = vbasedev->regfds[index];
>      } else {
> @@ -2711,7 +2722,7 @@ static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off,
>  }
>  
>  static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
> -                                uint32_t size, void *data)
> +                                uint32_t size, void *data, bool post)
>  {

This is all a bit confusing as a non-posted write on bare metal would
require a follow-up read to flush the write, but in the kernel case we
rely on both the bus protocols and the userspace driver to perform such
actions to enforce coherency.  The read-after-write hazard comment above
suggests the issue is split access between mmap and read/write across
the region, where the mmap access bypasses the socket protocol.  But
isn't this actually across the whole device?  For example, PCI doesn't
care which BAR a write targets, reads to another BAR cannot bypass the
write, aiui.  IOW, couldn't a write to a BAR that doesn't support mmap
affect the behavior of a BAR that does support mmap, and therefore
there should be no posted writes for the entire device if any region
supports mmap access?

I wonder if a better approach wouldn't be to make all writes operations
synchronous and avoid read-after-write hazard by performing writes
through the mmap when available, ie. make use of the same bypass to
avoid the hazard.

In any case, I think this deserves more comments, both why vfio-user
needs it and why vfio-kernel doesn't, and ideally the initial
read/write implementation would be entirely synchronous with posted
write support added as a separate patch with full explanation and
justification.  Thanks,

Alex
John Johnson Feb. 8, 2023, 6:38 a.m. UTC | #2
> On Feb 6, 2023, at 11:07 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Wed,  1 Feb 2023 21:55:48 -0800
> John Johnson <john.g.johnson@oracle.com> wrote:
> 
>> Add support for posted writes on remote devices
>> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> hw/vfio/user-protocol.h       |  12 +++++
>> hw/vfio/user.h                |   1 +
>> include/hw/vfio/vfio-common.h |   3 +-
>> hw/vfio/common.c              |  23 ++++++---
>> hw/vfio/pci.c                 |   5 +-
>> hw/vfio/user-pci.c            |   5 ++
>> hw/vfio/user.c                | 112 ++++++++++++++++++++++++++++++++++++++++++
>> hw/vfio/trace-events          |   1 +
>> 8 files changed, 154 insertions(+), 8 deletions(-)
>> 
>> diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
>> index 6f70a48..6987435 100644
>> --- a/hw/vfio/user-protocol.h
>> +++ b/hw/vfio/user-protocol.h
>> @@ -139,4 +139,16 @@ typedef struct {
>>     uint64_t offset;
>> } VFIOUserRegionInfo;
>> 
>> +/*
>> + * VFIO_USER_REGION_READ
>> + * VFIO_USER_REGION_WRITE
>> + */
>> +typedef struct {
>> +    VFIOUserHdr hdr;
>> +    uint64_t offset;
>> +    uint32_t region;
>> +    uint32_t count;
>> +    char data[];
>> +} VFIOUserRegionRW;
>> +
>> #endif /* VFIO_USER_PROTOCOL_H */
>> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
>> index e6485dc..3012a86 100644
>> --- a/hw/vfio/user.h
>> +++ b/hw/vfio/user.h
>> @@ -84,6 +84,7 @@ typedef struct VFIOUserProxy {
>> /* VFIOProxy flags */
>> #define VFIO_PROXY_CLIENT        0x1
>> #define VFIO_PROXY_FORCE_QUEUED  0x4
>> +#define VFIO_PROXY_NO_POST       0x8
>> 
>> VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp);
>> void vfio_user_disconnect(VFIOUserProxy *proxy);
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 9fb4c80..bbc4b15 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -57,6 +57,7 @@ typedef struct VFIORegion {
>>     VFIOMmap *mmaps;
>>     uint8_t nr; /* cache the region number for debug */
>>     int fd; /* fd to mmap() region */
>> +    bool post_wr; /* writes can be posted */
>> } VFIORegion;
>> 
>> typedef struct VFIOMigration {
>> @@ -180,7 +181,7 @@ struct VFIODeviceIO {
>>     int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
>>                        void *data);
>>     int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
>> -                        void *data);
>> +                        void *data, bool post);
>> };
>> 
>> struct VFIOContainerIO {
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index d26b325..de64e53 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>>         uint32_t dword;
>>         uint64_t qword;
>>     } buf;
>> +    bool post = region->post_wr;
>>     int ret;
>> 
>>     switch (size) {
>> @@ -235,12 +236,19 @@ void vfio_region_write(void *opaque, hwaddr addr,
>>         break;
>>     }
>> 
>> -    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf);
>> +    /* read-after-write hazard if guest can directly access region */
>> +    if (region->nr_mmaps) {
>> +        post = false;
>> +    }
>> +    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf,
>> +                                     post);
>>     if (ret != size) {
>> +        const char *err = ret < 0 ? strerror(-ret) : "short write";
>> +
>>         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
>> -                     ",%d) failed: %m",
>> +                     ",%d) failed: %s",
>>                      __func__, vbasedev->name, region->nr,
>> -                     addr, data, size);
>> +                     addr, data, size, err);
>>     }
>>     trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
>> 
>> @@ -271,9 +279,11 @@ uint64_t vfio_region_read(void *opaque,
>> 
>>     ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf);
>>     if (ret != size) {
>> -        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
>> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
>> +
>> +        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
>>                      __func__, vbasedev->name, region->nr,
>> -                     addr, size);
>> +                     addr, size, err);
>>         return (uint64_t)-1;
>>     }
>> 
>> @@ -1584,6 +1594,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>>     region->size = info->size;
>>     region->fd_offset = info->offset;
>>     region->nr = index;
>> +    region->post_wr = false;
>>     if (vbasedev->regfds != NULL) {
>>         region->fd = vbasedev->regfds[index];
>>     } else {
>> @@ -2711,7 +2722,7 @@ static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off,
>> }
>> 
>> static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
>> -                                uint32_t size, void *data)
>> +                                uint32_t size, void *data, bool post)
>> {
> 
> This is all a bit confusing as a non-posted write on bare metal would
> require a follow-up read to flush the write, but in the kernel case we
> rely on both the bus protocols and the userspace driver to perform such
> actions to enforce coherency.  The read-after-write hazard comment above
> suggests the issue is split access between mmap and read/write across
> the region, where the mmap access bypasses the socket protocol.  But
> isn't this actually across the whole device?  For example, PCI doesn't
> care which BAR a write targets, reads to another BAR cannot bypass the
> write, aiui.  IOW, couldn't a write to a BAR that doesn't support mmap
> affect the behavior of a BAR that does support mmap, and therefore
> there should be no posted writes for the entire device if any region
> supports mmap access?
> 

	The protocol has sequential ordering, so issues arise only when
reads to mapped regions pass writes that were sent asynchronously.  The code
is designed to handle common driver ordering operations: reading config space
(which can’t be mapped, so they’re sequentially ordered by the protocol)
and reading a nearby register (in the same region).  There is a ’no-post’
vfio-user option in case the driver relies on ordering reads to other
(non-config) regions

	I made this the default since of the the primary customers is an
NVME device server that polls a mapped doorbell region and wants posted
writes.


> I wonder if a better approach wouldn't be to make all writes operations
> synchronous and avoid read-after-write hazard by performing writes
> through the mmap when available, ie. make use of the same bypass to
> avoid the hazard.
> 

	I can flip the default to be strict PCI, and force the NVME
folks to use an option to enable performance.


> In any case, I think this deserves more comments, both why vfio-user
> needs it and why vfio-kernel doesn't, and ideally the initial
> read/write implementation would be entirely synchronous with posted
> write support added as a separate patch with full explanation and
> justification.  Thanks,
> 

	Sure.  Ordering is always a pain to reason about.

						JJ
Alex Williamson Feb. 8, 2023, 8:33 p.m. UTC | #3
On Wed, 8 Feb 2023 06:38:27 +0000
John Johnson <john.g.johnson@oracle.com> wrote:

> > On Feb 6, 2023, at 11:07 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > On Wed,  1 Feb 2023 21:55:48 -0800
> > John Johnson <john.g.johnson@oracle.com> wrote:
> >   
> >> Add support for posted writes on remote devices
> >> 
> >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >> ---
> >> hw/vfio/user-protocol.h       |  12 +++++
> >> hw/vfio/user.h                |   1 +
> >> include/hw/vfio/vfio-common.h |   3 +-
> >> hw/vfio/common.c              |  23 ++++++---
> >> hw/vfio/pci.c                 |   5 +-
> >> hw/vfio/user-pci.c            |   5 ++
> >> hw/vfio/user.c                | 112 ++++++++++++++++++++++++++++++++++++++++++
> >> hw/vfio/trace-events          |   1 +
> >> 8 files changed, 154 insertions(+), 8 deletions(-)
> >> 
> >> diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
> >> index 6f70a48..6987435 100644
> >> --- a/hw/vfio/user-protocol.h
> >> +++ b/hw/vfio/user-protocol.h
> >> @@ -139,4 +139,16 @@ typedef struct {
> >>     uint64_t offset;
> >> } VFIOUserRegionInfo;
> >> 
> >> +/*
> >> + * VFIO_USER_REGION_READ
> >> + * VFIO_USER_REGION_WRITE
> >> + */
> >> +typedef struct {
> >> +    VFIOUserHdr hdr;
> >> +    uint64_t offset;
> >> +    uint32_t region;
> >> +    uint32_t count;
> >> +    char data[];
> >> +} VFIOUserRegionRW;
> >> +
> >> #endif /* VFIO_USER_PROTOCOL_H */
> >> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
> >> index e6485dc..3012a86 100644
> >> --- a/hw/vfio/user.h
> >> +++ b/hw/vfio/user.h
> >> @@ -84,6 +84,7 @@ typedef struct VFIOUserProxy {
> >> /* VFIOProxy flags */
> >> #define VFIO_PROXY_CLIENT        0x1
> >> #define VFIO_PROXY_FORCE_QUEUED  0x4
> >> +#define VFIO_PROXY_NO_POST       0x8
> >> 
> >> VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp);
> >> void vfio_user_disconnect(VFIOUserProxy *proxy);
> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >> index 9fb4c80..bbc4b15 100644
> >> --- a/include/hw/vfio/vfio-common.h
> >> +++ b/include/hw/vfio/vfio-common.h
> >> @@ -57,6 +57,7 @@ typedef struct VFIORegion {
> >>     VFIOMmap *mmaps;
> >>     uint8_t nr; /* cache the region number for debug */
> >>     int fd; /* fd to mmap() region */
> >> +    bool post_wr; /* writes can be posted */
> >> } VFIORegion;
> >> 
> >> typedef struct VFIOMigration {
> >> @@ -180,7 +181,7 @@ struct VFIODeviceIO {
> >>     int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> >>                        void *data);
> >>     int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> >> -                        void *data);
> >> +                        void *data, bool post);
> >> };
> >> 
> >> struct VFIOContainerIO {
> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >> index d26b325..de64e53 100644
> >> --- a/hw/vfio/common.c
> >> +++ b/hw/vfio/common.c
> >> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
> >>         uint32_t dword;
> >>         uint64_t qword;
> >>     } buf;
> >> +    bool post = region->post_wr;
> >>     int ret;
> >> 
> >>     switch (size) {
> >> @@ -235,12 +236,19 @@ void vfio_region_write(void *opaque, hwaddr addr,
> >>         break;
> >>     }
> >> 
> >> -    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf);
> >> +    /* read-after-write hazard if guest can directly access region */
> >> +    if (region->nr_mmaps) {
> >> +        post = false;
> >> +    }
> >> +    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf,
> >> +                                     post);
> >>     if (ret != size) {
> >> +        const char *err = ret < 0 ? strerror(-ret) : "short write";
> >> +
> >>         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
> >> -                     ",%d) failed: %m",
> >> +                     ",%d) failed: %s",
> >>                      __func__, vbasedev->name, region->nr,
> >> -                     addr, data, size);
> >> +                     addr, data, size, err);
> >>     }
> >>     trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
> >> 
> >> @@ -271,9 +279,11 @@ uint64_t vfio_region_read(void *opaque,
> >> 
> >>     ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf);
> >>     if (ret != size) {
> >> -        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
> >> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> >> +
> >> +        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
> >>                      __func__, vbasedev->name, region->nr,
> >> -                     addr, size);
> >> +                     addr, size, err);
> >>         return (uint64_t)-1;
> >>     }
> >> 
> >> @@ -1584,6 +1594,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
> >>     region->size = info->size;
> >>     region->fd_offset = info->offset;
> >>     region->nr = index;
> >> +    region->post_wr = false;
> >>     if (vbasedev->regfds != NULL) {
> >>         region->fd = vbasedev->regfds[index];
> >>     } else {
> >> @@ -2711,7 +2722,7 @@ static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off,
> >> }
> >> 
> >> static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
> >> -                                uint32_t size, void *data)
> >> +                                uint32_t size, void *data, bool post)
> >> {  
> > 
> > This is all a bit confusing as a non-posted write on bare metal would
> > require a follow-up read to flush the write, but in the kernel case we
> > rely on both the bus protocols and the userspace driver to perform such
> > actions to enforce coherency.  The read-after-write hazard comment above
> > suggests the issue is split access between mmap and read/write across
> > the region, where the mmap access bypasses the socket protocol.  But
> > isn't this actually across the whole device?  For example, PCI doesn't
> > care which BAR a write targets, reads to another BAR cannot bypass the
> > write, aiui.  IOW, couldn't a write to a BAR that doesn't support mmap
> > affect the behavior of a BAR that does support mmap, and therefore
> > there should be no posted writes for the entire device if any region
> > supports mmap access?
> >   
> 
> 	The protocol has sequential ordering, so issues arise only when
> reads to mapped regions pass writes that were sent asynchronously.  The code
> is designed to handle common driver ordering operations: reading config space
> (which can’t be mapped, so they’re sequentially ordered by the protocol)
> and reading a nearby register (in the same region).  There is a ’no-post’
> vfio-user option in case the driver relies on ordering reads to other
> (non-config) regions
> 
> 	I made this the default since of the the primary customers is an
> NVME device server that polls a mapped doorbell region and wants posted
> writes.

This sounds like we're targeting a very specific use case then since
the code explicitly disables posted writes when mmaps within the same
region are available.  Another device might have these spread across
different regions, where the mmap test might not force the non-posted
write behavior and the user write could return with the operation still
in-flight through the socket protocol, allowing a read through the mmap
to bypass.

Device options are great for debugging and isolating certain features,
but users expect that devices "just work".

> > I wonder if a better approach wouldn't be to make all writes operations
> > synchronous and avoid read-after-write hazard by performing writes
> > through the mmap when available, ie. make use of the same bypass to
> > avoid the hazard.
> >   
> 
> 	I can flip the default to be strict PCI, and force the NVME
> folks to use an option to enable performance.

All the physical NVME devices I have around only have a single BAR, so
the test to disable posted writes if mmaps are available within the
same region, ie. BAR, would necessarily cause all writes through the
trapped space to be non-posted.  Is your NVME device already working
with a non-standard layout to "enable performance"?

I'd rather see default behavior that focuses on creating a compatible
interface to bare metal, which I think means we need to demote all
writes to non-posted when there's a mechanism where reads can bypass
writes.  From that we can build quirks for known devices which can
overlay memory regions that only perform posted writes.  Perhaps a
side-band interface modeled after sparse mmaps would allow
standardization of such a device quirk to allow the device on the
server side to direct areas where posted writes are always safe.
Thanks,

Alex
John Johnson Feb. 10, 2023, 5:28 a.m. UTC | #4
> On Feb 8, 2023, at 12:33 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Wed, 8 Feb 2023 06:38:27 +0000
> John Johnson <john.g.johnson@oracle.com> wrote:
> 
>>> On Feb 6, 2023, at 11:07 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>> 
>>> On Wed,  1 Feb 2023 21:55:48 -0800
>>> John Johnson <john.g.johnson@oracle.com> wrote:
>>> 
>>>> Add support for posted writes on remote devices
>>>> 
>>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>>>> ---
>>>> hw/vfio/user-protocol.h       |  12 +++++
>>>> hw/vfio/user.h                |   1 +
>>>> include/hw/vfio/vfio-common.h |   3 +-
>>>> hw/vfio/common.c              |  23 ++++++---
>>>> hw/vfio/pci.c                 |   5 +-
>>>> hw/vfio/user-pci.c            |   5 ++
>>>> hw/vfio/user.c                | 112 ++++++++++++++++++++++++++++++++++++++++++
>>>> hw/vfio/trace-events          |   1 +
>>>> 8 files changed, 154 insertions(+), 8 deletions(-)
>>>> 
>>>> diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
>>>> index 6f70a48..6987435 100644
>>>> --- a/hw/vfio/user-protocol.h
>>>> +++ b/hw/vfio/user-protocol.h
>>>> @@ -139,4 +139,16 @@ typedef struct {
>>>>    uint64_t offset;
>>>> } VFIOUserRegionInfo;
>>>> 
>>>> +/*
>>>> + * VFIO_USER_REGION_READ
>>>> + * VFIO_USER_REGION_WRITE
>>>> + */
>>>> +typedef struct {
>>>> +    VFIOUserHdr hdr;
>>>> +    uint64_t offset;
>>>> +    uint32_t region;
>>>> +    uint32_t count;
>>>> +    char data[];
>>>> +} VFIOUserRegionRW;
>>>> +
>>>> #endif /* VFIO_USER_PROTOCOL_H */
>>>> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
>>>> index e6485dc..3012a86 100644
>>>> --- a/hw/vfio/user.h
>>>> +++ b/hw/vfio/user.h
>>>> @@ -84,6 +84,7 @@ typedef struct VFIOUserProxy {
>>>> /* VFIOProxy flags */
>>>> #define VFIO_PROXY_CLIENT        0x1
>>>> #define VFIO_PROXY_FORCE_QUEUED  0x4
>>>> +#define VFIO_PROXY_NO_POST       0x8
>>>> 
>>>> VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp);
>>>> void vfio_user_disconnect(VFIOUserProxy *proxy);
>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>> index 9fb4c80..bbc4b15 100644
>>>> --- a/include/hw/vfio/vfio-common.h
>>>> +++ b/include/hw/vfio/vfio-common.h
>>>> @@ -57,6 +57,7 @@ typedef struct VFIORegion {
>>>>    VFIOMmap *mmaps;
>>>>    uint8_t nr; /* cache the region number for debug */
>>>>    int fd; /* fd to mmap() region */
>>>> +    bool post_wr; /* writes can be posted */
>>>> } VFIORegion;
>>>> 
>>>> typedef struct VFIOMigration {
>>>> @@ -180,7 +181,7 @@ struct VFIODeviceIO {
>>>>    int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
>>>>                       void *data);
>>>>    int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
>>>> -                        void *data);
>>>> +                        void *data, bool post);
>>>> };
>>>> 
>>>> struct VFIOContainerIO {
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index d26b325..de64e53 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>>>>        uint32_t dword;
>>>>        uint64_t qword;
>>>>    } buf;
>>>> +    bool post = region->post_wr;
>>>>    int ret;
>>>> 
>>>>    switch (size) {
>>>> @@ -235,12 +236,19 @@ void vfio_region_write(void *opaque, hwaddr addr,
>>>>        break;
>>>>    }
>>>> 
>>>> -    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf);
>>>> +    /* read-after-write hazard if guest can directly access region */
>>>> +    if (region->nr_mmaps) {
>>>> +        post = false;
>>>> +    }
>>>> +    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf,
>>>> +                                     post);
>>>>    if (ret != size) {
>>>> +        const char *err = ret < 0 ? strerror(-ret) : "short write";
>>>> +
>>>>        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
>>>> -                     ",%d) failed: %m",
>>>> +                     ",%d) failed: %s",
>>>>                     __func__, vbasedev->name, region->nr,
>>>> -                     addr, data, size);
>>>> +                     addr, data, size, err);
>>>>    }
>>>>    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
>>>> 
>>>> @@ -271,9 +279,11 @@ uint64_t vfio_region_read(void *opaque,
>>>> 
>>>>    ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf);
>>>>    if (ret != size) {
>>>> -        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
>>>> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
>>>> +
>>>> +        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
>>>>                     __func__, vbasedev->name, region->nr,
>>>> -                     addr, size);
>>>> +                     addr, size, err);
>>>>        return (uint64_t)-1;
>>>>    }
>>>> 
>>>> @@ -1584,6 +1594,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>>>>    region->size = info->size;
>>>>    region->fd_offset = info->offset;
>>>>    region->nr = index;
>>>> +    region->post_wr = false;
>>>>    if (vbasedev->regfds != NULL) {
>>>>        region->fd = vbasedev->regfds[index];
>>>>    } else {
>>>> @@ -2711,7 +2722,7 @@ static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off,
>>>> }
>>>> 
>>>> static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
>>>> -                                uint32_t size, void *data)
>>>> +                                uint32_t size, void *data, bool post)
>>>> {  
>>> 
>>> This is all a bit confusing as a non-posted write on bare metal would
>>> require a follow-up read to flush the write, but in the kernel case we
>>> rely on both the bus protocols and the userspace driver to perform such
>>> actions to enforce coherency.  The read-after-write hazard comment above
>>> suggests the issue is split access between mmap and read/write across
>>> the region, where the mmap access bypasses the socket protocol.  But
>>> isn't this actually across the whole device?  For example, PCI doesn't
>>> care which BAR a write targets, reads to another BAR cannot bypass the
>>> write, aiui.  IOW, couldn't a write to a BAR that doesn't support mmap
>>> affect the behavior of a BAR that does support mmap, and therefore
>>> there should be no posted writes for the entire device if any region
>>> supports mmap access?
>>> 
>> 
>> 	The protocol has sequential ordering, so issues arise only when
>> reads to mapped regions pass writes that were sent asynchronously.  The code
>> is designed to handle common driver ordering operations: reading config space
>> (which can’t be mapped, so they’re sequentially ordered by the protocol)
>> and reading a nearby register (in the same region).  There is a ’no-post’
>> vfio-user option in case the driver relies on ordering reads to other
>> (non-config) regions
>> 
>> 	I made this the default since of the the primary customers is an
>> NVME device server that polls a mapped doorbell region and wants posted
>> writes.
> 
> This sounds like we're targeting a very specific use case then since
> the code explicitly disables posted writes when mmaps within the same
> region are available.  Another device might have these spread across
> different regions, where the mmap test might not force the non-posted
> write behavior and the user write could return with the operation still
> in-flight through the socket protocol, allowing a read through the mmap
> to bypass.
> 
> Device options are great for debugging and isolating certain features,
> but users expect that devices "just work".
> 
>>> I wonder if a better approach wouldn't be to make all writes operations
>>> synchronous and avoid read-after-write hazard by performing writes
>>> through the mmap when available, ie. make use of the same bypass to
>>> avoid the hazard.
>>> 
>> 
>> 	I can flip the default to be strict PCI, and force the NVME
>> folks to use an option to enable performance.
> 
> All the physical NVME devices I have around only have a single BAR, so
> the test to disable posted writes if mmaps are available within the
> same region, ie. BAR, would necessarily cause all writes through the
> trapped space to be non-posted.  Is your NVME device already working
> with a non-standard layout to "enable performance"?
> 

	It looks like they have 1 BAR and use 2 different modes depending
on if they want to dedicate threads to polling the doorbells.


> I'd rather see default behavior that focuses on creating a compatible
> interface to bare metal, which I think means we need to demote all
> writes to non-posted when there's a mechanism where reads can bypass
> writes.  From that we can build quirks for known devices which can
> overlay memory regions that only perform posted writes.  Perhaps a
> side-band interface modeled after sparse mmaps would allow
> standardization of such a device quirk to allow the device on the
> server side to direct areas where posted writes are always safe.
> Thanks,
> 

	ok - the default will be strict PCI (no posted writes if any part
of the device is mapped)  If the NVME folks want a looser mode, I can add it
as an option.  I’m not sure this is a property of the HW, more of the driver
using config space reads for ordering, and not depending on memory region
access ordering.

								JJ
diff mbox series

Patch

diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
index 6f70a48..6987435 100644
--- a/hw/vfio/user-protocol.h
+++ b/hw/vfio/user-protocol.h
@@ -139,4 +139,16 @@  typedef struct {
     uint64_t offset;
 } VFIOUserRegionInfo;
 
+/*
+ * VFIO_USER_REGION_READ
+ * VFIO_USER_REGION_WRITE
+ */
+typedef struct {
+    VFIOUserHdr hdr;
+    uint64_t offset;
+    uint32_t region;
+    uint32_t count;
+    char data[];
+} VFIOUserRegionRW;
+
 #endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio/user.h b/hw/vfio/user.h
index e6485dc..3012a86 100644
--- a/hw/vfio/user.h
+++ b/hw/vfio/user.h
@@ -84,6 +84,7 @@  typedef struct VFIOUserProxy {
 /* VFIOProxy flags */
 #define VFIO_PROXY_CLIENT        0x1
 #define VFIO_PROXY_FORCE_QUEUED  0x4
+#define VFIO_PROXY_NO_POST       0x8
 
 VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp);
 void vfio_user_disconnect(VFIOUserProxy *proxy);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9fb4c80..bbc4b15 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -57,6 +57,7 @@  typedef struct VFIORegion {
     VFIOMmap *mmaps;
     uint8_t nr; /* cache the region number for debug */
     int fd; /* fd to mmap() region */
+    bool post_wr; /* writes can be posted */
 } VFIORegion;
 
 typedef struct VFIOMigration {
@@ -180,7 +181,7 @@  struct VFIODeviceIO {
     int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
                        void *data);
     int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
-                        void *data);
+                        void *data, bool post);
 };
 
 struct VFIOContainerIO {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index d26b325..de64e53 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -215,6 +215,7 @@  void vfio_region_write(void *opaque, hwaddr addr,
         uint32_t dword;
         uint64_t qword;
     } buf;
+    bool post = region->post_wr;
     int ret;
 
     switch (size) {
@@ -235,12 +236,19 @@  void vfio_region_write(void *opaque, hwaddr addr,
         break;
     }
 
-    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf);
+    /* read-after-write hazard if guest can directly access region */
+    if (region->nr_mmaps) {
+        post = false;
+    }
+    ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf,
+                                     post);
     if (ret != size) {
+        const char *err = ret < 0 ? strerror(-ret) : "short write";
+
         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
-                     ",%d) failed: %m",
+                     ",%d) failed: %s",
                      __func__, vbasedev->name, region->nr,
-                     addr, data, size);
+                     addr, data, size, err);
     }
     trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
 
@@ -271,9 +279,11 @@  uint64_t vfio_region_read(void *opaque,
 
     ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf);
     if (ret != size) {
-        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
+        const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
                      __func__, vbasedev->name, region->nr,
-                     addr, size);
+                     addr, size, err);
         return (uint64_t)-1;
     }
 
@@ -1584,6 +1594,7 @@  int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
     region->size = info->size;
     region->fd_offset = info->offset;
     region->nr = index;
+    region->post_wr = false;
     if (vbasedev->regfds != NULL) {
         region->fd = vbasedev->regfds[index];
     } else {
@@ -2711,7 +2722,7 @@  static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off,
 }
 
 static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
-                                uint32_t size, void *data)
+                                uint32_t size, void *data, bool post)
 {
     struct vfio_region_info *info = vbasedev->regions[index];
     int ret;
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 935d247..be714b7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -49,7 +49,7 @@ 
                                  (off), (size), (data)))
 #define VDEV_CONFIG_WRITE(vbasedev, off, size, data) \
     ((vbasedev)->io->region_write((vbasedev), VFIO_PCI_CONFIG_REGION_INDEX, \
-                                  (off), (size), (data)))
+                                  (off), (size), (data), false))
 
 #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
 
@@ -1702,6 +1702,9 @@  static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
     bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
                                          ~PCI_BASE_ADDRESS_MEM_MASK);
     bar->size = bar->region.size;
+
+    /* IO regions are sync, memory can be async */
+    bar->region.post_wr = (bar->ioport == 0);
 }
 
 static void vfio_bars_prepare(VFIOPCIDevice *vdev)
diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c
index 09c6c98..900ab5f 100644
--- a/hw/vfio/user-pci.c
+++ b/hw/vfio/user-pci.c
@@ -41,6 +41,7 @@  struct VFIOUserPCIDevice {
     VFIOPCIDevice device;
     char *sock_name;
     bool send_queued;   /* all sends are queued */
+    bool no_post;       /* all regions write are sync */
 };
 
 /*
@@ -105,6 +106,9 @@  static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
     if (udev->send_queued) {
         proxy->flags |= VFIO_PROXY_FORCE_QUEUED;
     }
+    if (udev->no_post) {
+        proxy->flags |= VFIO_PROXY_NO_POST;
+    }
 
     vfio_user_validate_version(proxy, &err);
     if (err != NULL) {
@@ -145,6 +149,7 @@  static void vfio_user_instance_finalize(Object *obj)
 static Property vfio_user_pci_dev_properties[] = {
     DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name),
     DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
+    DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index a05ba80..389c807 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -58,6 +58,8 @@  static void vfio_user_cb(void *opaque);
 
 static void vfio_user_request(void *opaque);
 static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg);
+static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+                                 VFIOUserFDs *fds);
 static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
                                 VFIOUserFDs *fds, int rsize, bool nobql);
 static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
@@ -631,6 +633,33 @@  static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg)
     return 0;
 }
 
+/*
+ * async send - msg can be queued, but will be freed when sent
+ */
+static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+                                 VFIOUserFDs *fds)
+{
+    VFIOUserMsg *msg;
+    int ret;
+
+    if (!(hdr->flags & (VFIO_USER_NO_REPLY | VFIO_USER_REPLY))) {
+        error_printf("vfio_user_send_async on sync message\n");
+        return;
+    }
+
+    QEMU_LOCK_GUARD(&proxy->lock);
+
+    msg = vfio_user_getmsg(proxy, hdr, fds);
+    msg->id = hdr->id;
+    msg->rsize = 0;
+    msg->type = VFIO_MSG_ASYNC;
+
+    ret = vfio_user_send_queued(proxy, msg);
+    if (ret < 0) {
+        vfio_user_recycle(proxy, msg);
+    }
+}
+
 static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
                                 VFIOUserFDs *fds, int rsize, bool nobql)
 {
@@ -1179,6 +1208,73 @@  static int vfio_user_get_region_info(VFIOUserProxy *proxy,
     return 0;
 }
 
+static int vfio_user_region_read(VFIOUserProxy *proxy, uint8_t index,
+                                 off_t offset, uint32_t count, void *data)
+{
+    g_autofree VFIOUserRegionRW *msgp = NULL;
+    int size = sizeof(*msgp) + count;
+
+    if (count > proxy->max_xfer_size) {
+        return -EINVAL;
+    }
+
+    msgp = g_malloc0(size);
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0);
+    msgp->offset = offset;
+    msgp->region = index;
+    msgp->count = count;
+    trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count);
+
+    vfio_user_send_wait(proxy, &msgp->hdr, NULL, size, false);
+    if (msgp->hdr.flags & VFIO_USER_ERROR) {
+        return -msgp->hdr.error_reply;
+    } else if (msgp->count > count) {
+        return -E2BIG;
+    } else {
+        memcpy(data, &msgp->data, msgp->count);
+    }
+
+    return msgp->count;
+}
+
+static int vfio_user_region_write(VFIOUserProxy *proxy, uint8_t index,
+                                  off_t offset, uint32_t count, void *data,
+                                  bool post)
+{
+    VFIOUserRegionRW *msgp = NULL;
+    int flags = post ? VFIO_USER_NO_REPLY : 0;
+    int size = sizeof(*msgp) + count;
+    int ret;
+
+    if (count > proxy->max_xfer_size) {
+        return -EINVAL;
+    }
+
+    msgp = g_malloc0(size);
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size, flags);
+    msgp->offset = offset;
+    msgp->region = index;
+    msgp->count = count;
+    memcpy(&msgp->data, data, count);
+    trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count);
+
+    /* async send will free msg after it's sent */
+    if (post && !(proxy->flags & VFIO_PROXY_NO_POST)) {
+        vfio_user_send_async(proxy, &msgp->hdr, NULL);
+        return count;
+    }
+
+    vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0, false);
+    if (msgp->hdr.flags & VFIO_USER_ERROR) {
+        ret = -msgp->hdr.error_reply;
+    } else {
+        ret = count;
+    }
+
+    g_free(msgp);
+    return ret;
+}
+
 
 /*
  * Socket-based io_ops
@@ -1208,6 +1304,22 @@  static int vfio_user_io_get_region_info(VFIODevice *vbasedev,
     return 0;
 }
 
+static int vfio_user_io_region_read(VFIODevice *vbasedev, uint8_t index,
+                                    off_t off, uint32_t size, void *data)
+{
+    return vfio_user_region_read(vbasedev->proxy, index, off, size, data);
+}
+
+static int vfio_user_io_region_write(VFIODevice *vbasedev, uint8_t index,
+                                     off_t off, unsigned size, void *data,
+                                     bool post)
+{
+    return vfio_user_region_write(vbasedev->proxy, index, off, size, data,
+                                  post);
+}
+
 VFIODeviceIO vfio_dev_io_sock = {
     .get_region_info = vfio_user_io_get_region_info,
+    .region_read = vfio_user_io_region_read,
+    .region_write = vfio_user_io_region_write,
 };
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 939113a..1f3688f 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -175,3 +175,4 @@  vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x"
 vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d minor %d caps: %s"
 vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d"
 vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index %d flags 0x%x size 0x%"PRIx64
+vfio_user_region_rw(uint32_t region, uint64_t off, uint32_t count) " region %d offset 0x%"PRIx64" count %d"