
[RFC,v2,09/16] vfio-user: region read/write

Message ID 92fb16181e71a1c4049f9995294dbd7ff4270627.1629131628.git.elena.ufimtseva@oracle.com
State New, archived
Series vfio-user implementation

Commit Message

Elena Ufimtseva Aug. 16, 2021, 4:42 p.m. UTC
From: John Johnson <john.g.johnson@oracle.com>

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 hw/vfio/user-protocol.h | 12 ++++++++++++
 hw/vfio/user.h          |  4 ++++
 hw/vfio/common.c        | 16 +++++++++++++--
 hw/vfio/user.c          | 43 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 2 deletions(-)

Comments

Stefan Hajnoczi Sept. 7, 2021, 2:41 p.m. UTC | #1
On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 7d667b0533..a8b1ea9358 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          uint32_t dword;
>          uint64_t qword;
>      } buf;
> +    int ret;
>  
>      switch (size) {
>      case 1:
> @@ -234,7 +235,12 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          break;
>      }
>  
> -    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> +    if (vbasedev->proxy != NULL) {
> +        ret = vfio_user_region_write(vbasedev, region->nr, addr, size, &data);
> +    } else {
> +        ret = pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr);
> +    }

The vfio-user spec says everything is little-endian. Why does
vfio_user_region_write() take the host-endian uint64_t data value
instead of the little-endian buf value?
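
For illustration, a minimal sketch of the alternative being asked about: forward the already-byte-swapped buf rather than the host-endian data, assuming the switch above fills buf via cpu_to_le16/32/64() as the existing vfio_region_write() does, and that vfio_user_region_write() copies whatever buffer it is given straight onto the wire:

    /* Sketch only: send the little-endian buf so the wire payload matches
     * the spec, instead of the host-endian data. */
    if (vbasedev->proxy != NULL) {
        ret = vfio_user_region_write(vbasedev, region->nr, addr, size, &buf);
    } else {
        ret = pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr);
    }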

> +    if (ret != size) {
>          error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
>                       ",%d) failed: %m",
>                       __func__, vbasedev->name, region->nr,
> @@ -266,8 +272,14 @@ uint64_t vfio_region_read(void *opaque,
>          uint64_t qword;
>      } buf;
>      uint64_t data = 0;
> +    int ret;
>  
> -    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> +    if (vbasedev->proxy != NULL) {
> +        ret = vfio_user_region_read(vbasedev, region->nr, addr, size, &buf);
> +    } else {
> +        ret = pread(vbasedev->fd, &buf, size, region->fd_offset + addr);
> +    }
> +    if (ret != size) {
>          error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
>                       __func__, vbasedev->name, region->nr,
>                       addr, size);
> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
> index 91b51f37df..83235b2411 100644
> --- a/hw/vfio/user.c
> +++ b/hw/vfio/user.c
> @@ -767,3 +767,46 @@ int vfio_user_get_region_info(VFIODevice *vbasedev, int index,
>      memcpy(info, &msgp->argsz, info->argsz);
>      return 0;
>  }
> +
> +int vfio_user_region_read(VFIODevice *vbasedev, uint32_t index, uint64_t offset,
> +                                 uint32_t count, void *data)
> +{
> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> +    int size = sizeof(*msgp) + count;
> +
> +    msgp = g_malloc0(size);
> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0);
> +    msgp->offset = offset;
> +    msgp->region = index;
> +    msgp->count = count;
> +
> +    vfio_user_send_recv(vbasedev->proxy, &msgp->hdr, NULL, size, 0);
> +    if (msgp->hdr.flags & VFIO_USER_ERROR) {
> +        return -msgp->hdr.error_reply;
> +    } else if (msgp->count > count) {
> +        return -E2BIG;
> +    } else {
> +        memcpy(data, &msgp->data, msgp->count);
> +    }
> +
> +    return msgp->count;
> +}
> +
> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> +                           uint64_t offset, uint32_t count, void *data)
> +{
> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> +    int size = sizeof(*msgp) + count;
> +
> +    msgp = g_malloc0(size);
> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> +                          VFIO_USER_NO_REPLY);
> +    msgp->offset = offset;
> +    msgp->region = index;
> +    msgp->count = count;
> +    memcpy(&msgp->data, data, count);
> +
> +    vfio_user_send(vbasedev->proxy, &msgp->hdr, NULL);

Are VFIO region writes posted writes (VFIO_USER_NO_REPLY)? This can be a
problem if the device driver performs a write to the region followed by
another access (e.g. to an mmap region) and expects the write to
complete before the second access takes place.
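
As a sketch of the non-posted alternative (the name vfio_user_region_write_sync is hypothetical, and the expected-reply-size argument to vfio_user_send_recv() is a guess), the write would drop VFIO_USER_NO_REPLY and wait for the server's reply, mirroring vfio_user_region_read():

    /* Sketch: wait for the server's ack instead of posting the write. */
    static int vfio_user_region_write_sync(VFIODevice *vbasedev, uint32_t index,
                                           uint64_t offset, uint32_t count,
                                           void *data)
    {
        g_autofree VFIOUserRegionRW *msgp = NULL;
        int size = sizeof(*msgp) + count;

        msgp = g_malloc0(size);
        vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size, 0);
        msgp->offset = offset;
        msgp->region = index;
        msgp->count = count;
        memcpy(&msgp->data, data, count);

        /* Blocks until the reply arrives; 0 as the expected reply size is a guess. */
        vfio_user_send_recv(vbasedev->proxy, &msgp->hdr, NULL, 0, 0);
        if (msgp->hdr.flags & VFIO_USER_ERROR) {
            return -msgp->hdr.error_reply;
        }
        return count;
    }
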
John Levon Sept. 7, 2021, 5:24 p.m. UTC | #2
On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:

> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> +                           uint64_t offset, uint32_t count, void *data)
> +{
> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> +    int size = sizeof(*msgp) + count;
> +
> +    msgp = g_malloc0(size);
> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> +                          VFIO_USER_NO_REPLY);

Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:

Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
meaning essentially all writes are posted. But that shouldn't be the case, for
example for PCI config space, where it's expected that writes will wait for an
ack before the VCPU continues.

regards
john
John Johnson Sept. 9, 2021, 6 a.m. UTC | #3
> On Sep 7, 2021, at 10:24 AM, John Levon <john.levon@nutanix.com> wrote:
> 
> On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
> 
>> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
>> +                           uint64_t offset, uint32_t count, void *data)
>> +{
>> +    g_autofree VFIOUserRegionRW *msgp = NULL;
>> +    int size = sizeof(*msgp) + count;
>> +
>> +    msgp = g_malloc0(size);
>> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
>> +                          VFIO_USER_NO_REPLY);
> 
> Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
> 
> Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
> meaning essentially all writes are posted. But that shouldn't be the case, for
> example for PCI config space, where it's expected that writes will wait for an
> ack before the VCPU continues.
> 

	I’m not sure following the PCI spec (mem writes posted, config & IO
are not) completely solves the issue if the device uses sparse mmap.  A store
that went over the socket can be passed by a load that goes directly to memory,
which could break a driver that assumes a load completion means older stores
to the same device have also completed.

								JJ
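
To make that concrete, a hypothetical guest-driver sequence (register names and layout invented for illustration):

    u32 status;

    /* REG_CTRL is outside any sparse-mmap area: the store goes over the
     * socket and, with VFIO_USER_NO_REPLY, is posted. */
    writel(DMA_START, dev->regs + REG_CTRL);

    /* REG_STATUS sits in a sparse-mmap area: the load is satisfied directly
     * from memory and can complete before the emulation program has
     * consumed the posted store. */
    status = readl(dev->regs + REG_STATUS);
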
John Levon Sept. 9, 2021, 12:05 p.m. UTC | #4
On Thu, Sep 09, 2021 at 06:00:36AM +0000, John Johnson wrote:

> > On Sep 7, 2021, at 10:24 AM, John Levon <john.levon@nutanix.com> wrote:
> > 
> > On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
> > 
> >> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> >> +                           uint64_t offset, uint32_t count, void *data)
> >> +{
> >> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> >> +    int size = sizeof(*msgp) + count;
> >> +
> >> +    msgp = g_malloc0(size);
> >> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> >> +                          VFIO_USER_NO_REPLY);
> > 
> > Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
> > 
> > Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
> > meaning essentially all writes are posted. But that shouldn't be the case, for
> > example for PCI config space, where it's expected that writes will wait for an
> > ack before the VCPU continues.
> 
> 	I’m not sure following the PCI spec (mem writes posted, config & IO
> are not) completely solves the issue if the device uses sparse mmap.  A store
> that went over the socket can be passed by a load that goes directly to memory,
> which could break a driver that assumes a load completion means older stores
> to the same device have also completed.

Sure, but sparse mmaps are under the device's control - so wouldn't that be
something of a "don't do that" scenario?

regards
john
John Johnson Sept. 10, 2021, 6:07 a.m. UTC | #5
> On Sep 9, 2021, at 5:05 AM, John Levon <john.levon@nutanix.com> wrote:
> 
> On Thu, Sep 09, 2021 at 06:00:36AM +0000, John Johnson wrote:
> 
>>> On Sep 7, 2021, at 10:24 AM, John Levon <john.levon@nutanix.com> wrote:
>>> 
>>> On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
>>> 
>>>> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
>>>> +                           uint64_t offset, uint32_t count, void *data)
>>>> +{
>>>> +    g_autofree VFIOUserRegionRW *msgp = NULL;
>>>> +    int size = sizeof(*msgp) + count;
>>>> +
>>>> +    msgp = g_malloc0(size);
>>>> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
>>>> +                          VFIO_USER_NO_REPLY);
>>> 
>>> Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
>>> 
>>> Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
>>> meaning essentially all writes are posted. But that shouldn't be the case, for
>>> example for PCI config space, where it's expected that writes will wait for an
>>> ack before the VCPU continues.
>> 
>> 	I’m not sure following the PCI spec (mem writes posted, config & IO
>> are not) completely solves the issue if the device uses sparse mmap.  A store
>> that went over the socket can be passed by a load that goes directly to memory,
>> which could break a driver that assumes a load completion means older stores
>> to the same device have also completed.
> 
> Sure, but sparse mmaps are under the device's control - so wouldn't that be
> something of a "don't do that" scenario?
> 

	The sparse mmaps are under the emulation program’s control, but it
doesn’t know what registers the guest device driver is using to force stores
to complete.  The Linux device drivers doc on kernel.org just says the driver
must read from the same device.

								JJ


https://www.kernel.org/doc/Documentation/driver-api/device-io.rst

While the basic functions are defined to be synchronous with respect to
each other and ordered with respect to each other the busses the devices
sit on may themselves have asynchronicity. In particular many authors
are burned by the fact that PCI bus writes are posted asynchronously. A
driver author must issue a read from the same device to ensure that
writes have occurred in the specific cases the author cares.
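
In code, the idiom that passage describes is a read-back from the same device (register names again invented); for a vfio-user device it only restores ordering if the read-back register is itself served over the socket rather than from a sparse-mmap area:

    writel(val, dev->regs + REG_CTRL);
    (void)readl(dev->regs + REG_CTRL);   /* read from the same device to flush the posted write */
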
John Levon Sept. 10, 2021, 12:16 p.m. UTC | #6
On Fri, Sep 10, 2021 at 06:07:56AM +0000, John Johnson wrote:

> >>> On Mon, Aug 16, 2021 at 09:42:42AM -0700, Elena Ufimtseva wrote:
> >>> 
> >>>> +int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
> >>>> +                           uint64_t offset, uint32_t count, void *data)
> >>>> +{
> >>>> +    g_autofree VFIOUserRegionRW *msgp = NULL;
> >>>> +    int size = sizeof(*msgp) + count;
> >>>> +
> >>>> +    msgp = g_malloc0(size);
> >>>> +    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
> >>>> +                          VFIO_USER_NO_REPLY);
> >>> 
> >>> Mirroring https://github.com/oracle/qemu/issues/10 here for visibility:
> >>> 
> >>> Currently, vfio_user_region_write uses VFIO_USER_NO_REPLY unconditionally,
> >>> meaning essentially all writes are posted. But that shouldn't be the case, for
> >>> example for PCI config space, where it's expected that writes will wait for an
> >>> ack before the VCPU continues.
> >> 
> >> 	I’m not sure following the PCI spec (mem writes posted, config & IO
> >> are not) completely solves the issue if the device uses sparse mmap.  A store
> >> that went over the socket can be passed by a load that goes directly to memory,
> >> which could break a driver that assumes a load completion means older stores
> >> to the same device have also completed.
> > 
> > Sure, but sparse mmaps are under the device's control - so wouldn't that be
> > something of a "don't do that" scenario?
> 
> 	The sparse mmaps are under the emulation program’s control, but it
> doesn’t know what registers the guest device driver is using to force stores
> to complete.  The Linux device drivers doc on kernel.org just says the driver
> must read from the same device.

Sure, but any device where that is important wouldn't use the sparse mmaps, no?

There's no other alternative.

regards
john

Patch

diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
index 104bf4ff31..56904cf872 100644
--- a/hw/vfio/user-protocol.h
+++ b/hw/vfio/user-protocol.h
@@ -109,4 +109,16 @@  typedef struct {
     uint64_t offset;
 } VFIOUserRegionInfo;
 
+/*
+ * VFIO_USER_REGION_READ
+ * VFIO_USER_REGION_WRITE
+ */
+typedef struct {
+    VFIOUserHdr hdr;
+    uint64_t offset;
+    uint32_t region;
+    uint32_t count;
+    char data[];
+} VFIOUserRegionRW;
+
 #endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio/user.h b/hw/vfio/user.h
index f0122539ba..02f832a173 100644
--- a/hw/vfio/user.h
+++ b/hw/vfio/user.h
@@ -74,5 +74,9 @@  int vfio_user_validate_version(VFIODevice *vbasedev, Error **errp);
 int vfio_user_get_info(VFIODevice *vbasedev);
 int vfio_user_get_region_info(VFIODevice *vbasedev, int index,
                               struct vfio_region_info *info, VFIOUserFDs *fds);
+int vfio_user_region_read(VFIODevice *vbasedev, uint32_t index, uint64_t offset,
+                          uint32_t count, void *data);
+int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
+                           uint64_t offset, uint32_t count, void *data);
 
 #endif /* VFIO_USER_H */
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7d667b0533..a8b1ea9358 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -215,6 +215,7 @@  void vfio_region_write(void *opaque, hwaddr addr,
         uint32_t dword;
         uint64_t qword;
     } buf;
+    int ret;
 
     switch (size) {
     case 1:
@@ -234,7 +235,12 @@  void vfio_region_write(void *opaque, hwaddr addr,
         break;
     }
 
-    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+    if (vbasedev->proxy != NULL) {
+        ret = vfio_user_region_write(vbasedev, region->nr, addr, size, &data);
+    } else {
+        ret = pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr);
+    }
+    if (ret != size) {
         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                      ",%d) failed: %m",
                      __func__, vbasedev->name, region->nr,
@@ -266,8 +272,14 @@  uint64_t vfio_region_read(void *opaque,
         uint64_t qword;
     } buf;
     uint64_t data = 0;
+    int ret;
 
-    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+    if (vbasedev->proxy != NULL) {
+        ret = vfio_user_region_read(vbasedev, region->nr, addr, size, &buf);
+    } else {
+        ret = pread(vbasedev->fd, &buf, size, region->fd_offset + addr);
+    }
+    if (ret != size) {
         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                      __func__, vbasedev->name, region->nr,
                      addr, size);
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index 91b51f37df..83235b2411 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -767,3 +767,46 @@  int vfio_user_get_region_info(VFIODevice *vbasedev, int index,
     memcpy(info, &msgp->argsz, info->argsz);
     return 0;
 }
+
+int vfio_user_region_read(VFIODevice *vbasedev, uint32_t index, uint64_t offset,
+                                 uint32_t count, void *data)
+{
+    g_autofree VFIOUserRegionRW *msgp = NULL;
+    int size = sizeof(*msgp) + count;
+
+    msgp = g_malloc0(size);
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0);
+    msgp->offset = offset;
+    msgp->region = index;
+    msgp->count = count;
+
+    vfio_user_send_recv(vbasedev->proxy, &msgp->hdr, NULL, size, 0);
+    if (msgp->hdr.flags & VFIO_USER_ERROR) {
+        return -msgp->hdr.error_reply;
+    } else if (msgp->count > count) {
+        return -E2BIG;
+    } else {
+        memcpy(data, &msgp->data, msgp->count);
+    }
+
+    return msgp->count;
+}
+
+int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
+                           uint64_t offset, uint32_t count, void *data)
+{
+    g_autofree VFIOUserRegionRW *msgp = NULL;
+    int size = sizeof(*msgp) + count;
+
+    msgp = g_malloc0(size);
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size,
+                          VFIO_USER_NO_REPLY);
+    msgp->offset = offset;
+    msgp->region = index;
+    msgp->count = count;
+    memcpy(&msgp->data, data, count);
+
+    vfio_user_send(vbasedev->proxy, &msgp->hdr, NULL);
+
+    return count;
+}
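
Not part of the patch, but a sketch of how a caller might use the new read helper for a 32-bit register, assuming the helper hands back the raw little-endian payload (as the memcpy above suggests) and the caller converts it; the region index and offset are made up:

    uint32_t le_val, val;
    int ret;

    /* Read 4 bytes at offset 0x10 of region 0 over the vfio-user socket. */
    ret = vfio_user_region_read(vbasedev, 0, 0x10, sizeof(le_val), &le_val);
    if (ret == sizeof(le_val)) {
        val = le32_to_cpu(le_val);   /* wire format is little-endian per the spec */
    }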