diff mbox series

[v3,1/4] x86/msi: passthrough all MSI-X vector ctrl writes to device model

Message ID f799fdc6b6899fa65a07eae0d6401753f7d61ef2.1680752649.git-series.marmarek@invisiblethingslab.com (mailing list archive)
State New, archived
Headers show
Series MSI-X support with qemu in stubdomain, and other related changes | expand

Commit Message

Marek Marczykowski-Górecki April 6, 2023, 3:57 a.m. UTC
QEMU needs to know whether clearing maskbit of a vector is really
clearing, or was already cleared before. Currently Xen sends only
clearing that bit to the device model, but not setting it, so QEMU
cannot detect it. Because of that, QEMU is working around this by
checking via /dev/mem, but that isn't the proper approach. It's just a
workaround which in fact is racy.

Give all necessary information to QEMU by passing all ctrl writes,
including masking a vector.

While this commit doesn't move the whole maskbit handling to QEMU (as
discussed on xen-devel as one of the possibilities), it is a necessary
first step anyway. Including telling QEMU it will get all the required
information to do so. The actual implementation would need to include:
 - a hypercall for QEMU to control just maskbit (without (re)binding the
   interrupt again)
 - a method for QEMU to tell Xen it will actually do the work
Those are not part of this series.

Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
v3:
 - advertise changed behavior in XEN_DMOP_get_ioreq_server_info - make
   "flags" parameter IN/OUT
 - move len check back to msixtbl_write() - will be needed there anyway
   in a later patch
v2:
 - passthrough quad writes to emulator too (Jan)
 - (ab)use len==0 for write len=4 completion (Jan), but add descriptive
   #define for this magic value

Should flags on output include only "out" values (current version), or
also include those passed in by the caller unchanged?
---
 xen/arch/x86/hvm/vmsi.c        | 18 ++++++++++++++----
 xen/common/ioreq.c             |  9 +++++++--
 xen/include/public/hvm/dm_op.h | 12 ++++++++----
 3 files changed, 29 insertions(+), 10 deletions(-)

Comments

Jan Beulich April 24, 2023, 1:06 p.m. UTC | #1
On 06.04.2023 05:57, Marek Marczykowski-Górecki wrote:
> --- a/xen/arch/x86/hvm/vmsi.c
> +++ b/xen/arch/x86/hvm/vmsi.c
> @@ -272,6 +272,15 @@ out:
>      return r;
>  }
>  
> +/*
> + * This function returns X86EMUL_UNHANDLEABLE even if write is properly
> + * handled, to propagate it to the device model (so it can keep its internal
> + * state in sync).
> + * len==0 means really len==4, but as a write completion that will return
> + * X86EMUL_OKAY on successful processing. Use WRITE_LEN4_COMPLETION to make it
> + * less confusing.
> + */
> +#define WRITE_LEN4_COMPLETION 0
>  static int msixtbl_write(struct vcpu *v, unsigned long address,
>                           unsigned int len, unsigned long val)
>  {
> @@ -283,8 +292,9 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>      unsigned long flags;
>      struct irq_desc *desc;
>  
> -    if ( (len != 4 && len != 8) || (address & (len - 1)) )
> -        return r;
> +    if ( (len != 4 && len != 8 && len != WRITE_LEN4_COMPLETION) ||
> +         (len && (address & (len - 1))) )
> +        return X86EMUL_UNHANDLEABLE;
>  
>      rcu_read_lock(&msixtbl_rcu_lock);
>  
> @@ -345,7 +355,7 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>  
>  unlock:
>      spin_unlock_irqrestore(&desc->lock, flags);
> -    if ( len == 4 )
> +    if ( len == WRITE_LEN4_COMPLETION )
>          r = X86EMUL_OKAY;
>  
>  out:
> @@ -635,7 +645,7 @@ void msix_write_completion(struct vcpu *v)
>          return;
>  
>      v->arch.hvm.hvm_io.msix_unmask_address = 0;
> -    if ( msixtbl_write(v, ctrl_address, 4, 0) != X86EMUL_OKAY )
> +    if ( msixtbl_write(v, ctrl_address, WRITE_LEN4_COMPLETION, 0) != X86EMUL_OKAY )
>          gdprintk(XENLOG_WARNING, "MSI-X write completion failure\n");
>  }
>  

This part is okay with me, but ...

> --- a/xen/common/ioreq.c
> +++ b/xen/common/ioreq.c
> @@ -743,7 +743,8 @@ static int ioreq_server_destroy(struct domain *d, ioservid_t id)
>  static int ioreq_server_get_info(struct domain *d, ioservid_t id,
>                                   unsigned long *ioreq_gfn,
>                                   unsigned long *bufioreq_gfn,
> -                                 evtchn_port_t *bufioreq_port)
> +                                 evtchn_port_t *bufioreq_port,
> +                                 uint16_t *flags)
>  {
>      struct ioreq_server *s;
>      int rc;
> @@ -779,6 +780,9 @@ static int ioreq_server_get_info(struct domain *d, ioservid_t id,
>              *bufioreq_port = s->bufioreq_evtchn;
>      }
>  
> +    /* Advertise supported features/behaviors. */
> +    *flags = XEN_DMOP_all_msix_writes;
> +
>      rc = 0;
>  
>   out:
> @@ -1374,7 +1378,8 @@ int ioreq_server_dm_op(struct xen_dm_op *op, struct domain *d, bool *const_op)
>                                     NULL : (unsigned long *)&data->ioreq_gfn,
>                                     (data->flags & XEN_DMOP_no_gfns) ?
>                                     NULL : (unsigned long *)&data->bufioreq_gfn,
> -                                   &data->bufioreq_port);
> +                                   &data->bufioreq_port, &data->flags);
> +
>          break;
>      }
>  
> diff --git a/xen/include/public/hvm/dm_op.h b/xen/include/public/hvm/dm_op.h
> index acdf91693d0b..490b151c5dd7 100644
> --- a/xen/include/public/hvm/dm_op.h
> +++ b/xen/include/public/hvm/dm_op.h
> @@ -70,7 +70,9 @@ typedef struct xen_dm_op_create_ioreq_server xen_dm_op_create_ioreq_server_t;
>   * not contain XEN_DMOP_no_gfns then these pages will be made available and
>   * the frame numbers passed back in gfns <ioreq_gfn> and <bufioreq_gfn>
>   * respectively. (If the IOREQ Server is not handling buffered emulation
> - * only <ioreq_gfn> will be valid).
> + * only <ioreq_gfn> will be valid). When Xen returns XEN_DMOP_all_msix_writes
> + * flag set, it will notify the IOREQ server about all writes to MSI-X table
> + * (if it's handled by this IOREQ server), not only those clearing a mask bit.
>   *
>   * NOTE: To access the synchronous ioreq structures and buffered ioreq
>   *       ring, it is preferable to use the XENMEM_acquire_resource memory
> @@ -81,11 +83,13 @@ typedef struct xen_dm_op_create_ioreq_server xen_dm_op_create_ioreq_server_t;
>  struct xen_dm_op_get_ioreq_server_info {
>      /* IN - server id */
>      ioservid_t id;
> -    /* IN - flags */
> +    /* IN/OUT - flags */
>      uint16_t flags;
>  
> -#define _XEN_DMOP_no_gfns 0
> -#define XEN_DMOP_no_gfns (1u << _XEN_DMOP_no_gfns)
> +#define _XEN_DMOP_no_gfns         0  /* IN */
> +#define _XEN_DMOP_all_msix_writes 1  /* OUT */
> +#define XEN_DMOP_no_gfns         (1u << _XEN_DMOP_no_gfns)
> +#define XEN_DMOP_all_msix_writes (1u << _XEN_DMOP_all_msix_writes)
>  
>      /* OUT - buffered ioreq port */
>      evtchn_port_t bufioreq_port;

... this isn't: What is obtained through this hypercall is information
pertaining to a particular ioreq server. I don't think Xen behavior
should be reported there. To me this information much rather fits in
what public/features.h has / offers. And XENVER_get_features isn't
constrained in any way, so any dm ought to be able to retrieve the
information that way.

Jan
Roger Pau Monné May 3, 2023, 9:01 a.m. UTC | #2
On Thu, Apr 06, 2023 at 05:57:23AM +0200, Marek Marczykowski-Górecki wrote:
> QEMU needs to know whether clearing maskbit of a vector is really
> clearing, or was already cleared before. Currently Xen sends only
> clearing that bit to the device model, but not setting it, so QEMU
> cannot detect it. Because of that, QEMU is working around this by
> checking via /dev/mem, but that isn't the proper approach. It's just a
> workaround which in fact is racy.
> 
> Give all necessary information to QEMU by passing all ctrl writes,
> including masking a vector.
> 
> While this commit doesn't move the whole maskbit handling to QEMU (as
> discussed on xen-devel as one of the possibilities), it is a necessary
> first step anyway. Including telling QEMU it will get all the required
> information to do so. The actual implementation would need to include:
>  - a hypercall for QEMU to control just maskbit (without (re)binding the
>    interrupt again)
>  - a method for QEMU to tell Xen it will actually do the work
> Those are not part of this series.
> 
> Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
> ---
> v3:
>  - advertise changed behavior in XEN_DMOP_get_ioreq_server_info - make
>    "flags" parameter IN/OUT
>  - move len check back to msixtbl_write() - will be needed there anyway
>    in a later patch
> v2:
>  - passthrough quad writes to emulator too (Jan)
>  - (ab)use len==0 for write len=4 completion (Jan), but add descriptive
>    #define for this magic value
> 
> Should flags on output include only "out" values (current version), or
> also include those passed in by the caller unchanged?
> ---
>  xen/arch/x86/hvm/vmsi.c        | 18 ++++++++++++++----
>  xen/common/ioreq.c             |  9 +++++++--
>  xen/include/public/hvm/dm_op.h | 12 ++++++++----
>  3 files changed, 29 insertions(+), 10 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
> index 3cd4923060c8..231253a2cbd4 100644
> --- a/xen/arch/x86/hvm/vmsi.c
> +++ b/xen/arch/x86/hvm/vmsi.c
> @@ -272,6 +272,15 @@ out:
>      return r;
>  }
>  
> +/*
> + * This function returns X86EMUL_UNHANDLEABLE even if write is properly
> + * handled, to propagate it to the device model (so it can keep its internal
> + * state in sync).
> + * len==0 means really len==4, but as a write completion that will return
> + * X86EMUL_OKAY on successful processing. Use WRITE_LEN4_COMPLETION to make it
> + * less confusing.

Isn't it fine to just forward every (valid) write to the dm, and so
not introduce WRITE_LEN4_COMPLETION? (see my comment about
_msixtbl_write()).

> + */
> +#define WRITE_LEN4_COMPLETION 0
>  static int msixtbl_write(struct vcpu *v, unsigned long address,
>                           unsigned int len, unsigned long val)
>  {
> @@ -283,8 +292,9 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>      unsigned long flags;
>      struct irq_desc *desc;
>  
> -    if ( (len != 4 && len != 8) || (address & (len - 1)) )
> -        return r;
> +    if ( (len != 4 && len != 8 && len != WRITE_LEN4_COMPLETION) ||
> +         (len && (address & (len - 1))) )
> +        return X86EMUL_UNHANDLEABLE;

I think you want to just return X86EMUL_OKAY here, and ignore the
access since it's not properly sized or aligned?

>  
>      rcu_read_lock(&msixtbl_rcu_lock);
>  
> @@ -345,7 +355,7 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
>  
>  unlock:
>      spin_unlock_irqrestore(&desc->lock, flags);
> -    if ( len == 4 )
> +    if ( len == WRITE_LEN4_COMPLETION )
>          r = X86EMUL_OKAY;
>  
>  out:
> @@ -635,7 +645,7 @@ void msix_write_completion(struct vcpu *v)
>          return;
>  
>      v->arch.hvm.hvm_io.msix_unmask_address = 0;
> -    if ( msixtbl_write(v, ctrl_address, 4, 0) != X86EMUL_OKAY )
> +    if ( msixtbl_write(v, ctrl_address, WRITE_LEN4_COMPLETION, 0) != X86EMUL_OKAY )
>          gdprintk(XENLOG_WARNING, "MSI-X write completion failure\n");

Would it be possible to always return X86EMUL_UNHANDLEABLE from
_msixtbl_write() and keep the return values of msixtbl_write()
as-is?

>  }
>  
> diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c
> index ecb8f545e1c4..bd6f074c1e85 100644
> --- a/xen/common/ioreq.c
> +++ b/xen/common/ioreq.c
> @@ -743,7 +743,8 @@ static int ioreq_server_destroy(struct domain *d, ioservid_t id)
>  static int ioreq_server_get_info(struct domain *d, ioservid_t id,
>                                   unsigned long *ioreq_gfn,
>                                   unsigned long *bufioreq_gfn,
> -                                 evtchn_port_t *bufioreq_port)
> +                                 evtchn_port_t *bufioreq_port,
> +                                 uint16_t *flags)
>  {
>      struct ioreq_server *s;
>      int rc;
> @@ -779,6 +780,9 @@ static int ioreq_server_get_info(struct domain *d, ioservid_t id,
>              *bufioreq_port = s->bufioreq_evtchn;
>      }
>  
> +    /* Advertise supported features/behaviors. */
> +    *flags = XEN_DMOP_all_msix_writes;
> +
>      rc = 0;
>  
>   out:
> @@ -1374,7 +1378,8 @@ int ioreq_server_dm_op(struct xen_dm_op *op, struct domain *d, bool *const_op)
>                                     NULL : (unsigned long *)&data->ioreq_gfn,
>                                     (data->flags & XEN_DMOP_no_gfns) ?
>                                     NULL : (unsigned long *)&data->bufioreq_gfn,
> -                                   &data->bufioreq_port);
> +                                   &data->bufioreq_port, &data->flags);
> +
>          break;
>      }
>  
> diff --git a/xen/include/public/hvm/dm_op.h b/xen/include/public/hvm/dm_op.h
> index acdf91693d0b..490b151c5dd7 100644
> --- a/xen/include/public/hvm/dm_op.h
> +++ b/xen/include/public/hvm/dm_op.h
> @@ -70,7 +70,9 @@ typedef struct xen_dm_op_create_ioreq_server xen_dm_op_create_ioreq_server_t;
>   * not contain XEN_DMOP_no_gfns then these pages will be made available and
>   * the frame numbers passed back in gfns <ioreq_gfn> and <bufioreq_gfn>
>   * respectively. (If the IOREQ Server is not handling buffered emulation
> - * only <ioreq_gfn> will be valid).
> + * only <ioreq_gfn> will be valid). When Xen returns XEN_DMOP_all_msix_writes
> + * flag set, it will notify the IOREQ server about all writes to MSI-X table
> + * (if it's handled by this IOREQ server), not only those clearing a mask bit.
>   *
>   * NOTE: To access the synchronous ioreq structures and buffered ioreq
>   *       ring, it is preferable to use the XENMEM_acquire_resource memory
> @@ -81,11 +83,13 @@ typedef struct xen_dm_op_create_ioreq_server xen_dm_op_create_ioreq_server_t;
>  struct xen_dm_op_get_ioreq_server_info {
>      /* IN - server id */
>      ioservid_t id;
> -    /* IN - flags */
> +    /* IN/OUT - flags */
>      uint16_t flags;
>  
> -#define _XEN_DMOP_no_gfns 0
> -#define XEN_DMOP_no_gfns (1u << _XEN_DMOP_no_gfns)
> +#define _XEN_DMOP_no_gfns         0  /* IN */
> +#define _XEN_DMOP_all_msix_writes 1  /* OUT */
> +#define XEN_DMOP_no_gfns         (1u << _XEN_DMOP_no_gfns)
> +#define XEN_DMOP_all_msix_writes (1u << _XEN_DMOP_all_msix_writes)

FWIW, we usually interleave _XEN_DMOP_no_gfns and XEN_DMOP_no_gfns,
ie:

#define _XEN_DMOP_no_gfns         0  /* IN */
#define XEN_DMOP_no_gfns          (1u << _XEN_DMOP_no_gfns)
#define _XEN_DMOP_all_msix_writes 1  /* OUT */
#define XEN_DMOP_all_msix_writes  (1u << _XEN_DMOP_all_msix_writes)

I wonder whether XEN_DMOP_all_msix_writes should be a feature
requested by the dm, as to not change the existing behaviour of how
MSIX writes are handled (which might work for QEMU, but could cause
issues with other out of tree users of ioreqs)?

That would turn XEN_DMOP_all_msix_writes into an IN flag also.

Thanks, Roger.
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 3cd4923060c8..231253a2cbd4 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -272,6 +272,15 @@  out:
     return r;
 }
 
+/*
+ * This function returns X86EMUL_UNHANDLEABLE even if write is properly
+ * handled, to propagate it to the device model (so it can keep its internal
+ * state in sync).
+ * len==0 means really len==4, but as a write completion that will return
+ * X86EMUL_OKAY on successful processing. Use WRITE_LEN4_COMPLETION to make it
+ * less confusing.
+ */
+#define WRITE_LEN4_COMPLETION 0
 static int msixtbl_write(struct vcpu *v, unsigned long address,
                          unsigned int len, unsigned long val)
 {
@@ -283,8 +292,9 @@  static int msixtbl_write(struct vcpu *v, unsigned long address,
     unsigned long flags;
     struct irq_desc *desc;
 
-    if ( (len != 4 && len != 8) || (address & (len - 1)) )
-        return r;
+    if ( (len != 4 && len != 8 && len != WRITE_LEN4_COMPLETION) ||
+         (len && (address & (len - 1))) )
+        return X86EMUL_UNHANDLEABLE;
 
     rcu_read_lock(&msixtbl_rcu_lock);
 
@@ -345,7 +355,7 @@  static int msixtbl_write(struct vcpu *v, unsigned long address,
 
 unlock:
     spin_unlock_irqrestore(&desc->lock, flags);
-    if ( len == 4 )
+    if ( len == WRITE_LEN4_COMPLETION )
         r = X86EMUL_OKAY;
 
 out:
@@ -635,7 +645,7 @@  void msix_write_completion(struct vcpu *v)
         return;
 
     v->arch.hvm.hvm_io.msix_unmask_address = 0;
-    if ( msixtbl_write(v, ctrl_address, 4, 0) != X86EMUL_OKAY )
+    if ( msixtbl_write(v, ctrl_address, WRITE_LEN4_COMPLETION, 0) != X86EMUL_OKAY )
         gdprintk(XENLOG_WARNING, "MSI-X write completion failure\n");
 }
 
diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c
index ecb8f545e1c4..bd6f074c1e85 100644
--- a/xen/common/ioreq.c
+++ b/xen/common/ioreq.c
@@ -743,7 +743,8 @@  static int ioreq_server_destroy(struct domain *d, ioservid_t id)
 static int ioreq_server_get_info(struct domain *d, ioservid_t id,
                                  unsigned long *ioreq_gfn,
                                  unsigned long *bufioreq_gfn,
-                                 evtchn_port_t *bufioreq_port)
+                                 evtchn_port_t *bufioreq_port,
+                                 uint16_t *flags)
 {
     struct ioreq_server *s;
     int rc;
@@ -779,6 +780,9 @@  static int ioreq_server_get_info(struct domain *d, ioservid_t id,
             *bufioreq_port = s->bufioreq_evtchn;
     }
 
+    /* Advertise supported features/behaviors. */
+    *flags = XEN_DMOP_all_msix_writes;
+
     rc = 0;
 
  out:
@@ -1374,7 +1378,8 @@  int ioreq_server_dm_op(struct xen_dm_op *op, struct domain *d, bool *const_op)
                                    NULL : (unsigned long *)&data->ioreq_gfn,
                                    (data->flags & XEN_DMOP_no_gfns) ?
                                    NULL : (unsigned long *)&data->bufioreq_gfn,
-                                   &data->bufioreq_port);
+                                   &data->bufioreq_port, &data->flags);
+
         break;
     }
 
diff --git a/xen/include/public/hvm/dm_op.h b/xen/include/public/hvm/dm_op.h
index acdf91693d0b..490b151c5dd7 100644
--- a/xen/include/public/hvm/dm_op.h
+++ b/xen/include/public/hvm/dm_op.h
@@ -70,7 +70,9 @@  typedef struct xen_dm_op_create_ioreq_server xen_dm_op_create_ioreq_server_t;
  * not contain XEN_DMOP_no_gfns then these pages will be made available and
  * the frame numbers passed back in gfns <ioreq_gfn> and <bufioreq_gfn>
  * respectively. (If the IOREQ Server is not handling buffered emulation
- * only <ioreq_gfn> will be valid).
+ * only <ioreq_gfn> will be valid). When Xen returns XEN_DMOP_all_msix_writes
+ * flag set, it will notify the IOREQ server about all writes to MSI-X table
+ * (if it's handled by this IOREQ server), not only those clearing a mask bit.
  *
  * NOTE: To access the synchronous ioreq structures and buffered ioreq
  *       ring, it is preferable to use the XENMEM_acquire_resource memory
@@ -81,11 +83,13 @@  typedef struct xen_dm_op_create_ioreq_server xen_dm_op_create_ioreq_server_t;
 struct xen_dm_op_get_ioreq_server_info {
     /* IN - server id */
     ioservid_t id;
-    /* IN - flags */
+    /* IN/OUT - flags */
     uint16_t flags;
 
-#define _XEN_DMOP_no_gfns 0
-#define XEN_DMOP_no_gfns (1u << _XEN_DMOP_no_gfns)
+#define _XEN_DMOP_no_gfns         0  /* IN */
+#define _XEN_DMOP_all_msix_writes 1  /* OUT */
+#define XEN_DMOP_no_gfns         (1u << _XEN_DMOP_no_gfns)
+#define XEN_DMOP_all_msix_writes (1u << _XEN_DMOP_all_msix_writes)
 
     /* OUT - buffered ioreq port */
     evtchn_port_t bufioreq_port;