diff mbox

x86/vMSI-X: avoid missing first unmask of vectors

Message ID 5718BBB202000078000E4484@prv-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Beulich April 21, 2016, 9:38 a.m. UTC
Recent changes to Linux result in there just being a single unmask
operation prior to expecting the first interrupts to arrive. However,
we've had a chicken-and-egg problem here: Qemu invokes
xc_domain_update_msi_irq(), ultimately leading to
msixtbl_pt_register(), upon seeing that first unmask operation. Yet
for msixtbl_range() to return true (in order for msixtbl_write() to get
invoked at all) msixtbl_pt_register() must have completed.

Deal with this by snooping suitable writes in msixtbl_range() and
triggering the invocation of msix_write_completion() from
msixtbl_pt_register() when that happens in the context of a still in
progress vector control field write.

Note that the seemingly unrelated deletion of the redundant
irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
any compiler version used that the "msi_desc" local variable isn't
being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
just for consistency reasons.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
x86/vMSI-X: avoid missing first unmask of vectors

Recent changes to Linux result in there just being a single unmask
operation prior to expecting the first interrupts to arrive. However,
we've had a chicken-and-egg problem here: Qemu invokes
xc_domain_update_msi_irq(), ultimately leading to
msixtbl_pt_register(), upon seeing that first unmask operation. Yet
for msixtbl_range() to return true (in order for msixtbl_write() to get
invoked at all) msixtbl_pt_register() must have completed.

Deal with this by snooping suitable writes in msixtbl_range() and
triggering the invocation of msix_write_completion() from
msixtbl_pt_register() when that happens in the context of a still in
progress vector control field write.

Note that the seemingly unrelated deletion of the redundant
irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
any compiler version used that the "msi_desc" local variable isn't
being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
just for consistency reasons.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v,
     desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
     rcu_read_unlock(&msixtbl_rcu_lock);
 
-    return !!desc;
+    if ( desc )
+        return 1;
+
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) ==
+         PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
+    {
+        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
+
+        ASSERT(r->type == IOREQ_TYPE_COPY);
+        if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr
+             && !(r->data & PCI_MSIX_VECTOR_BITMASK) )
+            v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr;
+    }
+
+    return 0;
 }
 
 static const struct hvm_mmio_ops msixtbl_mmio_ops = {
@@ -410,9 +434,6 @@ int msixtbl_pt_register(struct domain *d
         return r;
     }
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -437,6 +458,21 @@ found:
 out:
     spin_unlock_irq(&irq_desc->lock);
     xfree(new_entry);
+
+    if ( !r )
+    {
+        struct vcpu *v;
+
+        for_each_vcpu ( d, v )
+        {
+            if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address ==
+                 (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+                  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) )
+                v->arch.hvm_vcpu.hvm_io.msix_unmask_address =
+                    v->arch.hvm_vcpu.hvm_io.msix_snoop_address;
+        }
+    }
+
     return r;
 }
 
@@ -457,9 +496,6 @@ void msixtbl_pt_unregister(struct domain
     if ( !irq_desc )
         return;
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -522,6 +558,8 @@ void msix_write_completion(struct vcpu *
 {
     unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address;
 
+    v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0;
+
     if ( !ctrl_address )
         return;
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -85,6 +85,7 @@ struct hvm_vcpu_io {
     bool_t mmio_retry;
 
     unsigned long msix_unmask_address;
+    unsigned long msix_snoop_address;
 
     const struct g2m_ioport *g2m_ioport;
 };

Comments

Jan Beulich April 21, 2016, 9:54 a.m. UTC | #1
>>> On 21.04.16 at 11:38, <JBeulich@suse.com> wrote:
> Recent changes to Linux result in there just being a single unmask
> operation prior to expecting the first interrupts to arrive. However,
> we've had a chicken-and-egg problem here: Qemu invokes
> xc_domain_update_msi_irq(), ultimately leading to
> msixtbl_pt_register(), upon seeing that first unmask operation. Yet
> for msixtbl_range() to return true (in order to msixtbl_write() to get
> invoked at all) msixtbl_pt_register() must have completed.
> 
> Deal with this by snooping suitable writes in msixtbl_range() and
> triggering the invocation of msix_write_completion() from
> msixtbl_pt_register() when that happens in the context of a still in
> progress vector control field write.
> 
> Note that the seemingly unrelated deletion of the redundant
> irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
> any compiler version used that the "msi_desc" local variable isn't
> being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
> just for consistency reasons.)
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?

One option I just now thought of might be to also look at batches
of up to 4 dword or 2 qword writes there, tracking the address of
the one that writes to the vector control field. One might even
limit this to taking into consideration only the last iteration on such
batches, on the assumption that if at all it should be that last one
which writes the field of interest. (The fundamental assumption
then would be that no OS uses REP MOVS to fill data for more than
one table entry at a time.) This would then also cover various
Windows versions.

Jan
Paul Durrant April 21, 2016, 11:33 a.m. UTC | #2
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 21 April 2016 10:38
> To: xen-devel
> Cc: Andrew Cooper; Paul Durrant; Wei Liu; Keir (Xen.org)
> Subject: [PATCH] x86/vMSI-X: avoid missing first unmask of vectors
> 
> Recent changes to Linux result in there just being a single unmask
> operation prior to expecting the first interrupts to arrive. However,
> we've had a chicken-and-egg problem here: Qemu invokes
> xc_domain_update_msi_irq(), ultimately leading to
> msixtbl_pt_register(), upon seeing that first unmask operation. Yet
> for msixtbl_range() to return true (in order to msixtbl_write() to get
> invoked at all) msixtbl_pt_register() must have completed.
> 
> Deal with this by snooping suitable writes in msixtbl_range() and
> triggering the invocation of msix_write_completion() from
> msixtbl_pt_register() when that happens in the context of a still in
> progress vector control field write.
> 
> Note that the seemingly unrelated deletion of the redundant
> irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
> any compiler version used that the "msi_desc" local variable isn't
> being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
> just for consistency reasons.)
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
> 
> --- a/xen/arch/x86/hvm/vmsi.c
> +++ b/xen/arch/x86/hvm/vmsi.c
> @@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v,
>      desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
>      rcu_read_unlock(&msixtbl_rcu_lock);
> 
> -    return !!desc;
> +    if ( desc )
> +        return 1;
> +
> +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) ==
> +         PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
> +    {
> +        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
> +

If you need to start digging into the ioreq here then I think it would be better to have vmsi register a full hvm_io_ops rather than hvm_mmio_ops. That way it gets the ioreq passed the accept method without any need to dig.

  Paul

> +        ASSERT(r->type == IOREQ_TYPE_COPY);
> +        if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr
> +             && !(r->data & PCI_MSIX_VECTOR_BITMASK) )
> +            v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr;
> +    }
> +
> +    return 0;
>  }
> 
>  static const struct hvm_mmio_ops msixtbl_mmio_ops = {
> @@ -410,9 +434,6 @@ int msixtbl_pt_register(struct domain *d
>          return r;
>      }
> 
> -    if ( !irq_desc->msi_desc )
> -        goto out;
> -
>      msi_desc = irq_desc->msi_desc;
>      if ( !msi_desc )
>          goto out;
> @@ -437,6 +458,21 @@ found:
>  out:
>      spin_unlock_irq(&irq_desc->lock);
>      xfree(new_entry);
> +
> +    if ( !r )
> +    {
> +        struct vcpu *v;
> +
> +        for_each_vcpu ( d, v )
> +        {
> +            if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address ==
> +                 (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
> +                  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) )
> +                v->arch.hvm_vcpu.hvm_io.msix_unmask_address =
> +                    v->arch.hvm_vcpu.hvm_io.msix_snoop_address;
> +        }
> +    }
> +
>      return r;
>  }
> 
> @@ -457,9 +496,6 @@ void msixtbl_pt_unregister(struct domain
>      if ( !irq_desc )
>          return;
> 
> -    if ( !irq_desc->msi_desc )
> -        goto out;
> -
>      msi_desc = irq_desc->msi_desc;
>      if ( !msi_desc )
>          goto out;
> @@ -522,6 +558,8 @@ void msix_write_completion(struct vcpu *
>  {
>      unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address;
> 
> +    v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0;
> +
>      if ( !ctrl_address )
>          return;
> 
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -85,6 +85,7 @@ struct hvm_vcpu_io {
>      bool_t mmio_retry;
> 
>      unsigned long msix_unmask_address;
> +    unsigned long msix_snoop_address;
> 
>      const struct g2m_ioport *g2m_ioport;
>  };
> 
>
Jan Beulich April 21, 2016, 11:44 a.m. UTC | #3
>>> On 21.04.16 at 13:33, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 21 April 2016 10:38
>> --- a/xen/arch/x86/hvm/vmsi.c
>> +++ b/xen/arch/x86/hvm/vmsi.c
>> @@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v,
>>      desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
>>      rcu_read_unlock(&msixtbl_rcu_lock);
>> 
>> -    return !!desc;
>> +    if ( desc )
>> +        return 1;
>> +
>> +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) ==
>> +         PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
>> +    {
>> +        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
>> +
> 
> If you need to start digging into the ioreq here then I think it would be 
> better to have vmsi register a full hvm_io_ops rather than hvm_mmio_ops. That 
> way it gets the ioreq passed the accept method without any need to dig.

I can certainly try and see how this works out, but it's relatively
clear that it'll make the patch bigger and backporting more difficult.
So maybe this would better be left as a subsequent cleanup
exercise?

Jan
Paul Durrant April 21, 2016, 11:54 a.m. UTC | #4
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 21 April 2016 12:44
> To: Paul Durrant
> Cc: Andrew Cooper; Wei Liu; xen-devel; Keir (Xen.org)
> Subject: RE: [PATCH] x86/vMSI-X: avoid missing first unmask of vectors
> 
> >>> On 21.04.16 at 13:33, <Paul.Durrant@citrix.com> wrote:
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: 21 April 2016 10:38
> >> --- a/xen/arch/x86/hvm/vmsi.c
> >> +++ b/xen/arch/x86/hvm/vmsi.c
> >> @@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v,
> >>      desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
> >>      rcu_read_unlock(&msixtbl_rcu_lock);
> >>
> >> -    return !!desc;
> >> +    if ( desc )
> >> +        return 1;
> >> +
> >> +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) ==
> >> +         PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
> >> +    {
> >> +        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
> >> +
> >
> > If you need to start digging into the ioreq here then I think it would be
> > better to have vmsi register a full hvm_io_ops rather than hvm_mmio_ops.
> That
> > way it gets the ioreq passed the accept method without any need to dig.
> 
> I can certainly try and see how this works out, but it's relatively
> clear that it'll make the patch bigger and backporting more difficult.
> So maybe this would better be left as a subsequent cleanup
> exercise?
>

True, it would make backporting more tricky so maybe a follow-up patch would be the better approach. In which case...

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> Jan
Wei Liu April 25, 2016, 1:25 p.m. UTC | #5
On Thu, Apr 21, 2016 at 03:38:26AM -0600, Jan Beulich wrote:
> Recent changes to Linux result in there just being a single unmask
> operation prior to expecting the first interrupts to arrive. However,
> we've had a chicken-and-egg problem here: Qemu invokes
> xc_domain_update_msi_irq(), ultimately leading to
> msixtbl_pt_register(), upon seeing that first unmask operation. Yet
> for msixtbl_range() to return true (in order to msixtbl_write() to get
> invoked at all) msixtbl_pt_register() must have completed.
> 
> Deal with this by snooping suitable writes in msixtbl_range() and
> triggering the invocation of msix_write_completion() from
> msixtbl_pt_register() when that happens in the context of a still in
> progress vector control field write.
> 
> Note that the seemingly unrelated deletion of the redundant
> irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
> any compiler version used that the "msi_desc" local variable isn't
> being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
> just for consistency reasons.)
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
> 

As I understand it, this should be future improvement. The patch itself
is an improvement in its own right so it can go in:

Release-acked-by: Wei Liu <wei.liu2@citrix.com>
Jan Beulich April 25, 2016, 2:12 p.m. UTC | #6
>>> On 25.04.16 at 15:25, <wei.liu2@citrix.com> wrote:
> On Thu, Apr 21, 2016 at 03:38:26AM -0600, Jan Beulich wrote:
>> Recent changes to Linux result in there just being a single unmask
>> operation prior to expecting the first interrupts to arrive. However,
>> we've had a chicken-and-egg problem here: Qemu invokes
>> xc_domain_update_msi_irq(), ultimately leading to
>> msixtbl_pt_register(), upon seeing that first unmask operation. Yet
>> for msixtbl_range() to return true (in order to msixtbl_write() to get
>> invoked at all) msixtbl_pt_register() must have completed.
>> 
>> Deal with this by snooping suitable writes in msixtbl_range() and
>> triggering the invocation of msix_write_completion() from
>> msixtbl_pt_register() when that happens in the context of a still in
>> progress vector control field write.
>> 
>> Note that the seemingly unrelated deletion of the redundant
>> irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
>> any compiler version used that the "msi_desc" local variable isn't
>> being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
>> just for consistency reasons.)
>> 
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
>> 
> 
> As I understand it, this should be future improvement.

Yes. I've actually been working on this the last couple of hours.

> The patch itself
> is an improvement in its own right so it can go in:
> 
> Release-acked-by: Wei Liu <wei.liu2@citrix.com>

Thanks, Jan
diff mbox

Patch

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -341,7 +352,21 @@  static int msixtbl_range(struct vcpu *v,
     desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
     rcu_read_unlock(&msixtbl_rcu_lock);
 
-    return !!desc;
+    if ( desc )
+        return 1;
+
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) ==
+         PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
+    {
+        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
+
+        ASSERT(r->type == IOREQ_TYPE_COPY);
+        if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr
+             && !(r->data & PCI_MSIX_VECTOR_BITMASK) )
+            v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr;
+    }
+
+    return 0;
 }
 
 static const struct hvm_mmio_ops msixtbl_mmio_ops = {
@@ -410,9 +434,6 @@  int msixtbl_pt_register(struct domain *d
         return r;
     }
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -437,6 +458,21 @@  found:
 out:
     spin_unlock_irq(&irq_desc->lock);
     xfree(new_entry);
+
+    if ( !r )
+    {
+        struct vcpu *v;
+
+        for_each_vcpu ( d, v )
+        {
+            if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address ==
+                 (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+                  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) )
+                v->arch.hvm_vcpu.hvm_io.msix_unmask_address =
+                    v->arch.hvm_vcpu.hvm_io.msix_snoop_address;
+        }
+    }
+
     return r;
 }
 
@@ -457,9 +496,6 @@  void msixtbl_pt_unregister(struct domain
     if ( !irq_desc )
         return;
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -522,6 +558,8 @@  void msix_write_completion(struct vcpu *
 {
     unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address;
 
+    v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0;
+
     if ( !ctrl_address )
         return;
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -85,6 +85,7 @@  struct hvm_vcpu_io {
     bool_t mmio_retry;
 
     unsigned long msix_unmask_address;
+    unsigned long msix_snoop_address;
 
     const struct g2m_ioport *g2m_ioport;
 };