diff mbox series

[for-4.13,v4,2/3] x86/passthrough: fix migration of MSI when using posted interrupts

Message ID 20191113155940.81837-3-roger.pau@citrix.com (mailing list archive)
State New, archived
Headers show
Series x86/passthrough: fix interrupt migration when using posting | expand

Commit Message

Roger Pau Monne Nov. 13, 2019, 3:59 p.m. UTC
When using posted interrupts and the guest migrates MSI from vCPUs Xen
needs to flush any pending PIRR vectors on the previous vCPU, or else
those vectors could get wrongly injected at a later point when the MSI
fields are already updated.

The usage of a fixed vCPU in lowest priority mode when using VT-d
posted interrupts is also removed, and as a result VT-d posted
interrupts are not used together with lowest priority mode and
multiple destinations. That forces vlapic_lowest_prio to be called in
order to select the destination vCPU during interrupt dispatch.

Note that PIRR is synced to IRR both in pt_irq_destroy_bind and
pt_irq_create_bind when the interrupt delivery data is being updated.

Reported-by: Joe Jin <joe.jin@oracle.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Joe Jin <joe.jin@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
---
Changes since v3:
 - In multi-destination mode make sure all destination vCPUs have PIR
   synced to IRR by using a bitmap.
 - Drop the bogus selection of a fixed vCPU when using lowest priority
   mode.

Changes since v2:
 - Also sync PIRR with IRR when using CPU posted interrupts.
 - Force the selection of a specific vCPU when using posted interrupts
   for multi-dest.
 - Change vmsi_deliver_pirq to honor dest_vcpu_id.

Changes since v1:
 - Store the vcpu id also in multi-dest mode if the interrupt is bound
   to a vcpu for posted delivery.
 - s/#if/#ifdef/.
---
 xen/arch/x86/hvm/hvm.c           |  31 ++++++++
 xen/arch/x86/hvm/vlapic.c        |  19 +++++
 xen/arch/x86/hvm/vmsi.c          |  23 ------
 xen/drivers/passthrough/io.c     | 118 ++++++++++++++-----------------
 xen/include/asm-x86/hvm/hvm.h    |   5 +-
 xen/include/asm-x86/hvm/vlapic.h |   3 +
 6 files changed, 110 insertions(+), 89 deletions(-)

Comments

Joe Jin Nov. 13, 2019, 9:22 p.m. UTC | #1
Ran same test and did not hit the issue.

Tested-by: Joe Jin <joe.jin@oracle.com>

On 11/13/19 7:59 AM, Roger Pau Monne wrote:
> When using posted interrupts and the guest migrates MSI from vCPUs Xen
> needs to flush any pending PIRR vectors on the previous vCPU, or else
> those vectors could get wrongly injected at a later point when the MSI
> fields are already updated.
> 
> The usage of a fixed vCPU in lowest priority mode when using VT-d
> posted interrupts is also removed, and as a result VT-d posted
> interrupts are not used together with lowest priority mode and
> multiple destinations. That forces vlapic_lowest_prio to be called in
> order to select the destination vCPU during interrupt dispatch.
> 
> Note that PIRR is synced to IRR both in pt_irq_destroy_bind and
> pt_irq_create_bind when the interrupt delivery data is being updated.
> 
> Reported-by: Joe Jin <joe.jin@oracle.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Cc: Joe Jin <joe.jin@oracle.com>
> Cc: Juergen Gross <jgross@suse.com>
> ---
> Changes since v3:
>  - In multi-destination mode make sure all destination vCPUs have PIR
>    synced to IRR by using a bitmap.
>  - Drop the bogus selection of a fixed vCPU when using lowest priority
>    mode.
> 
> Changes since v2:
>  - Also sync PIRR with IRR when using CPU posted interrupts.
>  - Force the selection of a specific vCPU when using posted interrupts
>    for multi-dest.
>  - Change vmsi_deliver_pirq to honor dest_vcpu_id.
> 
> Changes since v1:
>  - Store the vcpu id also in multi-dest mode if the interrupt is bound
>    to a vcpu for posted delivery.
>  - s/#if/#ifdef/.
> ---
>  xen/arch/x86/hvm/hvm.c           |  31 ++++++++
>  xen/arch/x86/hvm/vlapic.c        |  19 +++++
>  xen/arch/x86/hvm/vmsi.c          |  23 ------
>  xen/drivers/passthrough/io.c     | 118 ++++++++++++++-----------------
>  xen/include/asm-x86/hvm/hvm.h    |   5 +-
>  xen/include/asm-x86/hvm/vlapic.h |   3 +
>  6 files changed, 110 insertions(+), 89 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 06a7b40107..0e3379fa6f 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -43,6 +43,7 @@
>  #include <asm/current.h>
>  #include <asm/e820.h>
>  #include <asm/io.h>
> +#include <asm/io_apic.h>
>  #include <asm/regs.h>
>  #include <asm/cpufeature.h>
>  #include <asm/processor.h>
> @@ -5266,6 +5267,36 @@ void hvm_set_segment_register(struct vcpu *v, enum x86_segment seg,
>      alternative_vcall(hvm_funcs.set_segment_register, v, seg, reg);
>  }
>  
> +int hvm_intr_get_dests(struct domain *d, uint8_t dest, uint8_t dest_mode,
> +                       uint8_t delivery_mode, unsigned long *vcpus)
> +{
> +    struct vcpu *v;
> +
> +    switch ( delivery_mode )
> +    {
> +    case dest_LowestPrio:
> +        /*
> +         * Get all the possible destinations, but note that lowest priority
> +         * mode is only going to inject the interrupt to the vCPU running at
> +         * the least privilege level.
> +         *
> +         * Fallthrough
> +         */
> +    case dest_Fixed:
> +        for_each_vcpu ( d, v )
> +            if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) )
> +                __set_bit(v->vcpu_id, vcpus);
> +        break;
> +
> +    default:
> +        gprintk(XENLOG_WARNING, "unsupported interrupt delivery mode %u\n",
> +                delivery_mode);
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
> index 9466258d6f..9d9c6d391a 100644
> --- a/xen/arch/x86/hvm/vlapic.c
> +++ b/xen/arch/x86/hvm/vlapic.c
> @@ -112,6 +112,25 @@ static void sync_pir_to_irr(struct vcpu *v)
>          alternative_vcall(hvm_funcs.sync_pir_to_irr, v);
>  }
>  
> +void domain_sync_vlapic_pir(struct domain *d, unsigned long *vcpus)
> +{
> +    unsigned int id;
> +
> +    if ( !bitmap_weight(vcpus, d->max_vcpus) )
> +        return;
> +
> +    for ( id = find_first_bit(vcpus, d->max_vcpus);
> +          id < d->max_vcpus;
> +          id = find_next_bit(vcpus, d->max_vcpus, id + 1) )
> +    {
> +        if ( d->vcpu[id] != current )
> +            vcpu_pause(d->vcpu[id]);
> +        sync_pir_to_irr(d->vcpu[id]);
> +        if ( d->vcpu[id] != current )
> +            vcpu_unpause(d->vcpu[id]);
> +    }
> +}
> +
>  static int vlapic_find_highest_irr(struct vlapic *vlapic)
>  {
>      sync_pir_to_irr(vlapic_vcpu(vlapic));
> diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
> index 6597d9f719..66891d7d20 100644
> --- a/xen/arch/x86/hvm/vmsi.c
> +++ b/xen/arch/x86/hvm/vmsi.c
> @@ -121,29 +121,6 @@ void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *pirq_dpci)
>      vmsi_deliver(d, vector, dest, dest_mode, delivery_mode, trig_mode);
>  }
>  
> -/* Return value, -1 : multi-dests, non-negative value: dest_vcpu_id */
> -int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode)
> -{
> -    int dest_vcpu_id = -1, w = 0;
> -    struct vcpu *v;
> -    
> -    if ( d->max_vcpus == 1 )
> -        return 0;
> - 
> -    for_each_vcpu ( d, v )
> -    {
> -        if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) ) 
> -        {
> -            w++;
> -            dest_vcpu_id = v->vcpu_id;
> -        }
> -    }
> -    if ( w > 1 )
> -        return -1;
> -
> -    return dest_vcpu_id;
> -}
> -
>  /* MSI-X mask bit hypervisor interception */
>  struct msixtbl_entry
>  {
> diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
> index b292e79382..5289e89bc1 100644
> --- a/xen/drivers/passthrough/io.c
> +++ b/xen/drivers/passthrough/io.c
> @@ -219,62 +219,6 @@ void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci)
>      xfree(dpci);
>  }
>  
> -/*
> - * This routine handles lowest-priority interrupts using vector-hashing
> - * mechanism. As an example, modern Intel CPUs use this method to handle
> - * lowest-priority interrupts.
> - *
> - * Here is the details about the vector-hashing mechanism:
> - * 1. For lowest-priority interrupts, store all the possible destination
> - *    vCPUs in an array.
> - * 2. Use "gvec % max number of destination vCPUs" to find the right
> - *    destination vCPU in the array for the lowest-priority interrupt.
> - */
> -static struct vcpu *vector_hashing_dest(const struct domain *d,
> -                                        uint32_t dest_id,
> -                                        bool dest_mode,
> -                                        uint8_t gvec)
> -
> -{
> -    unsigned long *dest_vcpu_bitmap;
> -    unsigned int dest_vcpus = 0;
> -    struct vcpu *v, *dest = NULL;
> -    unsigned int i;
> -
> -    dest_vcpu_bitmap = xzalloc_array(unsigned long,
> -                                     BITS_TO_LONGS(d->max_vcpus));
> -    if ( !dest_vcpu_bitmap )
> -        return NULL;
> -
> -    for_each_vcpu ( d, v )
> -    {
> -        if ( !vlapic_match_dest(vcpu_vlapic(v), NULL, APIC_DEST_NOSHORT,
> -                                dest_id, dest_mode) )
> -            continue;
> -
> -        __set_bit(v->vcpu_id, dest_vcpu_bitmap);
> -        dest_vcpus++;
> -    }
> -
> -    if ( dest_vcpus != 0 )
> -    {
> -        unsigned int mod = gvec % dest_vcpus;
> -        unsigned int idx = 0;
> -
> -        for ( i = 0; i <= mod; i++ )
> -        {
> -            idx = find_next_bit(dest_vcpu_bitmap, d->max_vcpus, idx) + 1;
> -            BUG_ON(idx > d->max_vcpus);
> -        }
> -
> -        dest = d->vcpu[idx - 1];
> -    }
> -
> -    xfree(dest_vcpu_bitmap);
> -
> -    return dest;
> -}
> -
>  int pt_irq_create_bind(
>      struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
>  {
> @@ -345,6 +289,8 @@ int pt_irq_create_bind(
>          const struct vcpu *vcpu;
>          uint32_t gflags = pt_irq_bind->u.msi.gflags &
>                            ~XEN_DOMCTL_VMSI_X86_UNMASKED;
> +        DECLARE_BITMAP(dest_vcpus, MAX_VIRT_CPUS) = { };
> +        DECLARE_BITMAP(prev_vcpus, MAX_VIRT_CPUS) = { };
>  
>          if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
>          {
> @@ -411,6 +357,24 @@ int pt_irq_create_bind(
>  
>                  pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
>                  pirq_dpci->gmsi.gflags = gflags;
> +                if ( pirq_dpci->gmsi.dest_vcpu_id != -1 )
> +                    __set_bit(pirq_dpci->gmsi.dest_vcpu_id, prev_vcpus);
> +                else
> +                {
> +                    /*
> +                     * If previous configuration has multiple possible
> +                     * destinations record them in order to sync the PIR to IRR
> +                     * afterwards.
> +                     */
> +                    dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
> +                                     XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
> +                    dest_mode = pirq_dpci->gmsi.gflags &
> +                                XEN_DOMCTL_VMSI_X86_DM_MASK;
> +                    delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
> +                                              XEN_DOMCTL_VMSI_X86_DELIV_MASK);
> +                    hvm_intr_get_dests(d, dest, dest_mode, delivery_mode,
> +                                       prev_vcpus);
> +                }
>              }
>          }
>          /* Calculate dest_vcpu_id for MSI-type pirq migration. */
> @@ -420,20 +384,16 @@ int pt_irq_create_bind(
>          delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
>                                    XEN_DOMCTL_VMSI_X86_DELIV_MASK);
>  
> -        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
> +        hvm_intr_get_dests(d, dest, dest_mode, delivery_mode, dest_vcpus);
> +        dest_vcpu_id = bitmap_weight(dest_vcpus, d->max_vcpus) != 1 ?
> +            -1 : find_first_bit(dest_vcpus, d->max_vcpus);
>          pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
>          spin_unlock(&d->event_lock);
>  
>          pirq_dpci->gmsi.posted = false;
>          vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
> -        if ( iommu_intpost )
> -        {
> -            if ( delivery_mode == dest_LowestPrio )
> -                vcpu = vector_hashing_dest(d, dest, dest_mode,
> -                                           pirq_dpci->gmsi.gvec);
> -            if ( vcpu )
> -                pirq_dpci->gmsi.posted = true;
> -        }
> +        if ( vcpu && iommu_intpost )
> +            pirq_dpci->gmsi.posted = true;
>          if ( vcpu && is_iommu_enabled(d) )
>              hvm_migrate_pirq(pirq_dpci, vcpu);
>  
> @@ -442,6 +402,9 @@ int pt_irq_create_bind(
>              pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
>                             info, pirq_dpci->gmsi.gvec);
>  
> +        if ( hvm_funcs.deliver_posted_intr )
> +            domain_sync_vlapic_pir(d, prev_vcpus);
> +
>          if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED )
>          {
>              unsigned long flags;
> @@ -731,6 +694,31 @@ int pt_irq_destroy_bind(
>      else if ( pirq_dpci && pirq_dpci->gmsi.posted )
>          pi_update_irte(NULL, pirq, 0);
>  
> +    if ( hvm_funcs.deliver_posted_intr )
> +    {
> +        DECLARE_BITMAP(vcpus, MAX_VIRT_CPUS) = { };
> +
> +        if ( pirq_dpci->gmsi.dest_vcpu_id != -1 )
> +            __set_bit(pirq_dpci->gmsi.dest_vcpu_id, vcpus);
> +        else
> +        {
> +            /*
> +             * If previous configuration has multiple possible
> +             * destinations record them in order to sync the PIR to IRR.
> +             */
> +            uint8_t dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
> +                                     XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
> +            uint8_t dest_mode = pirq_dpci->gmsi.gflags &
> +                                XEN_DOMCTL_VMSI_X86_DM_MASK;
> +            uint8_t delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
> +                                              XEN_DOMCTL_VMSI_X86_DELIV_MASK);
> +
> +            hvm_intr_get_dests(d, dest, dest_mode, delivery_mode, vcpus);
> +        }
> +
> +        domain_sync_vlapic_pir(d, vcpus);
> +    }
> +
>      if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
>           list_empty(&pirq_dpci->digl_list) )
>      {
> diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
> index f86af09898..899665fed8 100644
> --- a/xen/include/asm-x86/hvm/hvm.h
> +++ b/xen/include/asm-x86/hvm/hvm.h
> @@ -266,7 +266,6 @@ int vmsi_deliver(
>      uint8_t delivery_mode, uint8_t trig_mode);
>  struct hvm_pirq_dpci;
>  void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *);
> -int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode);
>  
>  enum hvm_intblk
>  hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack);
> @@ -336,6 +335,10 @@ unsigned long hvm_cr4_guest_valid_bits(const struct domain *d, bool restore);
>  bool hvm_flush_vcpu_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
>                          void *ctxt);
>  
> +/* Get all the possible destination vCPUs of an interrupt. */
> +int hvm_intr_get_dests(struct domain *d, uint8_t dest, uint8_t dest_mode,
> +                       uint8_t delivery_mode, unsigned long *vcpus);
> +
>  #ifdef CONFIG_HVM
>  
>  #define hvm_get_guest_tsc(v) hvm_get_guest_tsc_fixed(v, 0)
> diff --git a/xen/include/asm-x86/hvm/vlapic.h b/xen/include/asm-x86/hvm/vlapic.h
> index dde66b4f0f..2bc2ebadf0 100644
> --- a/xen/include/asm-x86/hvm/vlapic.h
> +++ b/xen/include/asm-x86/hvm/vlapic.h
> @@ -150,4 +150,7 @@ bool_t vlapic_match_dest(
>      const struct vlapic *target, const struct vlapic *source,
>      int short_hand, uint32_t dest, bool_t dest_mode);
>  
> +/* Sync the PIR to IRR of all vlapics in the vcpus bitmap. */
> +void domain_sync_vlapic_pir(struct domain *d, unsigned long *vcpus);
> +
>  #endif /* __ASM_X86_HVM_VLAPIC_H__ */
>
Jan Beulich Nov. 14, 2019, 1:35 p.m. UTC | #2
On 13.11.2019 16:59, Roger Pau Monne wrote:
> @@ -5266,6 +5267,36 @@ void hvm_set_segment_register(struct vcpu *v, enum x86_segment seg,
>      alternative_vcall(hvm_funcs.set_segment_register, v, seg, reg);
>  }
>  
> +int hvm_intr_get_dests(struct domain *d, uint8_t dest, uint8_t dest_mode,

While for IO-APIC and MSI "dest" can't be wider than 8 bits without
virtual interrupt remapping, "intr" imo is generic enough to also
(potentially) include IPIs. I'd therefore recommend in new code to
make this uint32_t right away to cover for all current and future
cases.

Also - const for d?

> +                       uint8_t delivery_mode, unsigned long *vcpus)
> +{
> +    struct vcpu *v;
> +
> +    switch ( delivery_mode )
> +    {
> +    case dest_LowestPrio:
> +        /*
> +         * Get all the possible destinations, but note that lowest priority
> +         * mode is only going to inject the interrupt to the vCPU running at
> +         * the least privilege level.

"privilege level"? I think you mean "lowest priority" here?

> +         *
> +         * Fallthrough
> +         */
> +    case dest_Fixed:

There's not really any fall through here, and hence I think this part
of the comment would better be dropped.

> +        for_each_vcpu ( d, v )
> +            if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) )
> +                __set_bit(v->vcpu_id, vcpus);
> +        break;
> +
> +    default:
> +        gprintk(XENLOG_WARNING, "unsupported interrupt delivery mode %u\n",
> +                delivery_mode);

gdprintk() perhaps, so keep at least release builds' logs tidy
(and useful)?

> --- a/xen/arch/x86/hvm/vlapic.c
> +++ b/xen/arch/x86/hvm/vlapic.c
> @@ -112,6 +112,25 @@ static void sync_pir_to_irr(struct vcpu *v)
>          alternative_vcall(hvm_funcs.sync_pir_to_irr, v);
>  }
>  
> +void domain_sync_vlapic_pir(struct domain *d, unsigned long *vcpus)

const (twice)?

> +{
> +    unsigned int id;
> +
> +    if ( !bitmap_weight(vcpus, d->max_vcpus) )
> +        return;

Why go over the entire bitmap an extra time here? The find-first
will do exactly the same, and hence the loop body below won't be
entered in this case.

> +    for ( id = find_first_bit(vcpus, d->max_vcpus);
> +          id < d->max_vcpus;
> +          id = find_next_bit(vcpus, d->max_vcpus, id + 1) )
> +    {
> +        if ( d->vcpu[id] != current )
> +            vcpu_pause(d->vcpu[id]);

Isn't this setting us up for a deadlock if two parties come here
for the same domain, and both on a vCPU belonging to that domain
(and with the opposite one's bit set in the bitmap)? But it looks
like d would never be the current domain here - you will want to
assert and comment on this, though. At that point the comparisons
against current can then go away as well afaict.

> @@ -345,6 +289,8 @@ int pt_irq_create_bind(
>          const struct vcpu *vcpu;
>          uint32_t gflags = pt_irq_bind->u.msi.gflags &
>                            ~XEN_DOMCTL_VMSI_X86_UNMASKED;
> +        DECLARE_BITMAP(dest_vcpus, MAX_VIRT_CPUS) = { };
> +        DECLARE_BITMAP(prev_vcpus, MAX_VIRT_CPUS) = { };

This is reachable for HVM domains only, isn't it? In which case
why the much larger MAX_VIRT_CPUS (creating two unreasonably big
local variables) instead of HVM_MAX_VCPUS? However, even once
switched I'd be opposed to this - There'd be a fair chance that
the need to deal with these variables might go unnoticed once
the maximum vCPU count for HVM gets increased (which has been a
pending todo item for many years now).

> @@ -420,20 +384,16 @@ int pt_irq_create_bind(
>          delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
>                                    XEN_DOMCTL_VMSI_X86_DELIV_MASK);
>  
> -        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
> +        hvm_intr_get_dests(d, dest, dest_mode, delivery_mode, dest_vcpus);
> +        dest_vcpu_id = bitmap_weight(dest_vcpus, d->max_vcpus) != 1 ?
> +            -1 : find_first_bit(dest_vcpus, d->max_vcpus);
>          pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
>          spin_unlock(&d->event_lock);
>  
>          pirq_dpci->gmsi.posted = false;
>          vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
> -        if ( iommu_intpost )
> -        {
> -            if ( delivery_mode == dest_LowestPrio )
> -                vcpu = vector_hashing_dest(d, dest, dest_mode,
> -                                           pirq_dpci->gmsi.gvec);
> -            if ( vcpu )
> -                pirq_dpci->gmsi.posted = true;
> -        }
> +        if ( vcpu && iommu_intpost )
> +            pirq_dpci->gmsi.posted = true;

One aspect that I'm curious about: How much posting opportunity do
we lose in practice by no longer posting when the guest uses lowest
priority mode with multiple destinations?

> @@ -442,6 +402,9 @@ int pt_irq_create_bind(
>              pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
>                             info, pirq_dpci->gmsi.gvec);
>  
> +        if ( hvm_funcs.deliver_posted_intr )
> +            domain_sync_vlapic_pir(d, prev_vcpus);

Accessing hvm_funcs here looks like a layering violation. This
wants either moving into the function or (seeing the other use)
abstracting away. Seeing the conditional here (and below) I also
notice that you calculate prev_vcpus in vain when there's no
interrupt posting in use.

I guess together with the variable size issue mentioned above a
possible solution would be:
- have one bitmap hanging off of pirq_dpci->gmsi,
- have one bitmap per pCPU,
- populate the new destination bits into the per-pCPU one,
- issue the PIR->IRR sync,
- exchange the per-pCPU and per-DPCI pointers.
You could then leave the pointers at NULL when no posting is to
be used, addressing the apparent layering violation here at the
same time.

Jan
Roger Pau Monne Nov. 14, 2019, 2:42 p.m. UTC | #3
On Thu, Nov 14, 2019 at 02:35:56PM +0100, Jan Beulich wrote:
> On 13.11.2019 16:59, Roger Pau Monne wrote:
> > +    for ( id = find_first_bit(vcpus, d->max_vcpus);
> > +          id < d->max_vcpus;
> > +          id = find_next_bit(vcpus, d->max_vcpus, id + 1) )
> > +    {
> > +        if ( d->vcpu[id] != current )
> > +            vcpu_pause(d->vcpu[id]);
> 
> Isn't this setting us up for a deadlock if two parties come here
> for the same domain, and both on a vCPU belonging to that domain
> (and with the opposite one's bit set in the bitmap)? But it looks
> like d would never be the current domain here - you will want to
> assert and comment on this, though. At that point the comparisons
> against current can then go away as well afaict.

The above is true for syncs triggered by MSI changes that bounce to
QEMU and then get forwarded to Xen as DOMCTLs, but AFAICT syncs that
result from a vIO-APIC entry write (patch #3) will have v ==
current.

vIO-APIC writes however use the d->arch.hvm.irq_lock, so it's not
possible to process multiple vCPUs vIO-APIC accesses at the same time.
I'm afraid I don't know which kind of assert should be added
here. I could add a comment, but seems fragile.

> 
> > @@ -345,6 +289,8 @@ int pt_irq_create_bind(
> >          const struct vcpu *vcpu;
> >          uint32_t gflags = pt_irq_bind->u.msi.gflags &
> >                            ~XEN_DOMCTL_VMSI_X86_UNMASKED;
> > +        DECLARE_BITMAP(dest_vcpus, MAX_VIRT_CPUS) = { };
> > +        DECLARE_BITMAP(prev_vcpus, MAX_VIRT_CPUS) = { };
> 
> This is reachable for HVM domains only, isn't it? In which case
> why the much larger MAX_VIRT_CPUS (creating two unreasonably big
> local variables) instead of HVM_MAX_VCPUS? However, even once
> switched I'd be opposed to this - There'd be a fair chance that
> the need to deal with these variables might go unnoticed once
> the maximum vCPU count for HVM gets increased (which has been a
> pending todo item for many years now).

See below, after your rant about how to fix it.

> > @@ -420,20 +384,16 @@ int pt_irq_create_bind(
> >          delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
> >                                    XEN_DOMCTL_VMSI_X86_DELIV_MASK);
> >  
> > -        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
> > +        hvm_intr_get_dests(d, dest, dest_mode, delivery_mode, dest_vcpus);
> > +        dest_vcpu_id = bitmap_weight(dest_vcpus, d->max_vcpus) != 1 ?
> > +            -1 : find_first_bit(dest_vcpus, d->max_vcpus);
> >          pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
> >          spin_unlock(&d->event_lock);
> >  
> >          pirq_dpci->gmsi.posted = false;
> >          vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
> > -        if ( iommu_intpost )
> > -        {
> > -            if ( delivery_mode == dest_LowestPrio )
> > -                vcpu = vector_hashing_dest(d, dest, dest_mode,
> > -                                           pirq_dpci->gmsi.gvec);
> > -            if ( vcpu )
> > -                pirq_dpci->gmsi.posted = true;
> > -        }
> > +        if ( vcpu && iommu_intpost )
> > +            pirq_dpci->gmsi.posted = true;
> 
> One aspect that I'm curious about: How much posting opportunity do
> we lose in practice by no longer posting when the guest uses lowest
> priority mode with multiple destinations?

Linux seems to use dest_Fixed exclusively, and the same goes for
FreeBSD.

> > @@ -442,6 +402,9 @@ int pt_irq_create_bind(
> >              pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
> >                             info, pirq_dpci->gmsi.gvec);
> >  
> > +        if ( hvm_funcs.deliver_posted_intr )
> > +            domain_sync_vlapic_pir(d, prev_vcpus);
> 
> Accessing hvm_funcs here looks like a layering violation. This
> wants either moving into the function or (seeing the other use)
> abstracting away. Seeing the conditional here (and below) I also
> notice that you calculate prev_vcpus in vain when there's no
> interrupt posting in use.

I could indeed only fill prev_vcpus when posting is in use.

> I guess together with the variable size issue mentioned above a
> possible solution would be:
> - have one bitmap hanging off of pirq_dpci->gmsi,
> - have one bitmap per pCPU,
> - populate the new destination bits into the per-pCPU one,
> - issue the PIR->IRR sync,
> - exchange the per-pCPU and per-DPCI pointers.
> You could then leave the pointers at NULL when no posting is to
> be used, addressing the apparent layering violation here at the
> same time.

Right, the above option avoids having to calculate the possible
destinations twice (once on setup and once on teardown), however it
expands the size of gmsi.

While here, I've also realized that interrupts injected using
XEN_DMOP_inject_msi will also be posted, and I'm afraid there's no way
to track and flush those unless we provide a posted-flush hypercall or
some such, so that emulators can request a PIR flush when interrupts
of fully emulated devices are reconfigured.

OTOH, maybe XEN_DMOP_inject_msi should pause the vCPU, set the bit in
IRR and unpause?

Thanks, Roger.
Jan Beulich Nov. 14, 2019, 3:35 p.m. UTC | #4
On 14.11.2019 15:42, Roger Pau Monné  wrote:
> On Thu, Nov 14, 2019 at 02:35:56PM +0100, Jan Beulich wrote:
>> On 13.11.2019 16:59, Roger Pau Monne wrote:
>>> +    for ( id = find_first_bit(vcpus, d->max_vcpus);
>>> +          id < d->max_vcpus;
>>> +          id = find_next_bit(vcpus, d->max_vcpus, id + 1) )
>>> +    {
>>> +        if ( d->vcpu[id] != current )
>>> +            vcpu_pause(d->vcpu[id]);
>>
>> Isn't this setting us up for a deadlock if two parties come here
>> for the same domain, and both on a vCPU belonging to that domain
>> (and with the opposite one's bit set in the bitmap)? But it looks
>> like d would never be the current domain here - you will want to
>> assert and comment on this, though. At that point the comparisons
>> against current can then go away as well afaict.
> 
> The above is true for syncs triggered by MSI changes that bounce to
> QEMU and then get forwarded to Xen as DOMCTLs, but AFAICT syncs that
> result from a vIO-APIC entry write (patch #3) will have v ==
> current.

Ah, yes. But let's please handle the needs of the vIO-APIC code in
that other patch. That'll also ...

> vIO-APIC writes however use the d->arch.hvm.irq_lock, so it's not
> possible to process multiple vCPUs vIO-APIC accesses at the same time.
> I'm afraid I don't know which kind of assert should be added
> here. I could add a comment, but seems fragile.

... help with the assertions to add. You'd put the strict one in
here, as suggested, and then relax it there as needed. One option
would be to require callers in the context of the current domain
to acquire the lock you mention (holding of which you could assert
here, in case current->domain == d).

>>> @@ -420,20 +384,16 @@ int pt_irq_create_bind(
>>>          delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
>>>                                    XEN_DOMCTL_VMSI_X86_DELIV_MASK);
>>>  
>>> -        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
>>> +        hvm_intr_get_dests(d, dest, dest_mode, delivery_mode, dest_vcpus);
>>> +        dest_vcpu_id = bitmap_weight(dest_vcpus, d->max_vcpus) != 1 ?
>>> +            -1 : find_first_bit(dest_vcpus, d->max_vcpus);
>>>          pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
>>>          spin_unlock(&d->event_lock);
>>>  
>>>          pirq_dpci->gmsi.posted = false;
>>>          vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
>>> -        if ( iommu_intpost )
>>> -        {
>>> -            if ( delivery_mode == dest_LowestPrio )
>>> -                vcpu = vector_hashing_dest(d, dest, dest_mode,
>>> -                                           pirq_dpci->gmsi.gvec);
>>> -            if ( vcpu )
>>> -                pirq_dpci->gmsi.posted = true;
>>> -        }
>>> +        if ( vcpu && iommu_intpost )
>>> +            pirq_dpci->gmsi.posted = true;
>>
>> One aspect that I'm curious about: How much posting opportunity do
>> we lose in practice by no longer posting when the guest uses lowest
>> priority mode with multiple destinations?
> 
> Linux seems to use dest_Fixed exclusively, and the same goes for
> FreeBSD.

That's for recent Linux. Go back to 3.0 (just as an example), and
you'll see it was different back then. But yes, our decision here
would of course be influenced more by more recent guest OS versions.

>>> @@ -442,6 +402,9 @@ int pt_irq_create_bind(
>>>              pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
>>>                             info, pirq_dpci->gmsi.gvec);
>>>  
>>> +        if ( hvm_funcs.deliver_posted_intr )
>>> +            domain_sync_vlapic_pir(d, prev_vcpus);
>>
>> Accessing hvm_funcs here looks like a layering violation. This
>> wants either moving into the function or (seeing the other use)
>> abstracting away. Seeing the conditional here (and below) I also
>> notice that you calculate prev_vcpus in vain when there's no
>> interrupt posting in use.
> 
> I could indeed only fill prev_vcpus when posting is in use.
> 
>> I guess together with the variable size issue mentioned above a
>> possible solution would be:
>> - have one bitmap hanging off of pirq_dpci->gmsi,
>> - have one bitmap per pCPU,
>> - populate the new destination bits into the per-pCPU one,
>> - issue the PIR->IRR sync,
>> - exchange the per-pCPU and per-DPCI pointers.
>> You could then leave the pointers at NULL when no posting is to
>> be used, addressing the apparent layering violation here at the
>> same time.
> 
> Right, the above option avoids having to calculate the possible
> destinations twice (once on setup and once on teardown), however it
> expands the size of gmsi.

Well, size of dynamically allocated structures is clearly secondary
compared to size of objects we put on the stack. (It had bothered
me from the beginning that the multiple destination information was
simply discarded, yet not discarding it would likely have meant
allocating such bitmaps anyway.)

> While here, I've also realized that interrupts injected using
> XEN_DMOP_inject_msi will also be posted, and I'm afraid there's no way
> to track and flush those unless we provide a posted-flush hypercall or
> some such, so that emulators can request a PIR flush when interrupts
> of fully emulated devices are reconfigured.
> 
> OTOH, maybe XEN_DMOP_inject_msi should pause the vCPU, set the bit in
> IRR and unpause?

Might be an option, yes. This op is - afaict - not for production
use anyway, so ought to be fine to be slow. (The pausing wouldn't
be needed when not posting the interrupt, iiuc.) Another option
(without having looked at the code) might be to suppress posting,
if that can be suitably requested through the interface layers.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 06a7b40107..0e3379fa6f 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -43,6 +43,7 @@ 
 #include <asm/current.h>
 #include <asm/e820.h>
 #include <asm/io.h>
+#include <asm/io_apic.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -5266,6 +5267,36 @@  void hvm_set_segment_register(struct vcpu *v, enum x86_segment seg,
     alternative_vcall(hvm_funcs.set_segment_register, v, seg, reg);
 }
 
+int hvm_intr_get_dests(struct domain *d, uint8_t dest, uint8_t dest_mode,
+                       uint8_t delivery_mode, unsigned long *vcpus)
+{
+    struct vcpu *v;
+
+    switch ( delivery_mode )
+    {
+    case dest_LowestPrio:
+        /*
+         * Get all the possible destinations, but note that lowest priority
+         * mode is only going to inject the interrupt to the vCPU running at
+         * the lowest processor priority.
+         *
+         * Fallthrough
+         */
+    case dest_Fixed:
+        for_each_vcpu ( d, v )
+            if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) )
+                __set_bit(v->vcpu_id, vcpus);
+        break;
+
+    default:
+        gprintk(XENLOG_WARNING, "unsupported interrupt delivery mode %u\n",
+                delivery_mode);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
index 9466258d6f..9d9c6d391a 100644
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -112,6 +112,25 @@  static void sync_pir_to_irr(struct vcpu *v)
         alternative_vcall(hvm_funcs.sync_pir_to_irr, v);
 }
 
+void domain_sync_vlapic_pir(struct domain *d, unsigned long *vcpus)
+{
+    unsigned int id;
+
+    if ( !bitmap_weight(vcpus, d->max_vcpus) )
+        return;
+
+    for ( id = find_first_bit(vcpus, d->max_vcpus);
+          id < d->max_vcpus;
+          id = find_next_bit(vcpus, d->max_vcpus, id + 1) )
+    {
+        if ( d->vcpu[id] != current )
+            vcpu_pause(d->vcpu[id]);
+        sync_pir_to_irr(d->vcpu[id]);
+        if ( d->vcpu[id] != current )
+            vcpu_unpause(d->vcpu[id]);
+    }
+}
+
 static int vlapic_find_highest_irr(struct vlapic *vlapic)
 {
     sync_pir_to_irr(vlapic_vcpu(vlapic));
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 6597d9f719..66891d7d20 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -121,29 +121,6 @@  void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *pirq_dpci)
     vmsi_deliver(d, vector, dest, dest_mode, delivery_mode, trig_mode);
 }
 
-/* Return value, -1 : multi-dests, non-negative value: dest_vcpu_id */
-int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode)
-{
-    int dest_vcpu_id = -1, w = 0;
-    struct vcpu *v;
-    
-    if ( d->max_vcpus == 1 )
-        return 0;
- 
-    for_each_vcpu ( d, v )
-    {
-        if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) ) 
-        {
-            w++;
-            dest_vcpu_id = v->vcpu_id;
-        }
-    }
-    if ( w > 1 )
-        return -1;
-
-    return dest_vcpu_id;
-}
-
 /* MSI-X mask bit hypervisor interception */
 struct msixtbl_entry
 {
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index b292e79382..5289e89bc1 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -219,62 +219,6 @@  void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci)
     xfree(dpci);
 }
 
-/*
- * This routine handles lowest-priority interrupts using vector-hashing
- * mechanism. As an example, modern Intel CPUs use this method to handle
- * lowest-priority interrupts.
- *
- * Here is the details about the vector-hashing mechanism:
- * 1. For lowest-priority interrupts, store all the possible destination
- *    vCPUs in an array.
- * 2. Use "gvec % max number of destination vCPUs" to find the right
- *    destination vCPU in the array for the lowest-priority interrupt.
- */
-static struct vcpu *vector_hashing_dest(const struct domain *d,
-                                        uint32_t dest_id,
-                                        bool dest_mode,
-                                        uint8_t gvec)
-
-{
-    unsigned long *dest_vcpu_bitmap;
-    unsigned int dest_vcpus = 0;
-    struct vcpu *v, *dest = NULL;
-    unsigned int i;
-
-    dest_vcpu_bitmap = xzalloc_array(unsigned long,
-                                     BITS_TO_LONGS(d->max_vcpus));
-    if ( !dest_vcpu_bitmap )
-        return NULL;
-
-    for_each_vcpu ( d, v )
-    {
-        if ( !vlapic_match_dest(vcpu_vlapic(v), NULL, APIC_DEST_NOSHORT,
-                                dest_id, dest_mode) )
-            continue;
-
-        __set_bit(v->vcpu_id, dest_vcpu_bitmap);
-        dest_vcpus++;
-    }
-
-    if ( dest_vcpus != 0 )
-    {
-        unsigned int mod = gvec % dest_vcpus;
-        unsigned int idx = 0;
-
-        for ( i = 0; i <= mod; i++ )
-        {
-            idx = find_next_bit(dest_vcpu_bitmap, d->max_vcpus, idx) + 1;
-            BUG_ON(idx > d->max_vcpus);
-        }
-
-        dest = d->vcpu[idx - 1];
-    }
-
-    xfree(dest_vcpu_bitmap);
-
-    return dest;
-}
-
 int pt_irq_create_bind(
     struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
 {
@@ -345,6 +289,8 @@  int pt_irq_create_bind(
         const struct vcpu *vcpu;
         uint32_t gflags = pt_irq_bind->u.msi.gflags &
                           ~XEN_DOMCTL_VMSI_X86_UNMASKED;
+        DECLARE_BITMAP(dest_vcpus, MAX_VIRT_CPUS) = { };
+        DECLARE_BITMAP(prev_vcpus, MAX_VIRT_CPUS) = { };
 
         if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
         {
@@ -411,6 +357,24 @@  int pt_irq_create_bind(
 
                 pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
                 pirq_dpci->gmsi.gflags = gflags;
+                if ( pirq_dpci->gmsi.dest_vcpu_id != -1 )
+                    __set_bit(pirq_dpci->gmsi.dest_vcpu_id, prev_vcpus);
+                else
+                {
+                    /*
+                     * If previous configuration has multiple possible
+                     * destinations record them in order to sync the PIR to IRR
+                     * afterwards.
+                     */
+                    dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
+                                     XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
+                    dest_mode = pirq_dpci->gmsi.gflags &
+                                XEN_DOMCTL_VMSI_X86_DM_MASK;
+                    delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
+                                              XEN_DOMCTL_VMSI_X86_DELIV_MASK);
+                    hvm_intr_get_dests(d, dest, dest_mode, delivery_mode,
+                                       prev_vcpus);
+                }
             }
         }
         /* Calculate dest_vcpu_id for MSI-type pirq migration. */
@@ -420,20 +384,16 @@  int pt_irq_create_bind(
         delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
                                   XEN_DOMCTL_VMSI_X86_DELIV_MASK);
 
-        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
+        hvm_intr_get_dests(d, dest, dest_mode, delivery_mode, dest_vcpus);
+        dest_vcpu_id = bitmap_weight(dest_vcpus, d->max_vcpus) != 1 ?
+            -1 : find_first_bit(dest_vcpus, d->max_vcpus);
         pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
         spin_unlock(&d->event_lock);
 
         pirq_dpci->gmsi.posted = false;
         vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
-        if ( iommu_intpost )
-        {
-            if ( delivery_mode == dest_LowestPrio )
-                vcpu = vector_hashing_dest(d, dest, dest_mode,
-                                           pirq_dpci->gmsi.gvec);
-            if ( vcpu )
-                pirq_dpci->gmsi.posted = true;
-        }
+        if ( vcpu && iommu_intpost )
+            pirq_dpci->gmsi.posted = true;
         if ( vcpu && is_iommu_enabled(d) )
             hvm_migrate_pirq(pirq_dpci, vcpu);
 
@@ -442,6 +402,9 @@  int pt_irq_create_bind(
             pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
                            info, pirq_dpci->gmsi.gvec);
 
+        if ( hvm_funcs.deliver_posted_intr )
+            domain_sync_vlapic_pir(d, prev_vcpus);
+
         if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED )
         {
             unsigned long flags;
@@ -731,6 +694,31 @@  int pt_irq_destroy_bind(
     else if ( pirq_dpci && pirq_dpci->gmsi.posted )
         pi_update_irte(NULL, pirq, 0);
 
+    if ( hvm_funcs.deliver_posted_intr )
+    {
+        DECLARE_BITMAP(vcpus, MAX_VIRT_CPUS) = { };
+
+        if ( pirq_dpci->gmsi.dest_vcpu_id != -1 )
+            __set_bit(pirq_dpci->gmsi.dest_vcpu_id, vcpus);
+        else
+        {
+            /*
+             * If previous configuration has multiple possible
+             * destinations record them in order to sync the PIR to IRR.
+             */
+            uint8_t dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
+                                     XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
+            uint8_t dest_mode = pirq_dpci->gmsi.gflags &
+                                XEN_DOMCTL_VMSI_X86_DM_MASK;
+            uint8_t delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
+                                              XEN_DOMCTL_VMSI_X86_DELIV_MASK);
+
+            hvm_intr_get_dests(d, dest, dest_mode, delivery_mode, vcpus);
+        }
+
+        domain_sync_vlapic_pir(d, vcpus);
+    }
+
     if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
          list_empty(&pirq_dpci->digl_list) )
     {
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index f86af09898..899665fed8 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -266,7 +266,6 @@  int vmsi_deliver(
     uint8_t delivery_mode, uint8_t trig_mode);
 struct hvm_pirq_dpci;
 void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *);
-int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode);
 
 enum hvm_intblk
 hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack);
@@ -336,6 +335,10 @@  unsigned long hvm_cr4_guest_valid_bits(const struct domain *d, bool restore);
 bool hvm_flush_vcpu_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
                         void *ctxt);
 
+/* Get all the possible destination vCPUs of an interrupt. */
+int hvm_intr_get_dests(struct domain *d, uint8_t dest, uint8_t dest_mode,
+                       uint8_t delivery_mode, unsigned long *vcpus);
+
 #ifdef CONFIG_HVM
 
 #define hvm_get_guest_tsc(v) hvm_get_guest_tsc_fixed(v, 0)
diff --git a/xen/include/asm-x86/hvm/vlapic.h b/xen/include/asm-x86/hvm/vlapic.h
index dde66b4f0f..2bc2ebadf0 100644
--- a/xen/include/asm-x86/hvm/vlapic.h
+++ b/xen/include/asm-x86/hvm/vlapic.h
@@ -150,4 +150,7 @@  bool_t vlapic_match_dest(
     const struct vlapic *target, const struct vlapic *source,
     int short_hand, uint32_t dest, bool_t dest_mode);
 
+/* Sync the PIR to IRR of all vlapics in the vcpus bitmap. */
+void domain_sync_vlapic_pir(struct domain *d, unsigned long *vcpus);
+
 #endif /* __ASM_X86_HVM_VLAPIC_H__ */