
[v2,09/27] ARM: GICv3: introduce separate pending_irq structs for LPIs

Message ID 20170316112030.20419-10-andre.przywara@arm.com (mailing list archive)
State New, archived

Commit Message

Andre Przywara March 16, 2017, 11:20 a.m. UTC
For the same reason that allocating a struct irq_desc for each
possible LPI is not an option, having a struct pending_irq for each LPI
is also not feasible. However we actually only need those when an
interrupt is on a vCPU (or is about to be injected).
Maintain a list of those structs that we can use for the lifecycle of
a guest LPI. We allocate new entries if necessary, however reuse
pre-owned entries whenever possible.
I added some locking around this list here, however my gut feeling is
that we don't need one because this is a per-VCPU structure anyway.
If someone could confirm this, I'd be grateful.
Teach the existing VGIC functions to find the right pointer when being
given a virtual LPI number.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 xen/arch/arm/gic.c           |  3 +++
 xen/arch/arm/vgic-v3.c       |  3 +++
 xen/arch/arm/vgic.c          | 64 +++++++++++++++++++++++++++++++++++++++++---
 xen/include/asm-arm/domain.h |  2 ++
 xen/include/asm-arm/vgic.h   | 14 ++++++++++
 5 files changed, 83 insertions(+), 3 deletions(-)

Comments

Stefano Stabellini March 22, 2017, 11:44 p.m. UTC | #1
On Thu, 16 Mar 2017, Andre Przywara wrote:
> For the same reason that allocating a struct irq_desc for each
> possible LPI is not an option, having a struct pending_irq for each LPI
> is also not feasible. However we actually only need those when an
> interrupt is on a vCPU (or is about to be injected).
> Maintain a list of those structs that we can use for the lifecycle of
> a guest LPI. We allocate new entries if necessary, however reuse
> pre-owned entries whenever possible.
> I added some locking around this list here, however my gut feeling is
> that we don't need one because this is a per-VCPU structure anyway.
> If someone could confirm this, I'd be grateful.
> Teach the existing VGIC functions to find the right pointer when being
> given a virtual LPI number.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>

Still all past comments are unaddressed ;-(

It makes it very hard to continue doing reviews. I am sorry but I'll have
to stop reviewing until I see a series with my past comments addressed.
I encourage Julien to do the same.


> ---
>  xen/arch/arm/gic.c           |  3 +++
>  xen/arch/arm/vgic-v3.c       |  3 +++
>  xen/arch/arm/vgic.c          | 64 +++++++++++++++++++++++++++++++++++++++++---
>  xen/include/asm-arm/domain.h |  2 ++
>  xen/include/asm-arm/vgic.h   | 14 ++++++++++
>  5 files changed, 83 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
> index a5348f2..bd3c032 100644
> --- a/xen/arch/arm/gic.c
> +++ b/xen/arch/arm/gic.c
> @@ -509,6 +509,9 @@ static void gic_update_one_lr(struct vcpu *v, int i)
>                  struct vcpu *v_target = vgic_get_target_vcpu(v, irq);
>                  irq_set_affinity(p->desc, cpumask_of(v_target->processor));
>              }
> +            /* If this was an LPI, mark this struct as available again. */
> +            if ( is_lpi(p->irq) )
> +                p->irq = 0;
>          }
>      }
>  }
> diff --git a/xen/arch/arm/vgic-v3.c b/xen/arch/arm/vgic-v3.c
> index 1fadb00..b0653c2 100644
> --- a/xen/arch/arm/vgic-v3.c
> +++ b/xen/arch/arm/vgic-v3.c
> @@ -1426,6 +1426,9 @@ static int vgic_v3_vcpu_init(struct vcpu *v)
>      if ( v->vcpu_id == last_cpu || (v->vcpu_id == (d->max_vcpus - 1)) )
>          v->arch.vgic.flags |= VGIC_V3_RDIST_LAST;
>  
> +    spin_lock_init(&v->arch.vgic.pending_lpi_list_lock);
> +    INIT_LIST_HEAD(&v->arch.vgic.pending_lpi_list);
> +
>      return 0;
>  }
>  
> diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> index 364d5f0..e5cfa54 100644
> --- a/xen/arch/arm/vgic.c
> +++ b/xen/arch/arm/vgic.c
> @@ -30,6 +30,8 @@
>  
>  #include <asm/mmio.h>
>  #include <asm/gic.h>
> +#include <asm/gic_v3_defs.h>
> +#include <asm/gic_v3_its.h>
>  #include <asm/vgic.h>
>  
>  static inline struct vgic_irq_rank *vgic_get_rank(struct vcpu *v, int rank)
> @@ -61,7 +63,7 @@ struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq)
>      return vgic_get_rank(v, rank);
>  }
>  
> -static void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
> +void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
>  {
>      INIT_LIST_HEAD(&p->inflight);
>      INIT_LIST_HEAD(&p->lr_queue);
> @@ -244,10 +246,14 @@ struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int virq)
>  
>  static int vgic_get_virq_priority(struct vcpu *v, unsigned int virq)
>  {
> -    struct vgic_irq_rank *rank = vgic_rank_irq(v, virq);
> +    struct vgic_irq_rank *rank;
>      unsigned long flags;
>      int priority;
>  
> +    if ( is_lpi(virq) )
> +        return vgic_lpi_get_priority(v->domain, virq);
> +
> +    rank = vgic_rank_irq(v, virq);
>      vgic_lock_rank(v, rank, flags);
>      priority = rank->priority[virq & INTERRUPT_RANK_MASK];
>      vgic_unlock_rank(v, rank, flags);
> @@ -446,13 +452,63 @@ bool vgic_to_sgi(struct vcpu *v, register_t sgir, enum gic_sgi_mode irqmode,
>      return true;
>  }
>  
> +/*
> + * Holding struct pending_irq's for each possible virtual LPI in each domain
> + * requires too much Xen memory, also a malicious guest could potentially
> + * spam Xen with LPI map requests. We cannot cover those with (guest allocated)
> + * ITS memory, so we use a dynamic scheme of allocating struct pending_irq's
> + * on demand.
> + */
> +struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
> +                                   bool allocate)
> +{
> +    struct lpi_pending_irq *lpi_irq, *empty = NULL;
> +
> +    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
> +    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
> +    {
> +        if ( lpi_irq->pirq.irq == lpi )
> +        {
> +            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> +            return &lpi_irq->pirq;
> +        }
> +
> +        if ( lpi_irq->pirq.irq == 0 && !empty )
> +            empty = lpi_irq;
> +    }
> +
> +    if ( !allocate )
> +    {
> +        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> +        return NULL;
> +    }
> +
> +    if ( !empty )
> +    {
> +        empty = xzalloc(struct lpi_pending_irq);
> +        vgic_init_pending_irq(&empty->pirq, lpi);
> +        list_add_tail(&empty->entry, &v->arch.vgic.pending_lpi_list);
> +    } else
> +    {
> +        empty->pirq.status = 0;
> +        empty->pirq.irq = lpi;
> +    }
> +
> +    spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> +
> +    return &empty->pirq;
> +}
> +
>  struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
>  {
>      struct pending_irq *n;
> +
>      /* Pending irqs allocation strategy: the first vgic.nr_spis irqs
>       * are used for SPIs; the rests are used for per cpu irqs */
>      if ( irq < 32 )
>          n = &v->arch.vgic.pending_irqs[irq];
> +    else if ( is_lpi(irq) )
> +        n = lpi_to_pending(v, irq, true);
>      else
>          n = &v->domain->arch.vgic.pending_irqs[irq - 32];
>      return n;
> @@ -480,7 +536,7 @@ void vgic_clear_pending_irqs(struct vcpu *v)
>  void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
>  {
>      uint8_t priority;
> -    struct pending_irq *iter, *n = irq_to_pending(v, virq);
> +    struct pending_irq *iter, *n;
>      unsigned long flags;
>      bool running;
>  
> @@ -488,6 +544,8 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
>  
>      spin_lock_irqsave(&v->arch.vgic.lock, flags);
>  
> +    n = irq_to_pending(v, virq);
> +
>      /* vcpu offline */
>      if ( test_bit(_VPF_down, &v->pause_flags) )
>      {
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 00b9c1a..f44a84b 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -257,6 +257,8 @@ struct arch_vcpu
>          paddr_t rdist_base;
>  #define VGIC_V3_RDIST_LAST  (1 << 0)        /* last vCPU of the rdist */
>          uint8_t flags;
> +        struct list_head pending_lpi_list;
> +        spinlock_t pending_lpi_list_lock;   /* protects the pending_lpi_list */
>      } vgic;
>  
>      /* Timer registers  */
> diff --git a/xen/include/asm-arm/vgic.h b/xen/include/asm-arm/vgic.h
> index 467333c..8f1099c 100644
> --- a/xen/include/asm-arm/vgic.h
> +++ b/xen/include/asm-arm/vgic.h
> @@ -83,6 +83,12 @@ struct pending_irq
>      struct list_head lr_queue;
>  };
>  
> +struct lpi_pending_irq
> +{
> +    struct list_head entry;
> +    struct pending_irq pirq;
> +};
> +
>  #define NR_INTERRUPT_PER_RANK   32
>  #define INTERRUPT_RANK_MASK (NR_INTERRUPT_PER_RANK - 1)
>  
> @@ -296,13 +302,21 @@ extern struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int virq);
>  extern void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq);
>  extern void vgic_vcpu_inject_spi(struct domain *d, unsigned int virq);
>  extern void vgic_clear_pending_irqs(struct vcpu *v);
> +extern void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq);
>  extern struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq);
>  extern struct pending_irq *spi_to_pending(struct domain *d, unsigned int irq);
> +extern struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int irq,
> +                                          bool allocate);
>  extern struct vgic_irq_rank *vgic_rank_offset(struct vcpu *v, int b, int n, int s);
>  extern struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq);
>  extern bool vgic_emulate(struct cpu_user_regs *regs, union hsr hsr);
>  extern void vgic_disable_irqs(struct vcpu *v, uint32_t r, int n);
>  extern void vgic_enable_irqs(struct vcpu *v, uint32_t r, int n);
> +/* placeholder function until the property table gets introduced */
> +static inline int vgic_lpi_get_priority(struct domain *d, uint32_t vlpi)
> +{
> +    return 0xa;
> +}
>  extern void register_vgic_ops(struct domain *d, const struct vgic_ops *ops);
>  int vgic_v2_init(struct domain *d, int *mmio_count);
>  int vgic_v3_init(struct domain *d, int *mmio_count);
> -- 
> 2.9.0
>
Andre Przywara March 23, 2017, 8:08 p.m. UTC | #2
On 22/03/17 23:44, Stefano Stabellini wrote:
> On Thu, 16 Mar 2017, Andre Przywara wrote:
>> For the same reason that allocating a struct irq_desc for each
>> possible LPI is not an option, having a struct pending_irq for each LPI
>> is also not feasible. However we actually only need those when an
>> interrupt is on a vCPU (or is about to be injected).
>> Maintain a list of those structs that we can use for the lifecycle of
>> a guest LPI. We allocate new entries if necessary, however reuse
>> pre-owned entries whenever possible.
>> I added some locking around this list here, however my gut feeling is
>> that we don't need one because this is a per-VCPU structure anyway.
>> If someone could confirm this, I'd be grateful.
>> Teach the existing VGIC functions to find the right pointer when being
>> given a virtual LPI number.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> 
> Still all past comments are unaddressed ;-(

Sorry for that. I just noticed that the reviews for those and later emails
were not in the same thread as the other ones, so they were swamped in
my inbox and didn't show up in my TODO thread. This might be due to
races between direct email and list bounces. Interestingly this seems to
affect previous emails as well, which isn't really an excuse, but
explains my perceived rudeness in not addressing your comments (simply
because I didn't see them).
I will try to sort this mess out in the next few days and send a new
version early next week.
If you know off the top of your head any serious architectural change
requests you raised, it would be nice if you could repeat them now, so
that I can address them early enough.
I will now sort my mails by author name to find any comment I might have
missed before.

Sorry again for the mess!

Cheers,
Andre.

> It makes it very hard to continue doing reviews. I am sorry but I'll have
> to stop reviewing until I see a series with my past comments addressed.
> I encourage Julien to do the same.
> 
> 
>> ---
>>  xen/arch/arm/gic.c           |  3 +++
>>  xen/arch/arm/vgic-v3.c       |  3 +++
>>  xen/arch/arm/vgic.c          | 64 +++++++++++++++++++++++++++++++++++++++++---
>>  xen/include/asm-arm/domain.h |  2 ++
>>  xen/include/asm-arm/vgic.h   | 14 ++++++++++
>>  5 files changed, 83 insertions(+), 3 deletions(-)
>>
>> diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
>> index a5348f2..bd3c032 100644
>> --- a/xen/arch/arm/gic.c
>> +++ b/xen/arch/arm/gic.c
>> @@ -509,6 +509,9 @@ static void gic_update_one_lr(struct vcpu *v, int i)
>>                  struct vcpu *v_target = vgic_get_target_vcpu(v, irq);
>>                  irq_set_affinity(p->desc, cpumask_of(v_target->processor));
>>              }
>> +            /* If this was an LPI, mark this struct as available again. */
>> +            if ( is_lpi(p->irq) )
>> +                p->irq = 0;
>>          }
>>      }
>>  }
>> diff --git a/xen/arch/arm/vgic-v3.c b/xen/arch/arm/vgic-v3.c
>> index 1fadb00..b0653c2 100644
>> --- a/xen/arch/arm/vgic-v3.c
>> +++ b/xen/arch/arm/vgic-v3.c
>> @@ -1426,6 +1426,9 @@ static int vgic_v3_vcpu_init(struct vcpu *v)
>>      if ( v->vcpu_id == last_cpu || (v->vcpu_id == (d->max_vcpus - 1)) )
>>          v->arch.vgic.flags |= VGIC_V3_RDIST_LAST;
>>  
>> +    spin_lock_init(&v->arch.vgic.pending_lpi_list_lock);
>> +    INIT_LIST_HEAD(&v->arch.vgic.pending_lpi_list);
>> +
>>      return 0;
>>  }
>>  
>> diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
>> index 364d5f0..e5cfa54 100644
>> --- a/xen/arch/arm/vgic.c
>> +++ b/xen/arch/arm/vgic.c
>> @@ -30,6 +30,8 @@
>>  
>>  #include <asm/mmio.h>
>>  #include <asm/gic.h>
>> +#include <asm/gic_v3_defs.h>
>> +#include <asm/gic_v3_its.h>
>>  #include <asm/vgic.h>
>>  
>>  static inline struct vgic_irq_rank *vgic_get_rank(struct vcpu *v, int rank)
>> @@ -61,7 +63,7 @@ struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq)
>>      return vgic_get_rank(v, rank);
>>  }
>>  
>> -static void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
>> +void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
>>  {
>>      INIT_LIST_HEAD(&p->inflight);
>>      INIT_LIST_HEAD(&p->lr_queue);
>> @@ -244,10 +246,14 @@ struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int virq)
>>  
>>  static int vgic_get_virq_priority(struct vcpu *v, unsigned int virq)
>>  {
>> -    struct vgic_irq_rank *rank = vgic_rank_irq(v, virq);
>> +    struct vgic_irq_rank *rank;
>>      unsigned long flags;
>>      int priority;
>>  
>> +    if ( is_lpi(virq) )
>> +        return vgic_lpi_get_priority(v->domain, virq);
>> +
>> +    rank = vgic_rank_irq(v, virq);
>>      vgic_lock_rank(v, rank, flags);
>>      priority = rank->priority[virq & INTERRUPT_RANK_MASK];
>>      vgic_unlock_rank(v, rank, flags);
>> @@ -446,13 +452,63 @@ bool vgic_to_sgi(struct vcpu *v, register_t sgir, enum gic_sgi_mode irqmode,
>>      return true;
>>  }
>>  
>> +/*
>> + * Holding struct pending_irq's for each possible virtual LPI in each domain
>> + * requires too much Xen memory, also a malicious guest could potentially
>> + * spam Xen with LPI map requests. We cannot cover those with (guest allocated)
>> + * ITS memory, so we use a dynamic scheme of allocating struct pending_irq's
>> + * on demand.
>> + */
>> +struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
>> +                                   bool allocate)
>> +{
>> +    struct lpi_pending_irq *lpi_irq, *empty = NULL;
>> +
>> +    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
>> +    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
>> +    {
>> +        if ( lpi_irq->pirq.irq == lpi )
>> +        {
>> +            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>> +            return &lpi_irq->pirq;
>> +        }
>> +
>> +        if ( lpi_irq->pirq.irq == 0 && !empty )
>> +            empty = lpi_irq;
>> +    }
>> +
>> +    if ( !allocate )
>> +    {
>> +        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>> +        return NULL;
>> +    }
>> +
>> +    if ( !empty )
>> +    {
>> +        empty = xzalloc(struct lpi_pending_irq);
>> +        vgic_init_pending_irq(&empty->pirq, lpi);
>> +        list_add_tail(&empty->entry, &v->arch.vgic.pending_lpi_list);
>> +    } else
>> +    {
>> +        empty->pirq.status = 0;
>> +        empty->pirq.irq = lpi;
>> +    }
>> +
>> +    spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>> +
>> +    return &empty->pirq;
>> +}
>> +
>>  struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
>>  {
>>      struct pending_irq *n;
>> +
>>      /* Pending irqs allocation strategy: the first vgic.nr_spis irqs
>>       * are used for SPIs; the rests are used for per cpu irqs */
>>      if ( irq < 32 )
>>          n = &v->arch.vgic.pending_irqs[irq];
>> +    else if ( is_lpi(irq) )
>> +        n = lpi_to_pending(v, irq, true);
>>      else
>>          n = &v->domain->arch.vgic.pending_irqs[irq - 32];
>>      return n;
>> @@ -480,7 +536,7 @@ void vgic_clear_pending_irqs(struct vcpu *v)
>>  void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
>>  {
>>      uint8_t priority;
>> -    struct pending_irq *iter, *n = irq_to_pending(v, virq);
>> +    struct pending_irq *iter, *n;
>>      unsigned long flags;
>>      bool running;
>>  
>> @@ -488,6 +544,8 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
>>  
>>      spin_lock_irqsave(&v->arch.vgic.lock, flags);
>>  
>> +    n = irq_to_pending(v, virq);
>> +
>>      /* vcpu offline */
>>      if ( test_bit(_VPF_down, &v->pause_flags) )
>>      {
>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>> index 00b9c1a..f44a84b 100644
>> --- a/xen/include/asm-arm/domain.h
>> +++ b/xen/include/asm-arm/domain.h
>> @@ -257,6 +257,8 @@ struct arch_vcpu
>>          paddr_t rdist_base;
>>  #define VGIC_V3_RDIST_LAST  (1 << 0)        /* last vCPU of the rdist */
>>          uint8_t flags;
>> +        struct list_head pending_lpi_list;
>> +        spinlock_t pending_lpi_list_lock;   /* protects the pending_lpi_list */
>>      } vgic;
>>  
>>      /* Timer registers  */
>> diff --git a/xen/include/asm-arm/vgic.h b/xen/include/asm-arm/vgic.h
>> index 467333c..8f1099c 100644
>> --- a/xen/include/asm-arm/vgic.h
>> +++ b/xen/include/asm-arm/vgic.h
>> @@ -83,6 +83,12 @@ struct pending_irq
>>      struct list_head lr_queue;
>>  };
>>  
>> +struct lpi_pending_irq
>> +{
>> +    struct list_head entry;
>> +    struct pending_irq pirq;
>> +};
>> +
>>  #define NR_INTERRUPT_PER_RANK   32
>>  #define INTERRUPT_RANK_MASK (NR_INTERRUPT_PER_RANK - 1)
>>  
>> @@ -296,13 +302,21 @@ extern struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int virq);
>>  extern void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq);
>>  extern void vgic_vcpu_inject_spi(struct domain *d, unsigned int virq);
>>  extern void vgic_clear_pending_irqs(struct vcpu *v);
>> +extern void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq);
>>  extern struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq);
>>  extern struct pending_irq *spi_to_pending(struct domain *d, unsigned int irq);
>> +extern struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int irq,
>> +                                          bool allocate);
>>  extern struct vgic_irq_rank *vgic_rank_offset(struct vcpu *v, int b, int n, int s);
>>  extern struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq);
>>  extern bool vgic_emulate(struct cpu_user_regs *regs, union hsr hsr);
>>  extern void vgic_disable_irqs(struct vcpu *v, uint32_t r, int n);
>>  extern void vgic_enable_irqs(struct vcpu *v, uint32_t r, int n);
>> +/* placeholder function until the property table gets introduced */
>> +static inline int vgic_lpi_get_priority(struct domain *d, uint32_t vlpi)
>> +{
>> +    return 0xa;
>> +}
>>  extern void register_vgic_ops(struct domain *d, const struct vgic_ops *ops);
>>  int vgic_v2_init(struct domain *d, int *mmio_count);
>>  int vgic_v3_init(struct domain *d, int *mmio_count);
>> -- 
>> 2.9.0
>>
Julien Grall March 24, 2017, 10:59 a.m. UTC | #3
Hi Andre,

On 03/23/2017 08:08 PM, André Przywara wrote:
> On 22/03/17 23:44, Stefano Stabellini wrote:
>> On Thu, 16 Mar 2017, Andre Przywara wrote:
>>> For the same reason that allocating a struct irq_desc for each
>>> possible LPI is not an option, having a struct pending_irq for each LPI
>>> is also not feasible. However we actually only need those when an
>>> interrupt is on a vCPU (or is about to be injected).
>>> Maintain a list of those structs that we can use for the lifecycle of
>>> a guest LPI. We allocate new entries if necessary, however reuse
>>> pre-owned entries whenever possible.
>>> I added some locking around this list here, however my gut feeling is
>>> that we don't need one because this is a per-VCPU structure anyway.
>>> If someone could confirm this, I'd be grateful.
>>> Teach the existing VGIC functions to find the right pointer when being
>>> given a virtual LPI number.
>>>
>>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>>
>> Still all past comments are unaddressed ;-(
>
> Sorry for that. I just noticed that the reviews for those and later emails
> were not in the same thread as the other ones, so they were swamped in
> my inbox and didn't show up in my TODO thread. This might be due to
> races between direct email and list bounces. Interestingly this seems to
> affect previous emails as well, which isn't really an excuse, but
> explains my perceived rudeness in not addressing your comments (simply
> because I didn't see them).
> I will try to sort this mess out in the next few days and send a new
> version early next week.
> If you know off the top of your head any serious architectural change
> requests you raised, it would be nice if you could repeat them now, so
> that I can address them early enough.

I am planning to have a look at the full series today. And will try to 
point out major architectural changes.

Cheers,
Julien Grall March 24, 2017, 11:40 a.m. UTC | #4
Hi Andre

On 03/16/2017 11:20 AM, Andre Przywara wrote:
> diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> index 364d5f0..e5cfa54 100644
> --- a/xen/arch/arm/vgic.c
> +++ b/xen/arch/arm/vgic.c
> @@ -30,6 +30,8 @@
>
>  #include <asm/mmio.h>
>  #include <asm/gic.h>
> +#include <asm/gic_v3_defs.h>
> +#include <asm/gic_v3_its.h>

I really don't want to see gic_v3_* headers included in common code. Why 
do you need them?

>  #include <asm/vgic.h>
>
>  static inline struct vgic_irq_rank *vgic_get_rank(struct vcpu *v, int rank)
> @@ -61,7 +63,7 @@ struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq)
>      return vgic_get_rank(v, rank);
>  }
>
> -static void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
> +void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
>  {
>      INIT_LIST_HEAD(&p->inflight);
>      INIT_LIST_HEAD(&p->lr_queue);
> @@ -244,10 +246,14 @@ struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int virq)
>
>  static int vgic_get_virq_priority(struct vcpu *v, unsigned int virq)
>  {
> -    struct vgic_irq_rank *rank = vgic_rank_irq(v, virq);
> +    struct vgic_irq_rank *rank;
>      unsigned long flags;
>      int priority;
>
> +    if ( is_lpi(virq) )
> +        return vgic_lpi_get_priority(v->domain, virq);

This would benefit from some comments to explain why LPI pending handling
is a different path.

> +
> +    rank = vgic_rank_irq(v, virq);
>      vgic_lock_rank(v, rank, flags);
>      priority = rank->priority[virq & INTERRUPT_RANK_MASK];
>      vgic_unlock_rank(v, rank, flags);
> @@ -446,13 +452,63 @@ bool vgic_to_sgi(struct vcpu *v, register_t sgir, enum gic_sgi_mode irqmode,
>      return true;
>  }
>
> +/*
> + * Holding struct pending_irq's for each possible virtual LPI in each domain
> + * requires too much Xen memory, also a malicious guest could potentially
> + * spam Xen with LPI map requests. We cannot cover those with (guest allocated)
> + * ITS memory, so we use a dynamic scheme of allocating struct pending_irq's
> + * on demand.
> + */

I am afraid this will not prevent a guest from using too much Xen memory.
Let's imagine the guest is mapping thousands of LPIs but decides to
never handle them or is slow. You would allocate a pending_irq for each
LPI, and never release the memory.

If we use dynamic allocation, we need a way to limit the number of LPIs 
received by a guest to avoid memory exhaustion. The only idea I have is 
an artificial limit, but I don't think it is good. Any other ideas?

> +struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
> +                                   bool allocate)
> +{
> +    struct lpi_pending_irq *lpi_irq, *empty = NULL;
> +
> +    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
> +    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
> +    {
> +        if ( lpi_irq->pirq.irq == lpi )
> +        {
> +            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> +            return &lpi_irq->pirq;
> +        }
> +
> +        if ( lpi_irq->pirq.irq == 0 && !empty )
> +            empty = lpi_irq;
> +    }
> +
> +    if ( !allocate )
> +    {
> +        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> +        return NULL;
> +    }
> +
> +    if ( !empty )
> +    {
> +        empty = xzalloc(struct lpi_pending_irq);

xzalloc will return NULL if memory is exhausted. There is a general lack
of error checking within this series. Any missing error check could be a
potential target for a guest, leading to a security issue. Stefano and I
already spotted some; it does not mean we found all of them. So before
sending the next version, please go through the series and verify *every*
return.

Also, I can't find the code which releases LPIs, either in this patch or
in this series. A general rule is to have allocation and free within
the same patch. It is much easier to spot a missing free.
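
As an illustration only, a minimal sketch of the kind of check meant here
(based on the hunk above, not a definitive fix):

    if ( !empty )
    {
        empty = xzalloc(struct lpi_pending_irq);
        /* xzalloc() returns NULL on failure: bail out instead of
         * dereferencing a NULL pointer below. */
        if ( !empty )
        {
            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
            return NULL;
        }
        vgic_init_pending_irq(&empty->pirq, lpi);
        list_add_tail(&empty->entry, &v->arch.vgic.pending_lpi_list);
    }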

> +        vgic_init_pending_irq(&empty->pirq, lpi);
> +        list_add_tail(&empty->entry, &v->arch.vgic.pending_lpi_list);
> +    } else
> +    {
> +        empty->pirq.status = 0;
> +        empty->pirq.irq = lpi;
> +    }
> +
> +    spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> +
> +    return &empty->pirq;
> +}
> +
>  struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
>  {
>      struct pending_irq *n;
> +

Spurious change.

>      /* Pending irqs allocation strategy: the first vgic.nr_spis irqs
>       * are used for SPIs; the rests are used for per cpu irqs */
>      if ( irq < 32 )
>          n = &v->arch.vgic.pending_irqs[irq];
> +    else if ( is_lpi(irq) )
> +        n = lpi_to_pending(v, irq, true);
>      else
>          n = &v->domain->arch.vgic.pending_irqs[irq - 32];
>      return n;
> @@ -480,7 +536,7 @@ void vgic_clear_pending_irqs(struct vcpu *v)
>  void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
>  {
>      uint8_t priority;
> -    struct pending_irq *iter, *n = irq_to_pending(v, virq);
> +    struct pending_irq *iter, *n;
>      unsigned long flags;
>      bool running;
>
> @@ -488,6 +544,8 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
>
>      spin_lock_irqsave(&v->arch.vgic.lock, flags);
>
> +    n = irq_to_pending(v, virq);

Why did you move this code?

> +
>      /* vcpu offline */
>      if ( test_bit(_VPF_down, &v->pause_flags) )
>      {
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 00b9c1a..f44a84b 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -257,6 +257,8 @@ struct arch_vcpu
>          paddr_t rdist_base;
>  #define VGIC_V3_RDIST_LAST  (1 << 0)        /* last vCPU of the rdist */
>          uint8_t flags;
> +        struct list_head pending_lpi_list;
> +        spinlock_t pending_lpi_list_lock;   /* protects the pending_lpi_list */


It would be better to have this structure per-domain, limiting the amount
of memory allocated. Also, Stefano was suggesting to use a more
efficient data structure, such as a hashtable or a tree.

I would be ok to focus on the correctness so far, but this would need to 
be addressed before ITS is marked as stable.

>  extern struct vgic_irq_rank *vgic_rank_offset(struct vcpu *v, int b, int n, int s);
>  extern struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq);
>  extern bool vgic_emulate(struct cpu_user_regs *regs, union hsr hsr);
>  extern void vgic_disable_irqs(struct vcpu *v, uint32_t r, int n);
>  extern void vgic_enable_irqs(struct vcpu *v, uint32_t r, int n);
> +/* placeholder function until the property table gets introduced */
> +static inline int vgic_lpi_get_priority(struct domain *d, uint32_t vlpi)
> +{
> +    return 0xa;
> +}

To be fair, you can avoid this function by re-ordering the patches. As 
suggested earlier, I would introduce some bits of the vITS before. This 
would also make the series easier to read.

Also, looking at how it has been implemented later, I would prefer to see a
new callback get_priority in vgic_ops which will return the correct 
priority.

I agree this would introduce a bit more duplicated code. But it would
limit the use of is_lpi in the common code.
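
A rough sketch of that callback idea; struct vgic_ops exists, but the
member and helper names below are made up for illustration:

    struct vgic_ops {
        /* ... existing callbacks ... */
        /* return the priority of a given virtual IRQ, including LPIs */
        int (*get_irq_priority)(struct vcpu *v, unsigned int virq);
    };

    /* GICv3 flavour: handle LPIs, fall back to the rank-based lookup */
    static int vgic_v3_get_irq_priority(struct vcpu *v, unsigned int virq)
    {
        if ( is_lpi(virq) )
            return vgic_lpi_get_priority(v->domain, virq);

        /* vgic_rank_priority() is a hypothetical common helper */
        return vgic_rank_priority(v, virq);
    }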

Cheers,
Andre Przywara March 24, 2017, 3:50 p.m. UTC | #5
Hi,

On 24/03/17 11:40, Julien Grall wrote:
> Hi Andre
> 
> On 03/16/2017 11:20 AM, Andre Przywara wrote:
>> diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
>> index 364d5f0..e5cfa54 100644
>> --- a/xen/arch/arm/vgic.c
>> +++ b/xen/arch/arm/vgic.c
>> @@ -30,6 +30,8 @@
>>
>>  #include <asm/mmio.h>
>>  #include <asm/gic.h>
>> +#include <asm/gic_v3_defs.h>
>> +#include <asm/gic_v3_its.h>
> 
> I really don't want to see gic_v3_* headers included in common code. Why
> do you need them?
> 
>>  #include <asm/vgic.h>
>>
>>  static inline struct vgic_irq_rank *vgic_get_rank(struct vcpu *v, int
>> rank)
>> @@ -61,7 +63,7 @@ struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v,
>> unsigned int irq)
>>      return vgic_get_rank(v, rank);
>>  }
>>
>> -static void vgic_init_pending_irq(struct pending_irq *p, unsigned int
>> virq)
>> +void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
>>  {
>>      INIT_LIST_HEAD(&p->inflight);
>>      INIT_LIST_HEAD(&p->lr_queue);
>> @@ -244,10 +246,14 @@ struct vcpu *vgic_get_target_vcpu(struct vcpu
>> *v, unsigned int virq)
>>
>>  static int vgic_get_virq_priority(struct vcpu *v, unsigned int virq)
>>  {
>> -    struct vgic_irq_rank *rank = vgic_rank_irq(v, virq);
>> +    struct vgic_irq_rank *rank;
>>      unsigned long flags;
>>      int priority;
>>
>> +    if ( is_lpi(virq) )
>> +        return vgic_lpi_get_priority(v->domain, virq);
> 
> This would benefit from some comments to explain why LPI pending handling
> is a different path.
> 
>> +
>> +    rank = vgic_rank_irq(v, virq);
>>      vgic_lock_rank(v, rank, flags);
>>      priority = rank->priority[virq & INTERRUPT_RANK_MASK];
>>      vgic_unlock_rank(v, rank, flags);
>> @@ -446,13 +452,63 @@ bool vgic_to_sgi(struct vcpu *v, register_t
>> sgir, enum gic_sgi_mode irqmode,
>>      return true;
>>  }
>>
>> +/*
>> + * Holding struct pending_irq's for each possible virtual LPI in each
>> domain
>> + * requires too much Xen memory, also a malicious guest could
>> potentially
>> + * spam Xen with LPI map requests. We cannot cover those with (guest
>> allocated)
>> + * ITS memory, so we use a dynamic scheme of allocating struct
>> pending_irq's
>> + * on demand.
>> + */
> 
> I am afraid this will not prevent a guest from using too much Xen memory.
> Let's imagine the guest is mapping thousands of LPIs but decides to
> never handle them or is slow. You would allocate a pending_irq for each
> LPI, and never release the memory.

So what would be the alternative here? In the current GICv2/3 scheme a
pending_irq for each (SPI) interrupt is allocated up front. This
approach here tries just to be a bit smarter for LPIs, since the
expectation is that we by far don't need so many. In the worst case the
memory allocation would converge to the upper limit (number of mapped
LPIs), which would probably be used otherwise for the static allocation
scheme. Which means we would never be worse.
What actually is missing is freeing the memory upon domain
destruction, which can't be tested for Dom0.

> If we use dynamic allocation, we need a way to limit the number of LPIs
> received by a guest to avoid memory exhaustion. The only idea I have is
> an artificial limit, but I don't think it is good. Any other ideas?

I think if we are concerned about this, we shouldn't allow *DomUs* to
allocate too many LPIs, which are bounded by the DeviceID/EventID
combinations. This will be under the full control of Dom0, which will
communicate the number of MSIs (events) for each passed-through device,
so we can easily impose a policy limit upon creation.

And just to be sure: we are at most allocating one structure per event.
An MSI interrupt storm would still converge into one pending LPI
(struct), so wouldn't trigger any further allocations.
Remapping LPIs won't increase that either, since we can only assign one
LPI to a DevID/EvID pair at any given point in time. Remapping requires
the pending bit to be cleared. And the structures are not indexed by the
LPI number, but just take a free slot.

For Dom0 I don't see any good ways of limiting the number of LPIs, but
we shouldn't need any.
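
To illustrate, such a policy could boil down to a check like this at
device assignment time (fields and function purely hypothetical, not
actual Xen code):

    /* hypothetical per-domain event budget */
    static int vgic_reserve_lpi_events(struct domain *d, unsigned int nr_events)
    {
        if ( d->arch.vgic.nr_events + nr_events > d->arch.vgic.max_events )
            return -ENOSPC;     /* refuse to assign the device */

        d->arch.vgic.nr_events += nr_events;
        return 0;
    }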

>> +struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
>> +                                   bool allocate)
>> +{
>> +    struct lpi_pending_irq *lpi_irq, *empty = NULL;
>> +
>> +    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
>> +    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
>> +    {
>> +        if ( lpi_irq->pirq.irq == lpi )
>> +        {
>> +            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>> +            return &lpi_irq->pirq;
>> +        }
>> +
>> +        if ( lpi_irq->pirq.irq == 0 && !empty )
>> +            empty = lpi_irq;
>> +    }
>> +
>> +    if ( !allocate )
>> +    {
>> +        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>> +        return NULL;
>> +    }
>> +
>> +    if ( !empty )
>> +    {
>> +        empty = xzalloc(struct lpi_pending_irq);
> 
> xzalloc will return NULL if memory is exhausted. There is a general lack
> of error checking within this series. Any missing error check could be a
> potential target for a guest, leading to a security issue. Stefano and I
> already spotted some; it does not mean we found all of them. So before
> sending the next version, please go through the series and verify *every*
> return.
> 
> Also, I can't find the code which releases LPIs, either in this patch or
> in this series. A general rule is to have allocation and free within
> the same patch. It is much easier to spot a missing free.

There is no such code, on purpose. We only grow the number, but never
shrink it (to what?, where to stop?, what if we need more again?). As
said above, in the worst case this ends up at what a static allocation
would have started with from the beginning.

And as mentioned, I only skipped the "free the whole list upon domain
destruction" part.

Cheers,
Andre.

>> +        vgic_init_pending_irq(&empty->pirq, lpi);
>> +        list_add_tail(&empty->entry, &v->arch.vgic.pending_lpi_list);
>> +    } else
>> +    {
>> +        empty->pirq.status = 0;
>> +        empty->pirq.irq = lpi;
>> +    }
>> +
>> +    spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>> +
>> +    return &empty->pirq;
>> +}
>> +
>>  struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
>>  {
>>      struct pending_irq *n;
>> +
> 
> Spurious change.
> 
>>      /* Pending irqs allocation strategy: the first vgic.nr_spis irqs
>>       * are used for SPIs; the rests are used for per cpu irqs */
>>      if ( irq < 32 )
>>          n = &v->arch.vgic.pending_irqs[irq];
>> +    else if ( is_lpi(irq) )
>> +        n = lpi_to_pending(v, irq, true);
>>      else
>>          n = &v->domain->arch.vgic.pending_irqs[irq - 32];
>>      return n;
>> @@ -480,7 +536,7 @@ void vgic_clear_pending_irqs(struct vcpu *v)
>>  void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
>>  {
>>      uint8_t priority;
>> -    struct pending_irq *iter, *n = irq_to_pending(v, virq);
>> +    struct pending_irq *iter, *n;
>>      unsigned long flags;
>>      bool running;
>>
>> @@ -488,6 +544,8 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned
>> int virq)
>>
>>      spin_lock_irqsave(&v->arch.vgic.lock, flags);
>>
>> +    n = irq_to_pending(v, virq);
> 
> Why did you move this code?
> 
>> +
>>      /* vcpu offline */
>>      if ( test_bit(_VPF_down, &v->pause_flags) )
>>      {
>> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
>> index 00b9c1a..f44a84b 100644
>> --- a/xen/include/asm-arm/domain.h
>> +++ b/xen/include/asm-arm/domain.h
>> @@ -257,6 +257,8 @@ struct arch_vcpu
>>          paddr_t rdist_base;
>>  #define VGIC_V3_RDIST_LAST  (1 << 0)        /* last vCPU of the rdist */
>>          uint8_t flags;
>> +        struct list_head pending_lpi_list;
>> +        spinlock_t pending_lpi_list_lock;   /* protects the
>> pending_lpi_list */
> 
> 
> It would be better to have this structure per-domain, limiting the amount
> of memory allocated. Also, Stefano was suggesting to use a more
> efficient data structure, such as a hashtable or a tree.
> 
> I would be ok to focus on the correctness so far, but this would need to
> be addressed before ITS is marked as stable.
> 
>>  extern struct vgic_irq_rank *vgic_rank_offset(struct vcpu *v, int b,
>> int n, int s);
>>  extern struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned
>> int irq);
>>  extern bool vgic_emulate(struct cpu_user_regs *regs, union hsr hsr);
>>  extern void vgic_disable_irqs(struct vcpu *v, uint32_t r, int n);
>>  extern void vgic_enable_irqs(struct vcpu *v, uint32_t r, int n);
>> +/* placeholder function until the property table gets introduced */
>> +static inline int vgic_lpi_get_priority(struct domain *d, uint32_t vlpi)
>> +{
>> +    return 0xa;
>> +}
> 
> To be fair, you can avoid this function by re-ordering the patches. As
> suggested earlier, I would introduce some bits of the vITS before. This
> would also make the series easier to read.
> 
> Also, looking at how it has been implemented later, I would prefer to see a
> new callback get_priority in vgic_ops which will return the correct
> priority.
> 
> I agree this would introduce a bit more duplicated code. But it would
> limit the use of is_lpi in the common code.
> 
> Cheers,
>
Julien Grall March 24, 2017, 4:19 p.m. UTC | #6
Hi Andre,

On 03/24/2017 03:50 PM, Andre Przywara wrote:
> On 24/03/17 11:40, Julien Grall wrote:
>>> +/*
>>> + * Holding struct pending_irq's for each possible virtual LPI in each
>>> domain
>>> + * requires too much Xen memory, also a malicious guest could
>>> potentially
>>> + * spam Xen with LPI map requests. We cannot cover those with (guest
>>> allocated)
>>> + * ITS memory, so we use a dynamic scheme of allocating struct
>>> pending_irq's
>>> + * on demand.
>>> + */
>>
>> I am afraid this will not prevent a guest from using too much Xen memory.
>> Let's imagine the guest is mapping thousands of LPIs but decides to
>> never handle them or is slow. You would allocate a pending_irq for each
>> LPI, and never release the memory.
>
> So what would be the alternative here? In the current GICv2/3 scheme a
> pending_irq for each (SPI) interrupt is allocated up front. This
> approach here tries just to be a bit smarter for LPIs, since the
> expectation is that we by far don't need so many. In the worst case the
> memory allocation would converge to the upper limit (number of mapped
> LPIs), which would probably be used otherwise for the static allocation
> scheme. Which means we would never be worse.

It is getting worse in the current version because you have the list per 
vCPU. So you may get nLPIs * nvCPUs pending_irq structs allocated.

> What actually is missing is freeing the memory upon domain
> destruction, which can't be tested for Dom0.

As I already said multiple times, forgetting to add the domain 
destruction code is much worse because it is a recipe for missing 
something when we will need it. I prefer to see the code now so we can 
review it.

>
>> If we use dynamic allocation, we need a way to limit the number of LPIs
>> received by a guest to avoid memory exhaustion. The only idea I have is
>> an artificial limit, but I don't think it is good. Any other ideas?
>
> I think if we are concerned about this, we shouldn't allow *DomUs* to
> allocate too many LPIs, which are bounded by the DeviceID/EventID
> combinations. This will be under the full control of Dom0, which will
> communicate the number of MSIs (events) for each passed-through device,
> so we can easily impose a policy limit upon creation.

If DOM0 is able to control the number of DeviceID/EventID, then why do 
we need to allocate on the fly?

Also, I don't think the spec prevents populating the Device Table for a 
DeviceID that has no device associated. It could be used by a domain to 
inject fake LPIs. For instance this could replace polling mode for the 
command queue.

My main concern is that memory allocation can fail. Then what will you do? 
You will penalize a well-behaved domain just because someone else was nasty.

And you have no way to report to the guest: "I was not able to allocate 
memory, please try later" because this is an interrupt.

>>> +struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
>>> +                                   bool allocate)
>>> +{
>>> +    struct lpi_pending_irq *lpi_irq, *empty = NULL;
>>> +
>>> +    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
>>> +    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
>>> +    {
>>> +        if ( lpi_irq->pirq.irq == lpi )
>>> +        {
>>> +            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>>> +            return &lpi_irq->pirq;
>>> +        }
>>> +
>>> +        if ( lpi_irq->pirq.irq == 0 && !empty )
>>> +            empty = lpi_irq;
>>> +    }
>>> +
>>> +    if ( !allocate )
>>> +    {
>>> +        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    if ( !empty )
>>> +    {
>>> +        empty = xzalloc(struct lpi_pending_irq);
>>
>> xzalloc will return NULL if memory is exhausted. There is a general lack
>> of error checking within this series. Any missing error check could be a
>> potential target for a guest, leading to a security issue. Stefano and I
>> already spotted some; it does not mean we found all of them. So before
>> sending the next version, please go through the series and verify *every*
>> return.
>>
>> Also, I can't find the code which releases LPIs, either in this patch or
>> in this series. A general rule is to have allocation and free within
>> the same patch. It is much easier to spot a missing free.
>
> There is no such code, on purpose. We only grow the number, but never
> shrink it (to what?, where to stop?, what if we need more again?). As
> said above, in the worst case this ends up at what a static allocation
> would have started with from the beginning.
>
> And as mentioned, I only skipped the "free the whole list upon domain
> destruction" part.

A general rule is to explain in the commit message what is done. So you 
avoid arguments on the ML.

Cheers,
Stefano Stabellini March 24, 2017, 5:26 p.m. UTC | #7
On Fri, 24 Mar 2017, Andre Przywara wrote:
> >> +struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
> >> +                                   bool allocate)
> >> +{
> >> +    struct lpi_pending_irq *lpi_irq, *empty = NULL;
> >> +
> >> +    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
> >> +    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
> >> +    {
> >> +        if ( lpi_irq->pirq.irq == lpi )
> >> +        {
> >> +            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> >> +            return &lpi_irq->pirq;
> >> +        }
> >> +
> >> +        if ( lpi_irq->pirq.irq == 0 && !empty )
> >> +            empty = lpi_irq;
> >> +    }
> >> +
> >> +    if ( !allocate )
> >> +    {
> >> +        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
> >> +        return NULL;
> >> +    }
> >> +
> >> +    if ( !empty )
> >> +    {
> >> +        empty = xzalloc(struct lpi_pending_irq);
> > 
> > xzalloc will return NULL if memory is exhausted. There is a general lack
> > of error checking within this series. Any missing error check could be a
> > potential target for a guest, leading to a security issue. Stefano and I
> > already spotted some; it does not mean we found all of them. So before
> > sending the next version, please go through the series and verify *every*
> > return.
> > 
> > Also, I can't find the code which releases LPIs, either in this patch or
> > in this series. A general rule is to have allocation and free within
> > the same patch. It is much easier to spot a missing free.
> 
> There is no such code, on purpose. We only grow the number, but never
> shrink it (to what?, where to stop?, what if we need more again?). As
> said above, in the worst case this ends up at what a static allocation
> would have started with from the beginning.

Deallocate struct pending_irq when the domain is destroyed or when an LPI
is disabled?

> to what?, where to stop?

We don't stop, why stop? Let's go back to 0 if we can.

> what if we need more again?

We allocate them again?
Andre Przywara March 27, 2017, 9:02 a.m. UTC | #8
Hi,

On 24/03/17 17:26, Stefano Stabellini wrote:
> On Fri, 24 Mar 2017, Andre Przywara wrote:
>>>> +struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
>>>> +                                   bool allocate)
>>>> +{
>>>> +    struct lpi_pending_irq *lpi_irq, *empty = NULL;
>>>> +
>>>> +    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
>>>> +    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
>>>> +    {
>>>> +        if ( lpi_irq->pirq.irq == lpi )
>>>> +        {
>>>> +            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>>>> +            return &lpi_irq->pirq;
>>>> +        }
>>>> +
>>>> +        if ( lpi_irq->pirq.irq == 0 && !empty )
>>>> +            empty = lpi_irq;
>>>> +    }
>>>> +
>>>> +    if ( !allocate )
>>>> +    {
>>>> +        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
>>>> +        return NULL;
>>>> +    }
>>>> +
>>>> +    if ( !empty )
>>>> +    {
>>>> +        empty = xzalloc(struct lpi_pending_irq);
>>>
>>> xzalloc will return NULL if memory is exhausted. There is a general lack
>>> of error checking within this series. Any missing error check could be a
>>> potential target for a guest, leading to a security issue. Stefano and I
>>> already spotted some; it does not mean we found all of them. So before
>>> sending the next version, please go through the series and verify *every*
>>> return.
>>>
>>> Also, I can't find the code which releases LPIs, either in this patch or
>>> in this series. A general rule is to have allocation and free within
>>> the same patch. It is much easier to spot a missing free.
>>
>> There is no such code, on purpose. We only grow the number, but never
>> shrink it (to what?, where to stop?, what if we need more again?). As
>> said above, in the worst case this ends up at what a static allocation
>> would have started with from the beginning.
> 
> Deallocate struct pending_irq when the domain is destroyed

Sure, I think this is what is missing at the moment.

> or when an LPI is disabled?

A certain struct pending_irq is not assigned to a particular LPI; it's
more a member of a pool to allocate from when needed.
I consider them like shadow list registers, which can in addition be
used as spill-overs.
Very much like an LR the pending_irq is only assigned to a certain LPI
for the lifetime of a *pending* LPI on a particular VCPU. As mentioned
earlier, under normal conditions this is very short for always edge
triggered LPIs which don't have an active state.

>> to what?, where to stop?
> 
> We don't stop, why stop? Let's go back to 0 if we can.
> 
>> what if we need more again?
> 
> We allocate them again?

I am afraid that this would lead to situations where we needlessly
allocate and deallocate pending_irqs. Under normal load I'd expect to
have something like zero to three LPIs pending at any given point in
time (mostly zero, to be honest).
So this will lead to a situation where *every* LPI that becomes pending
triggers a memory allocation - in the hot path. That's why the pool
idea. So if we are going to shrink the pool, I'd stop at something like
five entries, to not penalize the common case.
Does that sound useful?
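
For the record, a sketch of such a shrink-with-a-floor, reusing the list
from the patch (the function itself and the constant are hypothetical):

    #define LPI_PENDING_POOL_MIN  5   /* free entries to keep around */

    static void vgic_trim_pending_lpi_pool(struct vcpu *v)
    {
        struct lpi_pending_irq *it, *tmp;
        unsigned int nr_free = 0;

        spin_lock(&v->arch.vgic.pending_lpi_list_lock);
        list_for_each_entry_safe(it, tmp, &v->arch.vgic.pending_lpi_list,
                                 entry)
        {
            if ( it->pirq.irq != 0 )    /* entry is in use, skip it */
                continue;

            if ( ++nr_free <= LPI_PENDING_POOL_MIN )
                continue;               /* keep the first few free entries */

            list_del(&it->entry);
            xfree(it);
        }
        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
    }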

Cheers,
Andre.
Julien Grall March 27, 2017, 2:01 p.m. UTC | #9
Hi,

On 27/03/17 10:02, Andre Przywara wrote:
> On 24/03/17 17:26, Stefano Stabellini wrote:
>> On Fri, 24 Mar 2017, Andre Przywara wrote:
> I am afraid that this would lead to situations where we needlessly
> allocate and deallocate pending_irqs. Under normal load I'd expect to
> have something like zero to three LPIs pending at any given point in
> time (mostly zero, to be honest).
> So this will lead to a situation where *every* LPI that becomes pending
> triggers a memory allocation - in the hot path. That's why the pool
> idea. So if we are going to shrink the pool, I'd stop at something like
> five entries, to not penalize the common case.
> Does that sound useful?

Not answering directly to the question here. I will summarize the face 
to face discussion I had with Andre this morning.

So allocating the pending_irq in the IRQ path is not a solution because 
memory allocation should not happen in IRQ context, see 
ASSERT(!in_irq()) in _xmalloc.

Regardless of the ASSERT, it will also increase the time to handle and 
forward an interrupt when there is no free pending_irq, because it is 
necessary to allocate a new one. Lastly, we have no way to tell the 
guest "Try again" if Xen is running out of memory.

The outcome of the discussion is to pre-allocate the pending_irq structs 
when a device is assigned to a domain. We know the maximum number of 
events supported by a device and that 1 event = 1 LPI.

This may allocate more memory (a pending_irq is 56 bytes), but at least 
we don't need allocation on the fly and can report errors.

One could argue that we could allocate on MAPTI to limit the allocation. 
However, as we are not able to rate-limit/defer the execution of the 
command queue so far, a guest could potentially flood with MAPTI and 
monopolize the pCPU for a long time.
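
A minimal sketch of that pre-allocation, assuming a hypothetical
its_device structure (only xzalloc_array() and vgic_init_pending_irq()
are taken from existing code):

    struct its_device {                  /* hypothetical, for illustration */
        uint32_t eventids;               /* number of events (= LPIs) */
        struct pending_irq *pend_irqs;   /* one entry per event */
    };

    static int its_assign_device(struct domain *d, struct its_device *dev)
    {
        unsigned int i;

        /* Allocation happens here, where failure can still be reported,
         * not in the interrupt path. */
        dev->pend_irqs = xzalloc_array(struct pending_irq, dev->eventids);
        if ( !dev->pend_irqs )
            return -ENOMEM;

        for ( i = 0; i < dev->eventids; i++ )
            vgic_init_pending_irq(&dev->pend_irqs[i], 0);

        return 0;
    }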

Cheers,
Stefano Stabellini March 27, 2017, 5:44 p.m. UTC | #10
On Mon, 27 Mar 2017, Julien Grall wrote:
> Hi,
> 
> On 27/03/17 10:02, Andre Przywara wrote:
> > On 24/03/17 17:26, Stefano Stabellini wrote:
> > > On Fri, 24 Mar 2017, Andre Przywara wrote:
> > I am afraid that this would lead to situations where we needlessly
> > allocate and deallocate pending_irqs. Under normal load I'd expect to
> > have something like zero to three LPIs pending at any given point in
> > time (mostly zero, to be honest).
> > So this will lead to a situation where *every* LPI that becomes pending
> > triggers a memory allocation - in the hot path. That's why the pool
> > idea. So if we are going to shrink the pool, I'd stop at something like
> > five entries, to not penalize the common case.
> > Does that sound useful?
> 
> Not answering directly to the question here. I will summarize the face to face
> discussion I had with Andre this morning.
> 
> So allocating the pending_irq in the IRQ path is not a solution because memory
> allocation should not happen in IRQ context, see ASSERT(!in_irq()) in
> _xmalloc.
> 
> Regardless of the ASSERT, it will also increase the time to handle and
> forward an interrupt when there is no free pending_irq, because it is
> necessary to allocate a new one. Lastly, we have no way to tell the guest
> "Try again" if Xen is running out of memory.
> 
> The outcome of the discussion is to pre-allocate the pending_irq structs
> when a device is assigned to a domain. We know the maximum number of events
> supported by a device and that 1 event = 1 LPI.
> 
> This may allocate more memory (a pending_irq is 56 bytes), but at least we
> don't need allocation on the fly and can report errors.
> 
> One could argue that we could allocate on MAPTI to limit the allocation.
> However, as we are not able to rate-limit/defer the execution of the command
> queue so far, a guest could potentially flood with MAPTI and monopolize the
> pCPU for a long time.

It makes a lot of sense to keep the allocation out of the irq path.
However, I am wondering if the allocation/deallocation of pending_irq
structs could be done at the point the vLPIs are enabled/disabled,
instead of device assignment time.

In any case, there should be code for the deallocation. If we keep the
allocation at device assignment time, there should be code for the
deallocation when a device is removed from the guest (even if we cannot
test that case well now).
Julien Grall March 27, 2017, 5:49 p.m. UTC | #11
Hi Stefano,

On 27/03/17 18:44, Stefano Stabellini wrote:
> On Mon, 27 Mar 2017, Julien Grall wrote:
>> Hi,
>>
>> On 27/03/17 10:02, Andre Przywara wrote:
>>> On 24/03/17 17:26, Stefano Stabellini wrote:
>>>> On Fri, 24 Mar 2017, Andre Przywara wrote:
>>> I am afraid that this would lead to situations where we needlessly
>>> allocate and deallocate pending_irqs. Under normal load I'd expect to
>>> have something like zero to three LPIs pending at any given point in
>>> time (mostly zero, to be honest).
>>> So this will lead to a situation where *every* LPI that becomes pending
>>> triggers a memory allocation - in the hot path. That's why the pool
>>> idea. So if we are going to shrink the pool, I'd stop at something like
>>> five entries, to not penalize the common case.
>>> Does that sound useful?
>>
>> Not answering directly to the question here. I will summarize the face to face
>> discussion I had with Andre this morning.
>>
>> So allocating the pending_irq in the IRQ path is not a solution because memory
>> allocation should not happen in IRQ context, see ASSERT(!in_irq()) in
>> _xmalloc.
>>
>> Regardless of the ASSERT, it will also increase the time to handle and
>> forward an interrupt when there is no free pending_irq, because it is
>> necessary to allocate a new one. Lastly, we have no way to tell the guest
>> "Try again" if Xen is running out of memory.
>>
>> The outcome of the discussion is to pre-allocate the pending_irq structs
>> when a device is assigned to a domain. We know the maximum number of events
>> supported by a device and that 1 event = 1 LPI.
>>
>> This may allocate more memory (a pending_irq is 56 bytes), but at least we
>> don't need allocation on the fly and can report errors.
>>
>> One could argue that we could allocate on MAPTI to limit the allocation.
>> However, as we are not able to rate-limit/defer the execution of the command
>> queue so far, a guest could potentially flood with MAPTI and monopolize the
>> pCPU for a long time.
>
> It makes a lot of sense to keep the allocation out of the irq path.
> However, I am wondering if the allocation/deallocation of pending_irq
> structs could be done at the point the vLPIs are enabled/disabled,
> instead of device assignment time.

I am not sure what you mean by enabling/disabling vLPIs. Do you mean 
when the guest is enabling/disabling them, or when the guest assigns a 
vLPI to a (deviceID, event) pair via MAPTI?

For both, a guest could potentially flood us. It would take us a lot of 
time to allocate/free memory for each vLPI modified. Hence why I 
didn't suggest it and said: "One could argue that we could allocate on 
MAPTI to limit the allocation...".

>
> In any case, there should be code for the deallocation. If we keep the
> allocation at device assignment time, there should be code for the
> deallocation when a device is removed from the guest (even if we cannot
> test that case well now).

It was implied in my answer :).

Cheers,
Stefano Stabellini March 27, 2017, 6:39 p.m. UTC | #12
CC'ing Andrew, Jan and George to get more feedback on the security
impact of this patch.

I'll make a quick summary for you: we need to allocate a 56-byte struct
(called pending_irq) for each potential interrupt injected to guests
(dom0 and domUs). With the new ARM interrupt controller there could be
thousands.

We could do the allocation upfront, which requires more memory, or we
could do the allocation dynamically at run time when each interrupt is
enabled, and deallocate the struct when an interrupt is disabled.

However, there is a concern that doing xmalloc/xfree in response to an
unprivileged DomU request could end up becoming a potential
denial-of-service vector. The guest could enable a thousand interrupts,
then disable a thousand interrupts and so on, monopolizing the usage of
one physical cpu. It only takes writing 1 bit in memory for a guest
to enable/disable an interrupt.
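
To illustrate how cheap that guest operation is, here is a minimal
sketch of toggling an LPI in the guest-memory LPI Configuration table
(one configuration byte per LPI, enable in bit 0, priority in bits 7:2).
The function name and standalone framing are illustrative assumptions,
not Xen or guest kernel code:

#include <stdint.h>

#define LPI_BASE_INTID   8192        /* LPI INTIDs start at 8192 */
#define LPI_PROP_ENABLE  (1u << 0)   /* bit 0 of each configuration byte */

/* prop_table points at the guest's LPI Configuration table. */
static void lpi_set_enabled(volatile uint8_t *prop_table, uint32_t intid,
                            int enable)
{
    uint32_t idx = intid - LPI_BASE_INTID;

    if ( enable )
        prop_table[idx] |= LPI_PROP_ENABLE;
    else
        prop_table[idx] &= ~LPI_PROP_ENABLE;

    /* The guest must then issue an ITS INV/INVALL command so that the
     * redistributor -- or Xen's vGIC emulation -- re-reads this byte. */
}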

See below.


On Mon, 27 Mar 2017, Julien Grall wrote:
> Hi Stefano,
> 
> On 27/03/17 18:44, Stefano Stabellini wrote:
> > On Mon, 27 Mar 2017, Julien Grall wrote:
> > > Hi,
> > > 
> > > On 27/03/17 10:02, Andre Przywara wrote:
> > > > On 24/03/17 17:26, Stefano Stabellini wrote:
> > > > > On Fri, 24 Mar 2017, Andre Przywara wrote:
> > > > I am afraid that this would lead to situations where we needlessly
> > > > allocate and deallocate pending_irqs. Under normal load I'd expect to
> > > > have something like zero to three LPIs pending at any given point in
> > > > time (mostly zero, to be honest).
> > > > So this will lead to a situation where *every* LPI that becomes pending
> > > > triggers a memory allocation - in the hot path. That's why the pool
> > > > idea. So if we are going to shrink the pool, I'd stop at something like
> > > > five entries, to not penalize the common case.
> > > > Does that sound useful?
> > > 
> > > Not answering directly to the question here. I will summarize the
> > > face-to-face discussion I had with Andre this morning.
> > > 
> > > So allocating the pending_irq in the IRQ path is not a solution because
> > > memory allocation should not happen in IRQ context, see ASSERT(!in_irq())
> > > in _xmalloc.
> > > 
> > > Regardless of the ASSERT, it will also increase the time to handle and
> > > forward an interrupt when there is no free pending_irq, because it is
> > > necessary to allocate a new one. Lastly, we have no way to tell the guest
> > > "Try again" if Xen is running out of memory.
> > > 
> > > The outcome of the discussion is to pre-allocate the pending_irq structs
> > > when a device is assigned to a domain. We know the maximum number of
> > > events supported by a device and that 1 event = 1 LPI.
> > > 
> > > This may allocate more memory (a pending_irq is 56 bytes), but at least
> > > we don't need allocation on the fly and can report errors.
> > > 
> > > One could argue that we could allocate on MAPTI to limit the allocation.
> > > However, as we are not able to rate-limit/defer the execution of the
> > > command queue so far, a guest could potentially flood us with MAPTI
> > > commands and monopolize the pCPU for a long time.
> > 
> > It makes a lot of sense to keep the allocation out of the irq path.
> > However, I am wondering if the allocation/deallocation of pending_irq
> > structs could be done at the point the vLPIs are enabled/disabled,
> > instead of device assignment time.
> 
> I am not sure what you mean by enabling/disabling vLPIs. Do you mean when the
> guest is enabling/disabling them, or when the guest assigns a vLPI to a
> (deviceID, event) pair via MAPTI?

I was thinking of enable/disable via the LPI Configuration table. However,
it would have the same issues you described for MAPTI.


> In both cases the guest could potentially flood us. It would take us a lot of
> time to allocate/free memory for each vLPI modified. Hence why I didn't
> suggest it and said: "One could argue that we could allocate on MAPTI to
> limit the allocation...".

Given that Xen wouldn't allocate the same pending_irq twice, at most Xen
would allocate the same amount of memory and the same number of
pending_irq that it would otherwise allocate if we did it at assignment
time, right?

Therefore, we are not concerned about memory utilization. We are
concerned about the CPU time to do the allocation itself, right?

However, the CPU time to run xmalloc/xfree should be small, and in any
case the scheduler always has the chance to deschedule the vCPU if it
wants to. This is no different from issuing any of the hypercalls that
require some work on the hypervisor side.

I don't think we have reasons to be concerned. Any opinions?
Julien Grall March 27, 2017, 9:24 p.m. UTC | #13
Hi Stefano,

On 27/03/2017 19:39, Stefano Stabellini wrote:
> On Mon, 27 Mar 2017, Julien Grall wrote:
>> In both cases the guest could potentially flood us. It would take us a lot
>> of time to allocate/free memory for each vLPI modified. Hence why I didn't
>> suggest it and said: "One could argue that we could allocate on MAPTI to
>> limit the allocation...".
>
> Given that Xen wouldn't allocate the same pending_irq twice, at most Xen
> would allocate the same amount of memory and the same number of
> pending_irq that it would otherwise allocate if we did it at assignment
> time, right?
>
> Therefore, we are not concerned about memory utilization. We are
> concerned about the CPU time to do the allocation itself, right?
>
> However, the CPU time to run xmalloc/xfree should be small, and in any
> case the scheduler always has the chance to deschedule the vCPU if it
> wants to. This is no different from issuing any of the hypercalls that
> require some work on the hypervisor side.
>
> I don't think we have reasons to be concerned. Any opinions?

Well, I have already said this on the IRQ version but forgot to mention 
it here: how do you report an error if xmalloc has failed? This could 
happen if memory resources have been exhausted, even for a well-behaved 
guest.

If you do the allocation on enable, which means on a memory trap, you 
would have to inject a data abort into the guest. If you do it on the 
MAPTI command, it would be an SError with an IMPLEMENTATION DEFINED 
error code. In either case the guest will likely not be able to 
interpret it and will crash. This could be an attack vector by 
exhausting the resource.

Furthermore, in the case of the enable/disable vLPI solution, you can 
disable an LPI that is still in flight, so you would not be able to free 
the struct at that point. Enable/disable can also happen often if you 
want to mask an interrupt temporarily (all MSIs are edge-triggered).

Lastly, in the case of MAPTI it is not possible to preempt the command 
queue, which can contain ~32K commands. So we may have to do 32K 
xmalloc/xfree calls. Even if one MAPTI is fast, 32K of them will likely 
be slow and monopolize a pCPU for longer than expected.

But can we focus on getting a sensible design which can be easily 
extended and has no flaws? This is already quite a difficult task with 
the ITS. Also asking for performance from the first draft makes it much 
harder.

If you recall, a lot of ARM subsystems (P2M, vGIC...) have been built in 
multiple stages, and each stage improved the performance. It sounds 
quite unfair to me to require the same level of performance as the 
current vGIC just because the code was merged later on.

This raises the bar for contributing to Xen on ARM very high and may 
discourage people from pushing things upstream.

To be clear, I do care about performance. But I think it has to come in 
a second step when it is too complicated to address in the first one. 
The first step is to be able to use Xen on the hardware without any 
flaws, with code that we can extend easily.

Cheers,
Jan Beulich March 28, 2017, 7:58 a.m. UTC | #14
>>> On 27.03.17 at 20:39, <sstabellini@kernel.org> wrote:
> CC'ing Andrew, Jan and George to get more feedback on the security
> impact of this patch.
> 
> I'll make a quick summary for you: we need to allocate a 56-byte struct
> (called pending_irq) for each potential interrupt injected into guests
> (dom0 and domUs). With the new ARM interrupt controller there could be
> thousands of them.
> 
> We could do the allocation upfront, which requires more memory, or we
> could do the allocation dynamically at run time when each interrupt is
> enabled, and deallocate the struct when an interrupt is disabled.
> 
> However, there is a concern that doing xmalloc/xfree in response to an
> unprivileged DomU request could end up becoming a potential
> denial-of-service vector. The guest could enable a thousand interrupts,
> then disable a thousand interrupts and so on, monopolizing the usage of
> one physical cpu. It only takes writing 1 bit in memory for a guest
> to enable/disable an interrupt.

Well, I think doing the allocations at device assign time would be
the least problematic approach: The tool stack could account for
the needed memory (ballooning Dom0 if necessary). (We still
have the open work item of introducing accounting of guest-
associated, but guest-inaccessible memory in the hypervisor.)
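
As a concrete illustration of that approach, here is a minimal
standalone sketch of pre-allocating one pending_irq per device event at
assignment time; the struct layouts, its_device and the function names
are simplified stand-ins, not Xen's real API:

#include <stdlib.h>
#include <errno.h>

struct pending_irq { unsigned int irq; /* ~56 bytes in Xen */ };

struct its_device {
    unsigned int nr_events;          /* 1 event == 1 LPI */
    struct pending_irq *pend_irqs;   /* one pre-allocated entry per event */
};

/* Allocate every pending_irq the device can ever need when it is
 * assigned: an allocation failure becomes a clean -ENOMEM the toolstack
 * can act upon, instead of an error deep in the interrupt path. */
static int its_assign_device(struct its_device *dev, unsigned int nr_events)
{
    dev->pend_irqs = calloc(nr_events, sizeof(*dev->pend_irqs));
    if ( !dev->pend_irqs )
        return -ENOMEM;

    dev->nr_events = nr_events;
    return 0;
}

/* The matching deallocation when the device is removed from the guest,
 * as requested earlier in the thread. */
static void its_remove_device(struct its_device *dev)
{
    free(dev->pend_irqs);
    dev->pend_irqs = NULL;
    dev->nr_events = 0;
}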

What I don't really understand the background of is the pCPU
monopolization concern. Is there anything here that's long-running
inside the hypervisor? Otherwise, the scheduler should be taking
care of everything.

Jan
Julien Grall March 28, 2017, 1:12 p.m. UTC | #15
Hi Jan,

On 28/03/17 08:58, Jan Beulich wrote:
>>>> On 27.03.17 at 20:39, <sstabellini@kernel.org> wrote:
>> CC'ing Andrew, Jan and George to get more feedback on the security
>> impact of this patch.
>>
>> I'll make a quick summary for you: we need to allocate a 56-byte struct
>> (called pending_irq) for each potential interrupt injected into guests
>> (dom0 and domUs). With the new ARM interrupt controller there could be
>> thousands of them.
>>
>> We could do the allocation upfront, which requires more memory, or we
>> could do the allocation dynamically at run time when each interrupt is
>> enabled, and deallocate the struct when an interrupt is disabled.
>>
>> However, there is a concern that doing xmalloc/xfree in response to an
>> unprivileged DomU request could end up becoming a potential
>> denial-of-service vector. The guest could enable a thousand interrupts,
>> then disable a thousand interrupts and so on, monopolizing the usage of
>> one physical cpu. It only takes writing 1 bit in memory for a guest
>> to enable/disable an interrupt.
>
> Well, I think doing the allocations at device assign time would be
> the least problematic approach: The tool stack could account for
> the needed memory (ballooning Dom0 if necessary). (We still
> have the open work item of introducing accounting of guest-
> associated, but guest-inaccessible memory in the hypervisor.)
>
> What I don't really understand the background of is the pCPU
> monopolization concern. Is there anything here that's long-running
> inside the hypervisor? Otherwise, the scheduler should be taking
> care of everything.

Let me give you some background before answering the question. The 
ITS is an interrupt controller widget which provides a sophisticated way
of dealing with MSIs in a scalable manner. A command queue is used to 
manage the ITS. It is a memory region provided by the guest that can be 
up to 1MB. Each command is composed of 4 double-words (i.e. 32 bytes), 
which makes it possible for a guest to queue up to 32768 commands.

In order to control the command queue there are 2 registers:
	- GITS_CWRITER, which points to the end of the queue and is updated by 
the software when a new command is written
	- GITS_CREADR, which points to the beginning of the queue and is 
updated by the ITS when a command has been executed.

The ITS will process commands until the queue is empty (i.e. GITS_CWRITER 
== GITS_CREADR). Software has 2 ways to wait for the completion of a 
command:
	- Polling on GITS_CREADR
	- Adding an INT command, which will send an interrupt when executed

Usually the polling mode also includes a timeout in the code.
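
As an aside, here is a minimal sketch of that polling-with-timeout
pattern, with hypothetical accessor names standing in for the real MMIO
helpers:

#include <stdint.h>

uint64_t read_gits_creadr(void);     /* assumed MMIO read accessor */
void cpu_relax_short_delay(void);    /* assumed back-off helper */

/* Spin until the ITS has consumed all commands up to `cwriter`, with a
 * bounded iteration count acting as the timeout. */
static int wait_for_its(uint64_t cwriter)
{
    unsigned int timeout = 1000000;

    while ( read_gits_creadr() != cwriter )
    {
        if ( !timeout-- )
            return -1;               /* ITS appears stuck */
        cpu_relax_short_delay();
    }

    return 0;
}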

A guest could potentially queue up to 32768 commands before updating 
GITS_CWRITER. In the current approach, the command queue is handled 
synchronously when the software writes to GITS_CWRITER and the access is 
trapped by the hypervisor. This means that we will not return from the 
hypervisor until the ITS has executed the commands between GITS_CREADR 
and GITS_CWRITER.

All the commands have to be executed quickly to avoid the guest 
monopolizing the pCPU by flooding the command queue.
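
For clarity, here is a minimal sketch of that synchronous handling; the
struct virt_its model, its_handle_one_command and the field names are
simplified illustrations, not Xen's actual code:

#include <stdint.h>

#define ITS_CMD_SIZE 32              /* 4 double-words per ITS command */

/* Hypothetical, simplified model of the emulated ITS state. */
struct virt_its {
    uint8_t *cmdbuf;                 /* guest-provided queue (up to 1MB) */
    uint64_t cmdbuf_size;
    uint64_t creadr;                 /* emulated GITS_CREADR */
};

int its_handle_one_command(struct virt_its *vits, void *cmd); /* assumed */

/* Called on a trapped write to GITS_CWRITER: we do not return to the
 * guest until every command up to the new CWRITER has been executed,
 * which is exactly the monopolization concern described above. */
static int vits_handle_cwriter_write(struct virt_its *vits, uint64_t cwriter)
{
    while ( vits->creadr != cwriter )
    {
        if ( its_handle_one_command(vits, vits->cmdbuf + vits->creadr) )
            return -1;               /* malformed command */

        /* The command queue is a ring: wrap at the end of the buffer. */
        vits->creadr = (vits->creadr + ITS_CMD_SIZE) % vits->cmdbuf_size;
    }

    return 0;
}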

We thought about using different alternatives, such as tasklets or 
breaking the command queue down into batches, but none of them fit well.

If you have a better idea, I am happy to consider it.

Cheers,
Jan Beulich March 28, 2017, 1:34 p.m. UTC | #16
>>> On 28.03.17 at 15:12, <julien.grall@arm.com> wrote:
> Hi Jan,
> 
> On 28/03/17 08:58, Jan Beulich wrote:
>>>>> On 27.03.17 at 20:39, <sstabellini@kernel.org> wrote:
>>> CC'ing Andrew, Jan and George to get more feedback on the security
>>> impact of this patch.
>>>
>>> I'll make a quick summary for you: we need to allocate a 56-byte struct
>>> (called pending_irq) for each potential interrupt injected into guests
>>> (dom0 and domUs). With the new ARM interrupt controller there could be
>>> thousands of them.
>>>
>>> We could do the allocation upfront, which requires more memory, or we
>>> could do the allocation dynamically at run time when each interrupt is
>>> enabled, and deallocate the struct when an interrupt is disabled.
>>>
>>> However, there is a concern that doing xmalloc/xfree in response to an
>>> unprivileged DomU request could end up becoming a potential
>>> denial-of-service vector. The guest could enable a thousand interrupts,
>>> then disable a thousand interrupts and so on, monopolizing the usage of
>>> one physical cpu. It only takes writing 1 bit in memory for a guest
>>> to enable/disable an interrupt.
>>
>> Well, I think doing the allocations at device assign time would be
>> the least problematic approach: The tool stack could account for
>> the needed memory (ballooning Dom0 if necessary). (We still
>> have the open work item of introducing accounting of guest-
>> associated, but guest-inaccessible memory in the hypervisor.)
>>
>> What I don't really understand the background of is the pCPU
>> monopolization concern. Is there anything here that's long-running
>> inside the hypervisor? Otherwise, the scheduler should be taking
>> care of everything.
> 
> Let me give you some background before answering the question. The 
> ITS is an interrupt controller widget which provides a sophisticated way
> of dealing with MSIs in a scalable manner. A command queue is used to 
> manage the ITS. It is a memory region provided by the guest that can be 
> up to 1MB. Each command is composed of 4 double-words (i.e. 32 bytes), 
> which makes it possible for a guest to queue up to 32768 commands.
> 
> In order to control the command queue there are 2 registers:
> 	- GITS_CWRITER, which points to the end of the queue and is updated by 
> the software when a new command is written
> 	- GITS_CREADR, which points to the beginning of the queue and is 
> updated by the ITS when a command has been executed.
> 
> The ITS will process commands until the queue is empty (i.e. GITS_CWRITER 
> == GITS_CREADR). Software has 2 ways to wait for the completion of a 
> command:
> 	- Polling on GITS_CREADR
> 	- Adding an INT command, which will send an interrupt when executed
> 
> Usually the polling mode also includes a timeout in the code.
> 
> A guest could potentially queue up to 32768 commands before updating 
> GITS_CWRITER. In the current approach, the command queue is handled 
> synchronously when the software writes to GITS_CWRITER and the access is 
> trapped by the hypervisor. This means that we will not return from the 
> hypervisor until the ITS has executed the commands between GITS_CREADR 
> and GITS_CWRITER.
> 
> All the commands have to be executed quickly to avoid the guest 
> monopolizing the pCPU by flooding the command queue.
> 
> We thought about using different alternatives, such as tasklets or 
> breaking the command queue down into batches, but none of them fit well.

I guess you want something continuation-like then? Does the
instruction doing the GITS_CWRITER write have any other side
effects? If not, wouldn't the approach used on x86 for forwarding
requests to qemu work here too? I.e. retry the instruction as long
as you're not ready to actually complete it, with a continuation
check put in the handling code you describe every so many
iterations. Of course there are dependencies here on the
cross-CPU visibility of the register - if all guest vCPU-s can access
it, things would require more care (as intermediate reads as well
as successive writes from multiple parties would need taking into
consideration).
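
To make that suggestion concrete, here is a hedged sketch of a
preemptible variant, reusing the hypothetical struct virt_its /
ITS_CMD_SIZE model from the sketch earlier in the thread; the batch size
and the return convention are purely illustrative assumptions:

#define CMDS_PER_BATCH 64            /* arbitrary illustrative batch size */

/* Returns 1 once the queue has been drained, 0 when the guest's write
 * should simply trap again later: the emulated GITS_CREADR records the
 * progress, so retrying the GITS_CWRITER write resumes where we left
 * off, and the scheduler may preempt the vCPU in between. */
static int vits_handle_cwriter_preemptible(struct virt_its *vits,
                                           uint64_t cwriter)
{
    unsigned int done = 0;

    while ( vits->creadr != cwriter )
    {
        if ( done++ == CMDS_PER_BATCH )
            return 0;                /* not finished yet: retry later */

        its_handle_one_command(vits, vits->cmdbuf + vits->creadr);
        vits->creadr = (vits->creadr + ITS_CMD_SIZE) % vits->cmdbuf_size;
    }

    return 1;                        /* all queued commands executed */
}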

Jan
diff mbox

Patch

diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
index a5348f2..bd3c032 100644
--- a/xen/arch/arm/gic.c
+++ b/xen/arch/arm/gic.c
@@ -509,6 +509,9 @@  static void gic_update_one_lr(struct vcpu *v, int i)
                 struct vcpu *v_target = vgic_get_target_vcpu(v, irq);
                 irq_set_affinity(p->desc, cpumask_of(v_target->processor));
             }
+            /* If this was an LPI, mark this struct as available again. */
+            if ( is_lpi(p->irq) )
+                p->irq = 0;
         }
     }
 }
diff --git a/xen/arch/arm/vgic-v3.c b/xen/arch/arm/vgic-v3.c
index 1fadb00..b0653c2 100644
--- a/xen/arch/arm/vgic-v3.c
+++ b/xen/arch/arm/vgic-v3.c
@@ -1426,6 +1426,9 @@  static int vgic_v3_vcpu_init(struct vcpu *v)
     if ( v->vcpu_id == last_cpu || (v->vcpu_id == (d->max_vcpus - 1)) )
         v->arch.vgic.flags |= VGIC_V3_RDIST_LAST;
 
+    spin_lock_init(&v->arch.vgic.pending_lpi_list_lock);
+    INIT_LIST_HEAD(&v->arch.vgic.pending_lpi_list);
+
     return 0;
 }
 
diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
index 364d5f0..e5cfa54 100644
--- a/xen/arch/arm/vgic.c
+++ b/xen/arch/arm/vgic.c
@@ -30,6 +30,8 @@ 
 
 #include <asm/mmio.h>
 #include <asm/gic.h>
+#include <asm/gic_v3_defs.h>
+#include <asm/gic_v3_its.h>
 #include <asm/vgic.h>
 
 static inline struct vgic_irq_rank *vgic_get_rank(struct vcpu *v, int rank)
@@ -61,7 +63,7 @@  struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq)
     return vgic_get_rank(v, rank);
 }
 
-static void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
+void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
 {
     INIT_LIST_HEAD(&p->inflight);
     INIT_LIST_HEAD(&p->lr_queue);
@@ -244,10 +246,14 @@  struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int virq)
 
 static int vgic_get_virq_priority(struct vcpu *v, unsigned int virq)
 {
-    struct vgic_irq_rank *rank = vgic_rank_irq(v, virq);
+    struct vgic_irq_rank *rank;
     unsigned long flags;
     int priority;
 
+    if ( is_lpi(virq) )
+        return vgic_lpi_get_priority(v->domain, virq);
+
+    rank = vgic_rank_irq(v, virq);
     vgic_lock_rank(v, rank, flags);
     priority = rank->priority[virq & INTERRUPT_RANK_MASK];
     vgic_unlock_rank(v, rank, flags);
@@ -446,13 +452,63 @@  bool vgic_to_sgi(struct vcpu *v, register_t sgir, enum gic_sgi_mode irqmode,
     return true;
 }
 
+/*
+ * Holding struct pending_irq's for each possible virtual LPI in each domain
+ * requires too much Xen memory, also a malicious guest could potentially
+ * spam Xen with LPI map requests. We cannot cover those with (guest allocated)
+ * ITS memory, so we use a dynamic scheme of allocating struct pending_irq's
+ * on demand.
+ */
+struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int lpi,
+                                   bool allocate)
+{
+    struct lpi_pending_irq *lpi_irq, *empty = NULL;
+
+    spin_lock(&v->arch.vgic.pending_lpi_list_lock);
+    list_for_each_entry(lpi_irq, &v->arch.vgic.pending_lpi_list, entry)
+    {
+        if ( lpi_irq->pirq.irq == lpi )
+        {
+            spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
+            return &lpi_irq->pirq;
+        }
+
+        if ( lpi_irq->pirq.irq == 0 && !empty )
+            empty = lpi_irq;
+    }
+
+    if ( !allocate )
+    {
+        spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
+        return NULL;
+    }
+
+    if ( !empty )
+    {
+        empty = xzalloc(struct lpi_pending_irq);
+        vgic_init_pending_irq(&empty->pirq, lpi);
+        list_add_tail(&empty->entry, &v->arch.vgic.pending_lpi_list);
+    } else
+    {
+        empty->pirq.status = 0;
+        empty->pirq.irq = lpi;
+    }
+
+    spin_unlock(&v->arch.vgic.pending_lpi_list_lock);
+
+    return &empty->pirq;
+}
+
 struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
 {
     struct pending_irq *n;
+
     /* Pending irqs allocation strategy: the first vgic.nr_spis irqs
      * are used for SPIs; the rests are used for per cpu irqs */
     if ( irq < 32 )
         n = &v->arch.vgic.pending_irqs[irq];
+    else if ( is_lpi(irq) )
+        n = lpi_to_pending(v, irq, true);
     else
         n = &v->domain->arch.vgic.pending_irqs[irq - 32];
     return n;
@@ -480,7 +536,7 @@  void vgic_clear_pending_irqs(struct vcpu *v)
 void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
 {
     uint8_t priority;
-    struct pending_irq *iter, *n = irq_to_pending(v, virq);
+    struct pending_irq *iter, *n;
     unsigned long flags;
     bool running;
 
@@ -488,6 +544,8 @@  void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
 
     spin_lock_irqsave(&v->arch.vgic.lock, flags);
 
+    n = irq_to_pending(v, virq);
+
     /* vcpu offline */
     if ( test_bit(_VPF_down, &v->pause_flags) )
     {
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 00b9c1a..f44a84b 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -257,6 +257,8 @@  struct arch_vcpu
         paddr_t rdist_base;
 #define VGIC_V3_RDIST_LAST  (1 << 0)        /* last vCPU of the rdist */
         uint8_t flags;
+        struct list_head pending_lpi_list;
+        spinlock_t pending_lpi_list_lock;   /* protects the pending_lpi_list */
     } vgic;
 
     /* Timer registers  */
diff --git a/xen/include/asm-arm/vgic.h b/xen/include/asm-arm/vgic.h
index 467333c..8f1099c 100644
--- a/xen/include/asm-arm/vgic.h
+++ b/xen/include/asm-arm/vgic.h
@@ -83,6 +83,12 @@  struct pending_irq
     struct list_head lr_queue;
 };
 
+struct lpi_pending_irq
+{
+    struct list_head entry;
+    struct pending_irq pirq;
+};
+
 #define NR_INTERRUPT_PER_RANK   32
 #define INTERRUPT_RANK_MASK (NR_INTERRUPT_PER_RANK - 1)
 
@@ -296,13 +302,21 @@  extern struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int virq);
 extern void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq);
 extern void vgic_vcpu_inject_spi(struct domain *d, unsigned int virq);
 extern void vgic_clear_pending_irqs(struct vcpu *v);
+extern void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq);
 extern struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq);
 extern struct pending_irq *spi_to_pending(struct domain *d, unsigned int irq);
+extern struct pending_irq *lpi_to_pending(struct vcpu *v, unsigned int irq,
+                                          bool allocate);
 extern struct vgic_irq_rank *vgic_rank_offset(struct vcpu *v, int b, int n, int s);
 extern struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq);
 extern bool vgic_emulate(struct cpu_user_regs *regs, union hsr hsr);
 extern void vgic_disable_irqs(struct vcpu *v, uint32_t r, int n);
 extern void vgic_enable_irqs(struct vcpu *v, uint32_t r, int n);
+/* placeholder function until the property table gets introduced */
+static inline int vgic_lpi_get_priority(struct domain *d, uint32_t vlpi)
+{
+    return 0xa;
+}
 extern void register_vgic_ops(struct domain *d, const struct vgic_ops *ops);
 int vgic_v2_init(struct domain *d, int *mmio_count);
 int vgic_v3_init(struct domain *d, int *mmio_count);