[v2,3/5] VT-d PI: restrict the vcpu number on a given pcpu

Message ID 1494482652-42356-4-git-send-email-chao.gao@intel.com (mailing list archive)
State New, archived

Commit Message

Chao Gao May 11, 2017, 6:04 a.m. UTC
Currently, a blocked vCPU is put on its pCPU's PI blocking list. If
too many vCPUs are blocked on a given pCPU, the list can grow very
long. A simple worst-case analysis: with 32k domains and 128 vCPUs
per domain, about 4M (32768 * 128) vCPUs may end up on one pCPU's PI
blocking list. When a wakeup interrupt arrives, the list is traversed
to find the specific vCPUs to wake up, and in that case the traversal
would consume a lot of time.

To mitigate this issue, this patch limits the number of vCPUs on a
given pCPU, taking into account factors such as common-case
performance, the current HVM vCPU count and the current pCPU count.
With this method, the common case stays fast and, for the extreme
cases, the list length is kept under control.
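
A minimal standalone sketch of the policy just described (illustrative
names only, not the patch itself; the real code below works on
per_cpu(vmx_pi_blocking), num_hvm_vcpus and cpu_online_map):

/* Fixed headroom K on top of the per-pCPU average. */
#define K 128

/* blocked[i] is the current PI blocking list length of pCPU i. */
static unsigned int choose_dest(unsigned int local_cpu,
                                const unsigned int *blocked,
                                unsigned int nr_pcpus,
                                unsigned int nr_hvm_vcpus)
{
    unsigned int limit = nr_hvm_vcpus / nr_pcpus + K;
    unsigned int cpu = local_cpu;

    /* Prefer the local pCPU; move on only when its list is full. */
    while ( blocked[cpu] >= limit )
        cpu = (cpu + 1) % nr_pcpus;    /* stands in for cpumask_cycle() */

    return cpu;
}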

The change in vmx_pi_unblock_vcpu() is for the following case:
the vcpu is running -> tries to block (this patch may change NDST to
another pCPU) but the notification comes in time, so the vcpu goes
back to the running state -> VM-entry (we should set NDST again,
reverting the change we made to NDST in vmx_vcpu_block())

Signed-off-by: Chao Gao <chao.gao@intel.com>
---
 xen/arch/x86/hvm/vmx/vmx.c | 78 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 71 insertions(+), 7 deletions(-)

Comments

Tian, Kevin May 15, 2017, 5:24 a.m. UTC | #1
> From: Gao, Chao
> Sent: Thursday, May 11, 2017 2:04 PM
> 
> Currently, a blocked vCPU is put in its pCPU's pi blocking list. If
> too many vCPUs are blocked on a given pCPU, it will incur that the list
> grows too long. After a simple analysis, there are 32k domains and
> 128 vcpu per domain, thus about 4M vCPUs may be blocked in one pCPU's
> PI blocking list. When a wakeup interrupt arrives, the list is
> traversed to find some specific vCPUs to wake them up. This traversal in
> that case would consume much time.
> 
> To mitigate this issue, this patch limits the vcpu number on a given
> pCPU, taking factors such as perfomance of common case, current hvm vcpu
> count and current pcpu count into consideration. With this method, for
> the common case, it works fast and for some extreme cases, the list
> length is under control.
> 
> The change in vmx_pi_unblock_vcpu() is for the following case:
> vcpu is running -> try to block (this patch may change NSDT to
> another pCPU) but notification comes in time, thus the vcpu
> goes back to running station -> VM-entry (we should set NSDT again,
> reverting the change we make to NSDT in vmx_vcpu_block())
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> ---
>  xen/arch/x86/hvm/vmx/vmx.c | 78
> +++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 71 insertions(+), 7 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index efff6cd..c0d0b58 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -100,16 +100,70 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>      spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
>  }
> 
> +/*
> + * Choose an appropriate pcpu to receive wakeup interrupt.
> + * By default, the local pcpu is chosen as the destination. But if the
> + * vcpu number of the local pcpu exceeds a limit, another pcpu is chosen.
> + *
> + * Currently, choose (v_tot/p_tot) + K as the limit of vcpu, where
> + * v_tot is the total number of vcpus on the system, p_tot is the total
> + * number of pcpus in the system, and K is a fixed number. Experments
> shows
> + * the maximal time to wakeup a vcpu from a 128-entry blocking list is
> + * considered acceptable. So choose 128 as the fixed number K.

it would be better to provide your experimental data here so others
have a gut feeling for why it's acceptable...

> + *
> + * This policy makes sure:
> + * 1) for common cases, the limit won't be reached and the local pcpu is
> used
> + * which is beneficial to performance (at least, avoid an IPI when unblocking
> + * vcpu).
> + * 2) for the worst case, the blocking list length scales with the vcpu count
> + * divided by the pcpu count.
> + */
> +#define PI_LIST_FIXED_NUM 128
> +#define PI_LIST_LIMIT     (atomic_read(&num_hvm_vcpus) /
> num_online_cpus() + \
> +                           PI_LIST_FIXED_NUM)
> +
> +static unsigned int vmx_pi_choose_dest_cpu(struct vcpu *v)
> +{
> +    int count, limit = PI_LIST_LIMIT;
> +    unsigned int dest = v->processor;
> +
> +    count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
> +    while ( unlikely(count >= limit) )
> +    {
> +        dest = cpumask_cycle(dest, &cpu_online_map);
> +        count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
> +    }

is it possible to hit infinite loop here?

> +    return dest;
> +}
> +
>  static void vmx_vcpu_block(struct vcpu *v)
>  {
>      unsigned long flags;
> -    unsigned int dest;
> +    unsigned int dest, dest_cpu;
>      spinlock_t *old_lock;
> -    spinlock_t *pi_blocking_list_lock =
> -		&per_cpu(vmx_pi_blocking, v->processor).lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    spinlock_t *pi_blocking_list_lock;
> +
> +    /*
> +     * After pCPU goes down, the per-cpu PI blocking list is cleared.
> +     * To make sure the parameter vCPU is added to the chosen pCPU's
> +     * PI blocking list before the list is cleared, just retry when
> +     * finding the pCPU has gone down. Also retry to choose another
> +     * pCPU when finding the list length reachs the limit.
> +     */
> + retry:
> +    dest_cpu = vmx_pi_choose_dest_cpu(v);
> +    pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, dest_cpu).lock;
> 
>      spin_lock_irqsave(pi_blocking_list_lock, flags);
> +    if ( unlikely((!cpu_online(dest_cpu)) ||
> +                  (atomic_read(&per_cpu(vmx_pi_blocking, dest_cpu).counter) >=
> +                   PI_LIST_LIMIT)) )
> +    {
> +        spin_unlock_irqrestore(pi_blocking_list_lock, flags);
> +        goto retry;
> +    }
> +
>      old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
>                         pi_blocking_list_lock);
> 
> @@ -120,11 +174,11 @@ static void vmx_vcpu_block(struct vcpu *v)
>       */
>      ASSERT(old_lock == NULL);
> 
> -    atomic_inc(&per_cpu(vmx_pi_blocking, v->processor).counter);
> -    HVMTRACE_4D(PI_LIST_ADD, v->domain->domain_id, v->vcpu_id, v-
> >processor,
> -                atomic_read(&per_cpu(vmx_pi_blocking, v->processor).counter));
> +    atomic_inc(&per_cpu(vmx_pi_blocking, dest_cpu).counter);
> +    HVMTRACE_4D(PI_LIST_ADD, v->domain->domain_id, v->vcpu_id,
> dest_cpu,
> +                atomic_read(&per_cpu(vmx_pi_blocking, dest_cpu).counter));
>      list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
> -                  &per_cpu(vmx_pi_blocking, v->processor).list);
> +                  &per_cpu(vmx_pi_blocking, dest_cpu).list);
>      spin_unlock_irqrestore(pi_blocking_list_lock, flags);
> 
>      ASSERT(!pi_test_sn(pi_desc));
> @@ -134,6 +188,13 @@ static void vmx_vcpu_block(struct vcpu *v)
>      ASSERT(pi_desc->ndst ==
>             (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
> 
> +    if ( unlikely(dest_cpu != v->processor) )
> +    {
> +        dest = cpu_physical_id(dest_cpu);
> +        write_atomic(&pi_desc->ndst,
> +                 (x2apic_enabled ? dest : MASK_INSR(dest,
> PI_xAPIC_NDST_MASK)));
> +    }
> +
>      write_atomic(&pi_desc->nv, pi_wakeup_vector);
>  }
> 
> @@ -163,6 +224,7 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
>      unsigned long flags;
>      spinlock_t *pi_blocking_list_lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    unsigned int dest = cpu_physical_id(v->processor);
> 
>      /*
>       * Set 'NV' field back to posted_intr_vector, so the
> @@ -170,6 +232,8 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
>       * it is running in non-root mode.
>       */
>      write_atomic(&pi_desc->nv, posted_intr_vector);
> +    write_atomic(&pi_desc->ndst,
> +                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
> 
>      pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
> 
> --
> 1.8.3.1
Chao Gao May 15, 2017, 9:27 a.m. UTC | #2
On Mon, May 15, 2017 at 01:24:45PM +0800, Tian, Kevin wrote:
>> From: Gao, Chao
>> Sent: Thursday, May 11, 2017 2:04 PM
>> 
>> Currently, a blocked vCPU is put in its pCPU's pi blocking list. If
>> too many vCPUs are blocked on a given pCPU, it will incur that the list
>> grows too long. After a simple analysis, there are 32k domains and
>> 128 vcpu per domain, thus about 4M vCPUs may be blocked in one pCPU's
>> PI blocking list. When a wakeup interrupt arrives, the list is
>> traversed to find some specific vCPUs to wake them up. This traversal in
>> that case would consume much time.
>> 
>> To mitigate this issue, this patch limits the vcpu number on a given
>> pCPU, taking factors such as perfomance of common case, current hvm vcpu
>> count and current pcpu count into consideration. With this method, for
>> the common case, it works fast and for some extreme cases, the list
>> length is under control.
>> 
>> The change in vmx_pi_unblock_vcpu() is for the following case:
>> vcpu is running -> try to block (this patch may change NSDT to
>> another pCPU) but notification comes in time, thus the vcpu
>> goes back to running station -> VM-entry (we should set NSDT again,
>> reverting the change we make to NSDT in vmx_vcpu_block())
>> 
>> Signed-off-by: Chao Gao <chao.gao@intel.com>
>> ---
>>  xen/arch/x86/hvm/vmx/vmx.c | 78
>> +++++++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 71 insertions(+), 7 deletions(-)
>> 
>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>> index efff6cd..c0d0b58 100644
>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -100,16 +100,70 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>>      spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
>>  }
>> 
>> +/*
>> + * Choose an appropriate pcpu to receive wakeup interrupt.
>> + * By default, the local pcpu is chosen as the destination. But if the
>> + * vcpu number of the local pcpu exceeds a limit, another pcpu is chosen.
>> + *
>> + * Currently, choose (v_tot/p_tot) + K as the limit of vcpu, where
>> + * v_tot is the total number of vcpus on the system, p_tot is the total
>> + * number of pcpus in the system, and K is a fixed number. Experments
>> shows
>> + * the maximal time to wakeup a vcpu from a 128-entry blocking list is
>> + * considered acceptable. So choose 128 as the fixed number K.
>
>better you can provide your experimental data here so others have
>a gut-feeling why it's acceptable...

Will add this.

>
>> + *
>> + * This policy makes sure:
>> + * 1) for common cases, the limit won't be reached and the local pcpu is
>> used
>> + * which is beneficial to performance (at least, avoid an IPI when unblocking
>> + * vcpu).
>> + * 2) for the worst case, the blocking list length scales with the vcpu count
>> + * divided by the pcpu count.
>> + */
>> +#define PI_LIST_FIXED_NUM 128
>> +#define PI_LIST_LIMIT     (atomic_read(&num_hvm_vcpus) /
>> num_online_cpus() + \
>> +                           PI_LIST_FIXED_NUM)
>> +
>> +static unsigned int vmx_pi_choose_dest_cpu(struct vcpu *v)
>> +{
>> +    int count, limit = PI_LIST_LIMIT;
>> +    unsigned int dest = v->processor;
>> +
>> +    count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
>> +    while ( unlikely(count >= limit) )
>> +    {
>> +        dest = cpumask_cycle(dest, &cpu_online_map);
>> +        count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
>> +    }
>
>is it possible to hit infinite loop here?
>

Theoretically it will not, because cpumask_cycle() iterates through
all online pCPUs and it's impossible for all online pCPUs to have
reached the upper bound at the same time.
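
As a sanity check, here is a toy model of that argument (not Xen code;
names are made up): the per-pCPU counters sum to at most the number of
blocked vCPUs, so with limit = total/nr_pcpus + 128 they cannot all be
at or above the limit at once, and the scan always finds a pCPU.

#include <assert.h>
#include <stddef.h>

#define K 128                          /* PI_LIST_FIXED_NUM in the patch */

/*
 * len[i] is pCPU i's blocking list length; the caller guarantees the
 * lengths sum to at most 'total' (each blocked vCPU is on exactly one
 * list).  Returns a pCPU whose list is below the limit.
 */
static size_t pick_pcpu(const unsigned int *len, size_t nr_pcpus,
                        unsigned int total)
{
    unsigned int limit = total / nr_pcpus + K;
    unsigned int sum = 0;

    for ( size_t i = 0; i < nr_pcpus; i++ )
    {
        if ( len[i] < limit )
            return i;
        sum += len[i];
    }

    /*
     * Unreachable: getting here means sum >= nr_pcpus * limit, and
     * nr_pcpus * limit > total since K > 0, contradicting sum <= total.
     */
    assert(0);
    return 0;
}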

Thanks
Chao
George Dunlap May 15, 2017, 3:48 p.m. UTC | #3
On Thu, May 11, 2017 at 7:04 AM, Chao Gao <chao.gao@intel.com> wrote:
> Currently, a blocked vCPU is put in its pCPU's pi blocking list. If
> too many vCPUs are blocked on a given pCPU, it will incur that the list
> grows too long. After a simple analysis, there are 32k domains and
> 128 vcpu per domain, thus about 4M vCPUs may be blocked in one pCPU's
> PI blocking list. When a wakeup interrupt arrives, the list is
> traversed to find some specific vCPUs to wake them up. This traversal in
> that case would consume much time.
>
> To mitigate this issue, this patch limits the vcpu number on a given
> pCPU, taking factors such as perfomance of common case, current hvm vcpu
> count and current pcpu count into consideration. With this method, for
> the common case, it works fast and for some extreme cases, the list
> length is under control.
>
> The change in vmx_pi_unblock_vcpu() is for the following case:
> vcpu is running -> try to block (this patch may change NSDT to
> another pCPU) but notification comes in time, thus the vcpu
> goes back to running station -> VM-entry (we should set NSDT again,
> reverting the change we make to NSDT in vmx_vcpu_block())
>
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> ---
>  xen/arch/x86/hvm/vmx/vmx.c | 78 +++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 71 insertions(+), 7 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index efff6cd..c0d0b58 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -100,16 +100,70 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>      spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
>  }
>
> +/*
> + * Choose an appropriate pcpu to receive wakeup interrupt.
> + * By default, the local pcpu is chosen as the destination. But if the
> + * vcpu number of the local pcpu exceeds a limit, another pcpu is chosen.
> + *
> + * Currently, choose (v_tot/p_tot) + K as the limit of vcpu, where
> + * v_tot is the total number of vcpus on the system, p_tot is the total
> + * number of pcpus in the system, and K is a fixed number. Experments shows
> + * the maximal time to wakeup a vcpu from a 128-entry blocking list is
> + * considered acceptable. So choose 128 as the fixed number K.
> + *
> + * This policy makes sure:
> + * 1) for common cases, the limit won't be reached and the local pcpu is used
> + * which is beneficial to performance (at least, avoid an IPI when unblocking
> + * vcpu).
> + * 2) for the worst case, the blocking list length scales with the vcpu count
> + * divided by the pcpu count.
> + */
> +#define PI_LIST_FIXED_NUM 128
> +#define PI_LIST_LIMIT     (atomic_read(&num_hvm_vcpus) / num_online_cpus() + \
> +                           PI_LIST_FIXED_NUM)
> +
> +static unsigned int vmx_pi_choose_dest_cpu(struct vcpu *v)
> +{
> +    int count, limit = PI_LIST_LIMIT;
> +    unsigned int dest = v->processor;
> +
> +    count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
> +    while ( unlikely(count >= limit) )
> +    {
> +        dest = cpumask_cycle(dest, &cpu_online_map);
> +        count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
> +    }
> +    return dest;
> +}
> +
>  static void vmx_vcpu_block(struct vcpu *v)
>  {
>      unsigned long flags;
> -    unsigned int dest;
> +    unsigned int dest, dest_cpu;
>      spinlock_t *old_lock;
> -    spinlock_t *pi_blocking_list_lock =
> -               &per_cpu(vmx_pi_blocking, v->processor).lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    spinlock_t *pi_blocking_list_lock;
> +
> +    /*
> +     * After pCPU goes down, the per-cpu PI blocking list is cleared.
> +     * To make sure the parameter vCPU is added to the chosen pCPU's
> +     * PI blocking list before the list is cleared, just retry when
> +     * finding the pCPU has gone down. Also retry to choose another
> +     * pCPU when finding the list length reachs the limit.
> +     */
> + retry:
> +    dest_cpu = vmx_pi_choose_dest_cpu(v);
> +    pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, dest_cpu).lock;
>
>      spin_lock_irqsave(pi_blocking_list_lock, flags);
> +    if ( unlikely((!cpu_online(dest_cpu)) ||
> +                  (atomic_read(&per_cpu(vmx_pi_blocking, dest_cpu).counter) >=
> +                   PI_LIST_LIMIT)) )
> +    {
> +        spin_unlock_irqrestore(pi_blocking_list_lock, flags);
> +        goto retry;
> +    }

Algorithmically I think this is on the right track. But all these
atomic reads and writes are a mess.  Atomic accesses aren't free; and
the vast majority of the time you're doing things with the
pi_blocking_list_lock anyway.

Why not do something like this at the top of vmx_vcpu_block()
(replacing dest_cpu with pi_cpu for clarity)?

    pi_cpu = v->processor;
retry:
    pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, pi_cpu).lock;
    spin_lock_irqsave(pi_blocking_list_lock, flags);
    /*
     * Since pi_cpu may now be one other than the one v is currently
     * running on, check to make sure that it's still up.
     */
    if ( unlikely((!cpu_online(pi_cpu)) ||
                  pi_over_limit(per_cpu(vmx_pi_blocking, pi_cpu).counter)) )
    {
        pi_cpu = cpumask_cycle(pi_cpu, &cpu_online_map);
        spin_unlock_irqrestore(pi_blocking_list_lock, flags);
        goto retry;
    }

where we define pi_over_limit() like this:

static bool pi_over_limit(int count) {
    /* Compare w/ constant first to save an atomic read in the common case */
    return (count > PI_LIST_FIXED_NUM)
        && (count > (atomic_read(&num_hvm_vcpus) / num_online_cpus()) +
            PI_LIST_FIXED_NUM);
}

That way, in the incredibly common case where count < 128, you simply
grab the lock once and don't do *any* atomic reads, rather than doing
at least four atomic reads in the common case.

[snip]
> @@ -163,6 +224,7 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
>      unsigned long flags;
>      spinlock_t *pi_blocking_list_lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    unsigned int dest = cpu_physical_id(v->processor);
>
>      /*
>       * Set 'NV' field back to posted_intr_vector, so the
> @@ -170,6 +232,8 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
>       * it is running in non-root mode.
>       */
>      write_atomic(&pi_desc->nv, posted_intr_vector);
> +    write_atomic(&pi_desc->ndst,
> +                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));

Just checking -- if an interrupt is raised between these two lines,
what will happen?  Will the interrupt be queued up to be delivered to
the vcpu the next time it does a VMENTRY?

 -George
Chao Gao May 15, 2017, 4:13 p.m. UTC | #4
On Mon, May 15, 2017 at 04:48:47PM +0100, George Dunlap wrote:
>On Thu, May 11, 2017 at 7:04 AM, Chao Gao <chao.gao@intel.com> wrote:
>> Currently, a blocked vCPU is put in its pCPU's pi blocking list. If
>> too many vCPUs are blocked on a given pCPU, it will incur that the list
>> grows too long. After a simple analysis, there are 32k domains and
>> 128 vcpu per domain, thus about 4M vCPUs may be blocked in one pCPU's
>> PI blocking list. When a wakeup interrupt arrives, the list is
>> traversed to find some specific vCPUs to wake them up. This traversal in
>> that case would consume much time.
>>
>> To mitigate this issue, this patch limits the vcpu number on a given
>> pCPU, taking factors such as perfomance of common case, current hvm vcpu
>> count and current pcpu count into consideration. With this method, for
>> the common case, it works fast and for some extreme cases, the list
>> length is under control.
>>
>> The change in vmx_pi_unblock_vcpu() is for the following case:
>> vcpu is running -> try to block (this patch may change NSDT to
>> another pCPU) but notification comes in time, thus the vcpu
>> goes back to running station -> VM-entry (we should set NSDT again,
>> reverting the change we make to NSDT in vmx_vcpu_block())
>>
>> Signed-off-by: Chao Gao <chao.gao@intel.com>
>> ---
>>  xen/arch/x86/hvm/vmx/vmx.c | 78 +++++++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 71 insertions(+), 7 deletions(-)
>>
>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>> index efff6cd..c0d0b58 100644
>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -100,16 +100,70 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>>      spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
>>  }
>>
>> +/*
>> + * Choose an appropriate pcpu to receive wakeup interrupt.
>> + * By default, the local pcpu is chosen as the destination. But if the
>> + * vcpu number of the local pcpu exceeds a limit, another pcpu is chosen.
>> + *
>> + * Currently, choose (v_tot/p_tot) + K as the limit of vcpu, where
>> + * v_tot is the total number of vcpus on the system, p_tot is the total
>> + * number of pcpus in the system, and K is a fixed number. Experments shows
>> + * the maximal time to wakeup a vcpu from a 128-entry blocking list is
>> + * considered acceptable. So choose 128 as the fixed number K.
>> + *
>> + * This policy makes sure:
>> + * 1) for common cases, the limit won't be reached and the local pcpu is used
>> + * which is beneficial to performance (at least, avoid an IPI when unblocking
>> + * vcpu).
>> + * 2) for the worst case, the blocking list length scales with the vcpu count
>> + * divided by the pcpu count.
>> + */
>> +#define PI_LIST_FIXED_NUM 128
>> +#define PI_LIST_LIMIT     (atomic_read(&num_hvm_vcpus) / num_online_cpus() + \
>> +                           PI_LIST_FIXED_NUM)
>> +
>> +static unsigned int vmx_pi_choose_dest_cpu(struct vcpu *v)
>> +{
>> +    int count, limit = PI_LIST_LIMIT;
>> +    unsigned int dest = v->processor;
>> +
>> +    count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
>> +    while ( unlikely(count >= limit) )
>> +    {
>> +        dest = cpumask_cycle(dest, &cpu_online_map);
>> +        count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
>> +    }
>> +    return dest;
>> +}
>> +
>>  static void vmx_vcpu_block(struct vcpu *v)
>>  {
>>      unsigned long flags;
>> -    unsigned int dest;
>> +    unsigned int dest, dest_cpu;
>>      spinlock_t *old_lock;
>> -    spinlock_t *pi_blocking_list_lock =
>> -               &per_cpu(vmx_pi_blocking, v->processor).lock;
>>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
>> +    spinlock_t *pi_blocking_list_lock;
>> +
>> +    /*
>> +     * After pCPU goes down, the per-cpu PI blocking list is cleared.
>> +     * To make sure the parameter vCPU is added to the chosen pCPU's
>> +     * PI blocking list before the list is cleared, just retry when
>> +     * finding the pCPU has gone down. Also retry to choose another
>> +     * pCPU when finding the list length reachs the limit.
>> +     */
>> + retry:
>> +    dest_cpu = vmx_pi_choose_dest_cpu(v);
>> +    pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, dest_cpu).lock;
>>
>>      spin_lock_irqsave(pi_blocking_list_lock, flags);
>> +    if ( unlikely((!cpu_online(dest_cpu)) ||
>> +                  (atomic_read(&per_cpu(vmx_pi_blocking, dest_cpu).counter) >=
>> +                   PI_LIST_LIMIT)) )
>> +    {
>> +        spin_unlock_irqrestore(pi_blocking_list_lock, flags);
>> +        goto retry;
>> +    }
>
>Algorithmically I think this is on the right track. But all these
>atomic reads and writes are a mess.  Atomic accesses aren't free; and
>the vast majority of the time you're doing things with the
>pi_blocking_list_lock anyway.
>
>Why not do something like this at the top of vmx_vcpu_block()
>(replacing dest_cpu with pi_cpu for clarity)?
>
>    pi_cpu = v->processor;
>retry:
>    pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, pi_cpu).lock;
>    spin_lock_irqsave(pi_blocking_list_lock, flags);
>    /*
>     * Since dest_cpu may now be one other than the one v is currently
>     * running on, check to make sure that it's still up.
>     */
>    if ( unlikely((!cpu_online(pi_cpu)) ||
>                  pi_over_limit(per_cpu(vmx_pi_blocking, pi_cpu).counter)) )
>    {
>        pi_cpu = cpumask_cycle(pi_cpu, &cpu_online_map);
>        spin_unlock_irqrestore(pi_blocking_list_lock, flags);
>        goto retry;
>    }
>
>where we define pi_over_limit() like this:
>
>static bool pi_over_limit(int count) {
>    /* Compare w/ constant first to save an atomic read in the common case */
>    return (count > PI_LIST_FIXED_NUM)
>         &&  (count > (atomic_read(&num_hvm_vcpus) / num_online_cpus()) +
>              PI_LIST_FIXED_NUM ) );
>}
>
>That way, in the incredibly common case where count < 128, you simply
>grab the lock once and don't to *any* atomic reads, rather than doing
>at least four atomic reads in the common case.

Agreed. We should make things fast where possible. Could I add you as
Suggested-by? Thanks for your help.

>
>[snip]
>> @@ -163,6 +224,7 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
>>      unsigned long flags;
>>      spinlock_t *pi_blocking_list_lock;
>>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
>> +    unsigned int dest = cpu_physical_id(v->processor);
>>
>>      /*
>>       * Set 'NV' field back to posted_intr_vector, so the
>> @@ -170,6 +232,8 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
>>       * it is running in non-root mode.
>>       */
>>      write_atomic(&pi_desc->nv, posted_intr_vector);
>> +    write_atomic(&pi_desc->ndst,
>> +                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
>
>Just checking -- if an interrupt is raised between these two lines,
>what will happen?  Will the interrupt be queued up to be delivered to
>the vcpu the next time it does a VMENTRY?

I think an interrupt raised between the two lines results in a
spurious interrupt on the pCPU that NDST currently points to. The
interrupt will still be injected at this VM-entry, since we sync PIR
to vIRR before entering the guest.
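
To spell that out, a simplified model (toy code, not the actual Xen
structures or handlers): the sender records the vector in PIR before
notifying whatever NDST currently holds, so a notification racing with
the NDST update at worst shows up as a spurious vector on the old pCPU,
and the pending bit is picked up by the PIR-to-vIRR sync on VM entry.

/* Toy model of the posted-interrupt descriptor fields involved. */
struct pi_desc_model {
    unsigned long pir[4];    /* 256 posted-interrupt request bits */
    unsigned int  nv;        /* notification vector */
    unsigned int  ndst;      /* notification destination (APIC ID) */
};

/* Sender: set the request bit first, then notify whatever NDST holds. */
static void post_interrupt(struct pi_desc_model *pi, unsigned int vec)
{
    __atomic_fetch_or(&pi->pir[vec / 64], 1UL << (vec % 64),
                      __ATOMIC_SEQ_CST);
    /*
     * send_ipi(pi->ndst, pi->nv) would go here; if ndst is stale, that
     * pCPU only sees a spurious vector -- the PIR bit above is not lost.
     */
}

/* Receiver, on VM entry: fold PIR into the virtual IRR and clear it. */
static void sync_pir_to_virr(struct pi_desc_model *pi, unsigned long *virr)
{
    for ( unsigned int i = 0; i < 4; i++ )
        virr[i] |= __atomic_exchange_n(&pi->pir[i], 0, __ATOMIC_SEQ_CST);
}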

Thanks
Chao

Patch

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index efff6cd..c0d0b58 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -100,16 +100,70 @@  void vmx_pi_per_cpu_init(unsigned int cpu)
     spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
 }
 
+/*
+ * Choose an appropriate pcpu to receive wakeup interrupt.
+ * By default, the local pcpu is chosen as the destination. But if the
+ * vcpu number of the local pcpu exceeds a limit, another pcpu is chosen.
+ *
+ * Currently, choose (v_tot/p_tot) + K as the limit of vcpu, where
+ * v_tot is the total number of vcpus on the system, p_tot is the total
+ * number of pcpus in the system, and K is a fixed number. Experiments show
+ * the maximal time to wake up a vcpu from a 128-entry blocking list is
+ * considered acceptable. So choose 128 as the fixed number K.
+ *
+ * This policy makes sure:
+ * 1) for common cases, the limit won't be reached and the local pcpu is used
+ * which is beneficial to performance (at least, avoid an IPI when unblocking
+ * vcpu).
+ * 2) for the worst case, the blocking list length scales with the vcpu count
+ * divided by the pcpu count.
+ */
+#define PI_LIST_FIXED_NUM 128
+#define PI_LIST_LIMIT     (atomic_read(&num_hvm_vcpus) / num_online_cpus() + \
+                           PI_LIST_FIXED_NUM)
+
+static unsigned int vmx_pi_choose_dest_cpu(struct vcpu *v)
+{
+    int count, limit = PI_LIST_LIMIT;
+    unsigned int dest = v->processor;
+
+    count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
+    while ( unlikely(count >= limit) )
+    {
+        dest = cpumask_cycle(dest, &cpu_online_map);
+        count = atomic_read(&per_cpu(vmx_pi_blocking, dest).counter);
+    }
+    return dest;
+}
+
 static void vmx_vcpu_block(struct vcpu *v)
 {
     unsigned long flags;
-    unsigned int dest;
+    unsigned int dest, dest_cpu;
     spinlock_t *old_lock;
-    spinlock_t *pi_blocking_list_lock =
-		&per_cpu(vmx_pi_blocking, v->processor).lock;
     struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+    spinlock_t *pi_blocking_list_lock;
+
+    /*
+     * After pCPU goes down, the per-cpu PI blocking list is cleared.
+     * To make sure the parameter vCPU is added to the chosen pCPU's
+     * PI blocking list before the list is cleared, just retry when
+     * finding the pCPU has gone down. Also retry to choose another
+     * pCPU when finding the list length reaches the limit.
+     */
+ retry:
+    dest_cpu = vmx_pi_choose_dest_cpu(v);
+    pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, dest_cpu).lock;
 
     spin_lock_irqsave(pi_blocking_list_lock, flags);
+    if ( unlikely((!cpu_online(dest_cpu)) ||
+                  (atomic_read(&per_cpu(vmx_pi_blocking, dest_cpu).counter) >=
+                   PI_LIST_LIMIT)) )
+    {
+        spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+        goto retry;
+    }
+
     old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
                        pi_blocking_list_lock);
 
@@ -120,11 +174,11 @@  static void vmx_vcpu_block(struct vcpu *v)
      */
     ASSERT(old_lock == NULL);
 
-    atomic_inc(&per_cpu(vmx_pi_blocking, v->processor).counter);
-    HVMTRACE_4D(PI_LIST_ADD, v->domain->domain_id, v->vcpu_id, v->processor,
-                atomic_read(&per_cpu(vmx_pi_blocking, v->processor).counter));
+    atomic_inc(&per_cpu(vmx_pi_blocking, dest_cpu).counter);
+    HVMTRACE_4D(PI_LIST_ADD, v->domain->domain_id, v->vcpu_id, dest_cpu,
+                atomic_read(&per_cpu(vmx_pi_blocking, dest_cpu).counter));
     list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
-                  &per_cpu(vmx_pi_blocking, v->processor).list);
+                  &per_cpu(vmx_pi_blocking, dest_cpu).list);
     spin_unlock_irqrestore(pi_blocking_list_lock, flags);
 
     ASSERT(!pi_test_sn(pi_desc));
@@ -134,6 +188,13 @@  static void vmx_vcpu_block(struct vcpu *v)
     ASSERT(pi_desc->ndst ==
            (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
 
+    if ( unlikely(dest_cpu != v->processor) )
+    {
+        dest = cpu_physical_id(dest_cpu);
+        write_atomic(&pi_desc->ndst,
+                 (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
+    }
+
     write_atomic(&pi_desc->nv, pi_wakeup_vector);
 }
 
@@ -163,6 +224,7 @@  static void vmx_pi_unblock_vcpu(struct vcpu *v)
     unsigned long flags;
     spinlock_t *pi_blocking_list_lock;
     struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+    unsigned int dest = cpu_physical_id(v->processor);
 
     /*
      * Set 'NV' field back to posted_intr_vector, so the
@@ -170,6 +232,8 @@  static void vmx_pi_unblock_vcpu(struct vcpu *v)
      * it is running in non-root mode.
      */
     write_atomic(&pi_desc->nv, posted_intr_vector);
+    write_atomic(&pi_desc->ndst,
+                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
 
     pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;