
[v7,2/4] KVM: X86: Add Paravirt TLB Shootdown

Message ID 1512021674-9880-3-git-send-email-wanpeng.li@hotmail.com (mailing list archive)
State New, archived

Commit Message

Wanpeng Li Nov. 30, 2017, 6:01 a.m. UTC
From: Wanpeng Li <wanpeng.li@hotmail.com>

The remote TLB flush APIs do a busy wait, which is fine on bare
metal. Within a guest, however, the target vCPUs might have been
preempted or blocked, and the initiator vCPU then ends up
busy-waiting for a long time.

This patch set implements paravirtual TLB flushing, making sure the
initiator does not wait for vCPUs that are not running; those vCPUs
flush the TLB on their next guest entry instead.

The best result is achieved when the host is overcommitted, running
multiple vCPUs on each pCPU. In this case PV TLB flush avoids
touching vCPUs which are not scheduled and avoids the wait on the
initiating vCPU.

Tested on a two-socket Xeon Gold 6142 at 2.6 GHz (32 cores, 64
threads, so 64 pCPUs), with 64 vCPUs per VM.

ebizzy -M (records/s; higher is better)
              vanilla    optimized     boost
1VM            46799       48670         4%
2VM            23962       42691        78%
3VM            16152       37539       132%

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
---
 Documentation/virtual/kvm/cpuid.txt  |  4 +++
 arch/x86/include/uapi/asm/kvm_para.h |  2 ++
 arch/x86/kernel/kvm.c                | 47 ++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
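
For context on the other half of the mechanism: the guest-side hook in
this patch only queues KVM_VCPU_SHOULD_FLUSH for preempted vCPUs; the
host patches of this series consume that bit when the vCPU is scheduled
back in. A minimal sketch of that host-side step, with illustrative
names (kvm_consume_steal_flags() and kvm_vcpu_flush_tlb() are assumed
helpers, not shown in this patch):

/*
 * Host-side sketch (KVM, not part of this guest patch): on the next
 * entry of a vCPU, atomically consume the steal-time flags set while
 * it was scheduled out.  If a remote vCPU queued a flush via
 * KVM_VCPU_SHOULD_FLUSH, invalidate the guest TLB now, instead of the
 * initiator having busy-waited for this vCPU to answer an IPI.
 */
static void kvm_consume_steal_flags(struct kvm_vcpu *vcpu,
				    struct kvm_steal_time *st)
{
	if (xchg(&st->preempted, KVM_VCPU_NOT_PREEMPTED) &
	    KVM_VCPU_SHOULD_FLUSH)
		kvm_vcpu_flush_tlb(vcpu);	/* assumed helper */
}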

Comments

Wanpeng Li Nov. 30, 2017, 7:53 a.m. UTC | #1
2017-11-30 14:01 GMT+08:00 Wanpeng Li <kernellwp@gmail.com>:
> [...]
> +static __init int kvm_setup_pv_tlb_flush(void)
> +{
> +       int cpu;
> +
> +       if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) {
> +               for_each_possible_cpu(cpu) {
> +                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
> +                               GFP_KERNEL, cpu_to_node(cpu));
> +               }
> +               pr_info("KVM setup remote TLB flush\n");

Please change it to "KVM setup pv remote TLB flush\n" if this is the
last version before applying. :)

Regards,
Wanpeng Li

Radim Krčmář Nov. 30, 2017, 3:17 p.m. UTC | #2
2017-11-29 22:01-0800, Wanpeng Li:
> From: Wanpeng Li <wanpeng.li@hotmail.com>
> ---
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> @@ -498,6 +498,34 @@ static void __init kvm_apf_trap_init(void)
>  	update_intr_gate(X86_TRAP_PF, async_page_fault);
>  }
>  
> +static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
> +
> +static void kvm_flush_tlb_others(const struct cpumask *cpumask,
> +			const struct flush_tlb_info *info)
> +{
> +	u8 state;
> +	int cpu;
> +	struct kvm_steal_time *src;
> +	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
> +
> +	cpumask_copy(flushmask, cpumask);

Is it impossible to call this function before the allocation?

I was guessing that early_initcall might allow us to avoid a (static)
condition as there is no point in calling when there are no others, but
expected the worst ...

thanks.
Wanpeng Li Dec. 1, 2017, 12:55 a.m. UTC | #3
2017-11-30 23:17 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> 2017-11-29 22:01-0800, Wanpeng Li:
>> From: Wanpeng Li <wanpeng.li@hotmail.com>
>> ---
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> @@ -498,6 +498,34 @@ static void __init kvm_apf_trap_init(void)
>>       update_intr_gate(X86_TRAP_PF, async_page_fault);
>>  }
>>
>> +static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
>> +
>> +static void kvm_flush_tlb_others(const struct cpumask *cpumask,
>> +                     const struct flush_tlb_info *info)
>> +{
>> +     u8 state;
>> +     int cpu;
>> +     struct kvm_steal_time *src;
>> +     struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
>> +
>> +     cpumask_copy(flushmask, cpumask);
>
> Is it impossible to call this function before the allocation?
>
> I was guessing that early_initcall might allow us to avoid a (static)
> condition as there is no point in calling when there are no others, but
> expected the worst ...

kernel_init()
  -> kernel_init_freeable()
        -> do_basic_setup()
            -> do_initcalls()
  -> async_synchronize_full()  => finish all async __init code
  -> try_to_run_init_process()

All the async __init code is guaranteed to complete before the init
process runs, so I think arch_initcall() is fine.

Regards,
Wanpeng Li
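
A note on the question above: if a flush could be issued before
kvm_setup_pv_tlb_flush() has run, a defensive variant could fall back
to the native IPI path while the per-CPU mask is still unallocated. A
hypothetical sketch, not part of this series:

/*
 * Hypothetical defensive variant: with CONFIG_CPUMASK_OFFSTACK, the
 * per-CPU mask is a NULL pointer until kvm_setup_pv_tlb_flush() has
 * allocated it, so a flush issued in that window takes the native path.
 */
static void kvm_flush_tlb_others_checked(const struct cpumask *cpumask,
					 const struct flush_tlb_info *info)
{
	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);

	if (unlikely(!flushmask)) {
		/* Too early: mask not allocated yet; flush by IPI. */
		native_flush_tlb_others(cpumask, info);
		return;
	}

	/* Otherwise proceed exactly as kvm_flush_tlb_others() does. */
	kvm_flush_tlb_others(cpumask, info);
}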

Patch

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 3c65feb..dcab6dc 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -54,6 +54,10 @@  KVM_FEATURE_PV_UNHALT              ||     7 || guest checks this feature bit
                                    ||       || before enabling paravirtualized
                                    ||       || spinlock support.
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_TLB_FLUSH           ||     9 || guest checks this feature bit
+                                   ||       || before enabling paravirtualized
+                                   ||       || tlb flush.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 763b692..8fbcc16 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -25,6 +25,7 @@ 
 #define KVM_FEATURE_STEAL_TIME		5
 #define KVM_FEATURE_PV_EOI		6
 #define KVM_FEATURE_PV_UNHALT		7
+#define KVM_FEATURE_PV_TLB_FLUSH	9
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -53,6 +54,7 @@  struct kvm_steal_time {
 
 #define KVM_VCPU_NOT_PREEMPTED      (0 << 0)
 #define KVM_VCPU_PREEMPTED          (1 << 0)
+#define KVM_VCPU_SHOULD_FLUSH       (1 << 1)
 
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6610b92..64fb9a4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -498,6 +498,34 @@  static void __init kvm_apf_trap_init(void)
 	update_intr_gate(X86_TRAP_PF, async_page_fault);
 }
 
+static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+			const struct flush_tlb_info *info)
+{
+	u8 state;
+	int cpu;
+	struct kvm_steal_time *src;
+	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+
+	cpumask_copy(flushmask, cpumask);
+	/*
+	 * Flush only vCPUs that are currently running; for a preempted
+	 * vCPU, queue a flush on its next guest entry instead of waiting.
+	 */
+	for_each_cpu(cpu, flushmask) {
+		src = &per_cpu(steal_time, cpu);
+		state = READ_ONCE(src->preempted);
+		if ((state & KVM_VCPU_PREEMPTED)) {
+			if (try_cmpxchg(&src->preempted, &state,
+				state | KVM_VCPU_SHOULD_FLUSH))
+				__cpumask_clear_cpu(cpu, flushmask);
+		}
+	}
+
+	native_flush_tlb_others(flushmask, info);
+}
+
 static void __init kvm_guest_init(void)
 {
 	int i;
@@ -517,6 +545,9 @@  static void __init kvm_guest_init(void)
 		pv_time_ops.steal_clock = kvm_steal_clock;
 	}
 
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH))
+		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
@@ -598,6 +629,22 @@  static __init int activate_jump_labels(void)
 }
 arch_initcall(activate_jump_labels);
 
+static __init int kvm_setup_pv_tlb_flush(void)
+{
+	int cpu;
+
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) {
+		for_each_possible_cpu(cpu) {
+			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+				GFP_KERNEL, cpu_to_node(cpu));
+		}
+		pr_info("KVM setup remote TLB flush\n");
+	}
+
+	return 0;
+}
+arch_initcall(kvm_setup_pv_tlb_flush);
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
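
As a usage note, a guest can verify that the hypervisor advertises the
new bit by querying the KVM paravirt CPUID leaf directly. An
illustrative userspace check (a robust program would first confirm the
"KVMKVMKVM\0\0\0" hypervisor signature at CPUID leaf 0x40000000):

/*
 * Build with gcc and run inside a KVM guest; prints whether bit 9 of
 * CPUID.0x40000001:EAX (KVM_FEATURE_PV_TLB_FLUSH) is set.
 */
#include <stdio.h>
#include <cpuid.h>

#define KVM_CPUID_FEATURES		0x40000001
#define KVM_FEATURE_PV_TLB_FLUSH	9	/* matches kvm_para.h above */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* EAX of leaf 0x40000001 carries the KVM feature bits. */
	__cpuid(KVM_CPUID_FEATURES, eax, ebx, ecx, edx);
	printf("KVM_FEATURE_PV_TLB_FLUSH: %s\n",
	       (eax & (1u << KVM_FEATURE_PV_TLB_FLUSH)) ? "yes" : "no");
	return 0;
}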