diff mbox series

x86: optimize loading of GDT at context switch

Message ID 20190722132244.893-1-jgross@suse.com (mailing list archive)
State New, archived
Headers show
Series x86: optimize loading of GDT at context switch | expand

Commit Message

Jürgen Groß July 22, 2019, 1:22 p.m. UTC
Instead of dynamically deciding whether the previous vcpu was using the
full or the default GDT, just add a percpu variable for that purpose. This
also removes the need to test twice whether the vcpu_ids differ.

This change improves performance by 0.5% - 1% on my test machine when
doing parallel compilation.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
This patch was originally part of my core scheduling series, but it
neither depends on any patch of that series, nor does any patch of the
series depend on this one, so I'm sending it on its own.

I have dropped the latching of the need_full_gdt(nd) value in a local
variable, as it turned out to have a very slight negative impact on performance.
---
 xen/arch/x86/cpu/common.c  |  3 +++
 xen/arch/x86/domain.c      | 14 ++++++++------
 xen/include/asm-x86/desc.h |  1 +
 3 files changed, 12 insertions(+), 6 deletions(-)

Comments

Jan Beulich July 25, 2019, 10:29 a.m. UTC | #1
On 22.07.2019 15:22, Juergen Gross wrote:
> @@ -756,6 +758,7 @@ void load_system_tables(void)
>   		offsetof(struct tss_struct, __cacheline_filler) - 1,
>   		SYS_DESC_tss_busy);
>   
> +        per_cpu(full_gdt_loaded, cpu) = false;
>   	lgdt(&gdtr);
>   	lidt(&idtr);
>   	ltr(TSS_ENTRY << 3);

As per the surrounding code there should be a hard tab used for
indentation here.

> @@ -1739,8 +1743,7 @@ static void __context_switch(void)
>   
>       if ( need_full_gdt(nd) )
>           update_xen_slot_in_full_gdt(n, cpu);
> -
> -    if ( need_full_gdt(pd) &&
> +    if ( per_cpu(full_gdt_loaded, cpu) &&
>            ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
>           load_default_gdt(cpu);

I think it wouldn't be bad if the blank line was kept.

If I end up committing this, I'll try to remember to do both
adjustments, unless you object for some reason.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 1db96d959c..b0bb9292fd 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -49,6 +49,8 @@  unsigned int vaddr_bits __read_mostly = VADDR_BITS;
 static unsigned int cleared_caps[NCAPINTS];
 static unsigned int forced_caps[NCAPINTS];
 
+DEFINE_PER_CPU(bool, full_gdt_loaded);
+
 void __init setup_clear_cpu_cap(unsigned int cap)
 {
 	const uint32_t *dfs;
@@ -756,6 +758,7 @@  void load_system_tables(void)
 		offsetof(struct tss_struct, __cacheline_filler) - 1,
 		SYS_DESC_tss_busy);
 
+        per_cpu(full_gdt_loaded, cpu) = false;
 	lgdt(&gdtr);
 	lidt(&idtr);
 	ltr(TSS_ENTRY << 3);
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index ea55160887..353a6e24fb 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1670,7 +1670,7 @@  static void update_xen_slot_in_full_gdt(const struct vcpu *v, unsigned int cpu)
                                    : per_cpu(compat_gdt_table_l1e, cpu));
 }
 
-static void load_full_gdt(const struct vcpu *v)
+static void load_full_gdt(const struct vcpu *v, unsigned int cpu)
 {
     struct desc_ptr gdt_desc = {
         .limit = LAST_RESERVED_GDT_BYTE,
@@ -1678,6 +1678,8 @@  static void load_full_gdt(const struct vcpu *v)
     };
 
     lgdt(&gdt_desc);
+
+    per_cpu(full_gdt_loaded, cpu) = true;
 }
 
 static void load_default_gdt(unsigned int cpu)
@@ -1689,6 +1691,8 @@  static void load_default_gdt(unsigned int cpu)
     };
 
     lgdt(&gdt_desc);
+
+    per_cpu(full_gdt_loaded, cpu) = false;
 }
 
 static void __context_switch(void)
@@ -1739,8 +1743,7 @@  static void __context_switch(void)
 
     if ( need_full_gdt(nd) )
         update_xen_slot_in_full_gdt(n, cpu);
-
-    if ( need_full_gdt(pd) &&
+    if ( per_cpu(full_gdt_loaded, cpu) &&
          ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
         load_default_gdt(cpu);
 
@@ -1753,9 +1756,8 @@  static void __context_switch(void)
         svm_load_segs(0, 0, 0, 0, 0, 0, 0);
 #endif
 
-    if ( need_full_gdt(nd) &&
-         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
-        load_full_gdt(n);
+    if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) )
+        load_full_gdt(n, cpu);
 
     if ( pd != nd )
         cpumask_clear_cpu(cpu, pd->dirty_cpumask);
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index e565727dc0..c011c03ae2 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -210,6 +210,7 @@  DECLARE_PER_CPU(l1_pgentry_t, gdt_table_l1e);
 extern seg_desc_t boot_cpu_compat_gdt_table[];
 DECLARE_PER_CPU(seg_desc_t *, compat_gdt_table);
 DECLARE_PER_CPU(l1_pgentry_t, compat_gdt_table_l1e);
+DECLARE_PER_CPU(bool, full_gdt_loaded);
 
 extern void load_TR(void);