diff mbox

mwait-idle: prevent SKL-H boot failure when C8+C9+C10 enabled

Message ID 56FBAB9D02000078000E12C2@prv-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Beulich March 30, 2016, 8:34 a.m. UTC
Some SKL-H configurations require "max_cstate=7" to boot.
While that is an effective workaround, it disables C10.

This patch detects the problematic configuration,
and disables C8 and C9, keeping C10 enabled.

Note that enabling SGX in BIOS SETUP can also prevent this issue,
if the system BIOS provides that option.

https://bugzilla.kernel.org/show_bug.cgi?id=109081 
"Freezes with Intel i7 6700HQ (Skylake), unless intel_idle.max_cstate=7"

Signed-off-by: Len Brown <len.brown@intel.com>

Adjust to Xen infrastructure.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
mwait-idle: prevent SKL-H boot failure when C8+C9+C10 enabled

Some SKL-H configurations require "max_cstate=7" to boot.
While that is an effective workaround, it disables C10.

This patch detects the problematic configuration,
and disables C8 and C9, keeping C10 enabled.

Note that enabling SGX in BIOS SETUP can also prevent this issue,
if the system BIOS provides that option.

https://bugzilla.kernel.org/show_bug.cgi?id=109081
"Freezes with Intel i7 6700HQ (Skylake), unless intel_idle.max_cstate=7"

Signed-off-by: Len Brown <len.brown@intel.com>

Adjust to Xen infrastructure.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -60,7 +60,7 @@
 #include <asm/msr.h>
 #include <acpi/cpufreq/cpufreq.h>
 
-#define MWAIT_IDLE_VERSION "0.4"
+#define MWAIT_IDLE_VERSION "0.4.1"
 #undef PREFIX
 #define PREFIX "mwait-idle: "
 
@@ -100,6 +100,7 @@ static const struct cpuidle_state {
 	unsigned int	target_residency; /* in US */
 } *cpuidle_state_table;
 
+#define CPUIDLE_FLAG_DISABLED		0x1
 /*
  * Set this flag for states where the HW flushes the TLB for us
  * and so we don't need cross-calls to keep it consistent.
@@ -477,7 +478,7 @@ static const struct cpuidle_state bdw_cs
 	{}
 };
 
-static const struct cpuidle_state skl_cstates[] = {
+static struct cpuidle_state skl_cstates[] = {
 	{
 		.name = "C1-SKL",
 		.flags = MWAIT2flg(0x00),
@@ -781,34 +782,84 @@ static const struct x86_cpu_id intel_idl
 };
 
 /*
- * mwait_idle_state_table_update()
- *
- * Update the default state_table for this CPU-id
+ * ivt_idle_state_table_update(void)
  *
- * Currently used to access tuned IVT multi-socket targets
+ * Tune IVT multi-socket targets
  * Assumption: num_sockets == (max_package_num + 1)
  */
-static void __init mwait_idle_state_table_update(void)
+static void __init ivt_idle_state_table_update(void)
 {
 	/* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
-	if (boot_cpu_data.x86_model == 0x3e) { /* IVT */
-		unsigned int cpu, max_apicid = boot_cpu_physical_apicid;
+	unsigned int cpu, max_apicid = boot_cpu_physical_apicid;
 
-		for_each_present_cpu(cpu)
-			if (max_apicid < x86_cpu_to_apicid[cpu])
-				max_apicid = x86_cpu_to_apicid[cpu];
-		switch (apicid_to_socket(max_apicid)) {
-		case 0: case 1:
-			/* 1 and 2 socket systems use default ivt_cstates */
-			break;
-		case 2: case 3:
-			cpuidle_state_table = ivt_cstates_4s;
-			break;
-		default:
-			cpuidle_state_table = ivt_cstates_8s;
-			break;
-		}
+	for_each_present_cpu(cpu)
+		if (max_apicid < x86_cpu_to_apicid[cpu])
+			max_apicid = x86_cpu_to_apicid[cpu];
+	switch (apicid_to_socket(max_apicid)) {
+	case 0: case 1:
+		/* 1 and 2 socket systems use default ivt_cstates */
+		break;
+	case 2: case 3:
+		cpuidle_state_table = ivt_cstates_4s;
+		break;
+	default:
+		cpuidle_state_table = ivt_cstates_8s;
+		break;
+	}
+}
+
+/*
+ * sklh_idle_state_table_update(void)
+ *
+ * On SKL-H (model 0x5e) disable C8 and C9 if:
+ * C10 is enabled and SGX disabled
+ */
+static void sklh_idle_state_table_update(void)
+{
+	u64 msr;
+
+	/* if PC10 disabled via cmdline max_cstate=7 or shallower */
+	if (max_cstate <= 7)
+		return;
+
+	/* if PC10 not present in CPUID.MWAIT.EDX */
+	if ((mwait_substates & (MWAIT_CSTATE_MASK << 28)) == 0)
+		return;
+
+	rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr);
+
+	/* PC10 is not enabled in PKG C-state limit */
+	if ((msr & 0xF) != 8)
+		return;
+
+	/* if SGX is present */
+	if (boot_cpu_has(X86_FEATURE_SGX)) {
+		rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+
+		/* if SGX is enabled */
+		if (msr & (1 << 18))
+			return;
 	}
+
+	skl_cstates[5].flags |= CPUIDLE_FLAG_DISABLED;	/* C8-SKL */
+	skl_cstates[6].flags |= CPUIDLE_FLAG_DISABLED;	/* C9-SKL */
+}
+
+/*
+ * mwait_idle_state_table_update()
+ *
+ * Update the default state_table for this CPU-id
+ */
+static void __init mwait_idle_state_table_update(void)
+{
+	switch (boot_cpu_data.x86_model) {
+	case 0x3e: /* IVT */
+		ivt_idle_state_table_update();
+		break;
+	case 0x5e: /* SKL-H */
+		sklh_idle_state_table_update();
+		break;
+ 	}
 }
 
 static int __init mwait_idle_probe(void)
@@ -897,6 +948,14 @@ static int mwait_idle_cpu_init(struct no
 		if (num_substates == 0)
 			continue;
 
+		/* if state marked as disabled, skip it */
+		if (cpuidle_state_table[cstate].flags &
+		    CPUIDLE_FLAG_DISABLED) {
+			printk(XENLOG_DEBUG PREFIX "state %s is disabled",
+			       cpuidle_state_table[cstate].name);
+			continue;
+		}
+
 		if (dev->count >= ACPI_PROCESSOR_MAX_POWER) {
 			printk(PREFIX "max C-state count of %u reached\n",
 			       ACPI_PROCESSOR_MAX_POWER);
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -288,6 +288,7 @@
 #define MSR_IA32_PLATFORM_ID		0x00000017
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_IA32_EBC_FREQUENCY_ID	0x0000002c
+#define MSR_IA32_FEATURE_CONTROL	0x0000003a
 #define MSR_IA32_TSC_ADJUST		0x0000003b
 
 #define MSR_IA32_APICBASE		0x0000001b
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -186,6 +186,7 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0.ebx, word 5 */
 XEN_CPUFEATURE(FSGSBASE,      5*32+ 0) /*   {RD,WR}{FS,GS}BASE instructions */
 XEN_CPUFEATURE(TSC_ADJUST,    5*32+ 1) /*   TSC_ADJUST MSR available */
+XEN_CPUFEATURE(SGX,           5*32+ 2) /*   Software Guard extensions */
 XEN_CPUFEATURE(BMI1,          5*32+ 3) /*   1st bit manipulation extensions */
 XEN_CPUFEATURE(HLE,           5*32+ 4) /*   Hardware Lock Elision */
 XEN_CPUFEATURE(AVX2,          5*32+ 5) /*   AVX2 instructions */

Comments

Jan Beulich March 30, 2016, 8:40 a.m. UTC | #1
>>> On 30.03.16 at 10:34, <JBeulich@suse.com> wrote:
> Some SKL-H configurations require "max_cstate=7" to boot.
> While that is an effective workaround, it disables C10.
> 
> This patch detects the problematic configuration,
> and disables C8 and C9, keeping C10 enabled.
> 
> Note that enabling SGX in BIOS SETUP can also prevent this issue,
> if the system BIOS provides that option.
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=109081 
> "Freezes with Intel i7 6700HQ (Skylake), unless intel_idle.max_cstate=7"
> 
> Signed-off-by: Len Brown <len.brown@intel.com>

[Linux commit: d70e28f57e14a481977436695b0c9ba165472431]

> Adjust to Xen infrastructure.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/cpu/mwait-idle.c
> +++ b/xen/arch/x86/cpu/mwait-idle.c
> @@ -60,7 +60,7 @@
>  #include <asm/msr.h>
>  #include <acpi/cpufreq/cpufreq.h>
>  
> -#define MWAIT_IDLE_VERSION "0.4"
> +#define MWAIT_IDLE_VERSION "0.4.1"
>  #undef PREFIX
>  #define PREFIX "mwait-idle: "
>  
> @@ -100,6 +100,7 @@ static const struct cpuidle_state {
>  	unsigned int	target_residency; /* in US */
>  } *cpuidle_state_table;
>  
> +#define CPUIDLE_FLAG_DISABLED		0x1
>  /*
>   * Set this flag for states where the HW flushes the TLB for us
>   * and so we don't need cross-calls to keep it consistent.
> @@ -477,7 +478,7 @@ static const struct cpuidle_state bdw_cs
>  	{}
>  };
>  
> -static const struct cpuidle_state skl_cstates[] = {
> +static struct cpuidle_state skl_cstates[] = {
>  	{
>  		.name = "C1-SKL",
>  		.flags = MWAIT2flg(0x00),
> @@ -781,34 +782,84 @@ static const struct x86_cpu_id intel_idl
>  };
>  
>  /*
> - * mwait_idle_state_table_update()
> - *
> - * Update the default state_table for this CPU-id
> + * ivt_idle_state_table_update(void)
>   *
> - * Currently used to access tuned IVT multi-socket targets
> + * Tune IVT multi-socket targets
>   * Assumption: num_sockets == (max_package_num + 1)
>   */
> -static void __init mwait_idle_state_table_update(void)
> +static void __init ivt_idle_state_table_update(void)
>  {
>  	/* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
> -	if (boot_cpu_data.x86_model == 0x3e) { /* IVT */
> -		unsigned int cpu, max_apicid = boot_cpu_physical_apicid;
> +	unsigned int cpu, max_apicid = boot_cpu_physical_apicid;
>  
> -		for_each_present_cpu(cpu)
> -			if (max_apicid < x86_cpu_to_apicid[cpu])
> -				max_apicid = x86_cpu_to_apicid[cpu];
> -		switch (apicid_to_socket(max_apicid)) {
> -		case 0: case 1:
> -			/* 1 and 2 socket systems use default ivt_cstates */
> -			break;
> -		case 2: case 3:
> -			cpuidle_state_table = ivt_cstates_4s;
> -			break;
> -		default:
> -			cpuidle_state_table = ivt_cstates_8s;
> -			break;
> -		}
> +	for_each_present_cpu(cpu)
> +		if (max_apicid < x86_cpu_to_apicid[cpu])
> +			max_apicid = x86_cpu_to_apicid[cpu];
> +	switch (apicid_to_socket(max_apicid)) {
> +	case 0: case 1:
> +		/* 1 and 2 socket systems use default ivt_cstates */
> +		break;
> +	case 2: case 3:
> +		cpuidle_state_table = ivt_cstates_4s;
> +		break;
> +	default:
> +		cpuidle_state_table = ivt_cstates_8s;
> +		break;
> +	}
> +}
> +
> +/*
> + * sklh_idle_state_table_update(void)
> + *
> + * On SKL-H (model 0x5e) disable C8 and C9 if:
> + * C10 is enabled and SGX disabled
> + */
> +static void sklh_idle_state_table_update(void)
> +{
> +	u64 msr;
> +
> +	/* if PC10 disabled via cmdline max_cstate=7 or shallower */
> +	if (max_cstate <= 7)
> +		return;
> +
> +	/* if PC10 not present in CPUID.MWAIT.EDX */
> +	if ((mwait_substates & (MWAIT_CSTATE_MASK << 28)) == 0)
> +		return;
> +
> +	rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr);
> +
> +	/* PC10 is not enabled in PKG C-state limit */
> +	if ((msr & 0xF) != 8)
> +		return;
> +
> +	/* if SGX is present */
> +	if (boot_cpu_has(X86_FEATURE_SGX)) {
> +		rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
> +
> +		/* if SGX is enabled */
> +		if (msr & (1 << 18))
> +			return;
>  	}
> +
> +	skl_cstates[5].flags |= CPUIDLE_FLAG_DISABLED;	/* C8-SKL */
> +	skl_cstates[6].flags |= CPUIDLE_FLAG_DISABLED;	/* C9-SKL */
> +}
> +
> +/*
> + * mwait_idle_state_table_update()
> + *
> + * Update the default state_table for this CPU-id
> + */
> +static void __init mwait_idle_state_table_update(void)
> +{
> +	switch (boot_cpu_data.x86_model) {
> +	case 0x3e: /* IVT */
> +		ivt_idle_state_table_update();
> +		break;
> +	case 0x5e: /* SKL-H */
> +		sklh_idle_state_table_update();
> +		break;
> + 	}
>  }
>  
>  static int __init mwait_idle_probe(void)
> @@ -897,6 +948,14 @@ static int mwait_idle_cpu_init(struct no
>  		if (num_substates == 0)
>  			continue;
>  
> +		/* if state marked as disabled, skip it */
> +		if (cpuidle_state_table[cstate].flags &
> +		    CPUIDLE_FLAG_DISABLED) {
> +			printk(XENLOG_DEBUG PREFIX "state %s is disabled",
> +			       cpuidle_state_table[cstate].name);
> +			continue;
> +		}
> +
>  		if (dev->count >= ACPI_PROCESSOR_MAX_POWER) {
>  			printk(PREFIX "max C-state count of %u reached\n",
>  			       ACPI_PROCESSOR_MAX_POWER);
> --- a/xen/include/asm-x86/msr-index.h
> +++ b/xen/include/asm-x86/msr-index.h
> @@ -288,6 +288,7 @@
>  #define MSR_IA32_PLATFORM_ID		0x00000017
>  #define MSR_IA32_EBL_CR_POWERON		0x0000002a
>  #define MSR_IA32_EBC_FREQUENCY_ID	0x0000002c
> +#define MSR_IA32_FEATURE_CONTROL	0x0000003a
>  #define MSR_IA32_TSC_ADJUST		0x0000003b
>  
>  #define MSR_IA32_APICBASE		0x0000001b
> --- a/xen/include/public/arch-x86/cpufeatureset.h
> +++ b/xen/include/public/arch-x86/cpufeatureset.h
> @@ -186,6 +186,7 @@
>  /* Intel-defined CPU features, CPUID level 0x00000007:0.ebx, word 5 */
>  XEN_CPUFEATURE(FSGSBASE,      5*32+ 0) /*   {RD,WR}{FS,GS}BASE instructions 
> */
>  XEN_CPUFEATURE(TSC_ADJUST,    5*32+ 1) /*   TSC_ADJUST MSR available */
> +XEN_CPUFEATURE(SGX,           5*32+ 2) /*   Software Guard extensions */
>  XEN_CPUFEATURE(BMI1,          5*32+ 3) /*   1st bit manipulation extensions 
> */
>  XEN_CPUFEATURE(HLE,           5*32+ 4) /*   Hardware Lock Elision */
>  XEN_CPUFEATURE(AVX2,          5*32+ 5) /*   AVX2 instructions */
diff mbox

Patch

--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -60,7 +60,7 @@ 
 #include <asm/msr.h>
 #include <acpi/cpufreq/cpufreq.h>
 
-#define MWAIT_IDLE_VERSION "0.4"
+#define MWAIT_IDLE_VERSION "0.4.1"
 #undef PREFIX
 #define PREFIX "mwait-idle: "
 
@@ -100,6 +100,7 @@  static const struct cpuidle_state {
 	unsigned int	target_residency; /* in US */
 } *cpuidle_state_table;
 
+#define CPUIDLE_FLAG_DISABLED		0x1
 /*
  * Set this flag for states where the HW flushes the TLB for us
  * and so we don't need cross-calls to keep it consistent.
@@ -477,7 +478,7 @@  static const struct cpuidle_state bdw_cs
 	{}
 };
 
-static const struct cpuidle_state skl_cstates[] = {
+static struct cpuidle_state skl_cstates[] = {
 	{
 		.name = "C1-SKL",
 		.flags = MWAIT2flg(0x00),
@@ -781,34 +782,84 @@  static const struct x86_cpu_id intel_idl
 };
 
 /*
- * mwait_idle_state_table_update()
- *
- * Update the default state_table for this CPU-id
+ * ivt_idle_state_table_update(void)
  *
- * Currently used to access tuned IVT multi-socket targets
+ * Tune IVT multi-socket targets
  * Assumption: num_sockets == (max_package_num + 1)
  */
-static void __init mwait_idle_state_table_update(void)
+static void __init ivt_idle_state_table_update(void)
 {
 	/* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
-	if (boot_cpu_data.x86_model == 0x3e) { /* IVT */
-		unsigned int cpu, max_apicid = boot_cpu_physical_apicid;
+	unsigned int cpu, max_apicid = boot_cpu_physical_apicid;
 
-		for_each_present_cpu(cpu)
-			if (max_apicid < x86_cpu_to_apicid[cpu])
-				max_apicid = x86_cpu_to_apicid[cpu];
-		switch (apicid_to_socket(max_apicid)) {
-		case 0: case 1:
-			/* 1 and 2 socket systems use default ivt_cstates */
-			break;
-		case 2: case 3:
-			cpuidle_state_table = ivt_cstates_4s;
-			break;
-		default:
-			cpuidle_state_table = ivt_cstates_8s;
-			break;
-		}
+	for_each_present_cpu(cpu)
+		if (max_apicid < x86_cpu_to_apicid[cpu])
+			max_apicid = x86_cpu_to_apicid[cpu];
+	switch (apicid_to_socket(max_apicid)) {
+	case 0: case 1:
+		/* 1 and 2 socket systems use default ivt_cstates */
+		break;
+	case 2: case 3:
+		cpuidle_state_table = ivt_cstates_4s;
+		break;
+	default:
+		cpuidle_state_table = ivt_cstates_8s;
+		break;
+	}
+}
+
+/*
+ * sklh_idle_state_table_update(void)
+ *
+ * On SKL-H (model 0x5e) disable C8 and C9 if:
+ * C10 is enabled and SGX disabled
+ */
+static void sklh_idle_state_table_update(void)
+{
+	u64 msr;
+
+	/* if PC10 disabled via cmdline max_cstate=7 or shallower */
+	if (max_cstate <= 7)
+		return;
+
+	/* if PC10 not present in CPUID.MWAIT.EDX */
+	if ((mwait_substates & (MWAIT_CSTATE_MASK << 28)) == 0)
+		return;
+
+	rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr);
+
+	/* PC10 is not enabled in PKG C-state limit */
+	if ((msr & 0xF) != 8)
+		return;
+
+	/* if SGX is present */
+	if (boot_cpu_has(X86_FEATURE_SGX)) {
+		rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+
+		/* if SGX is enabled */
+		if (msr & (1 << 18))
+			return;
 	}
+
+	skl_cstates[5].flags |= CPUIDLE_FLAG_DISABLED;	/* C8-SKL */
+	skl_cstates[6].flags |= CPUIDLE_FLAG_DISABLED;	/* C9-SKL */
+}
+
+/*
+ * mwait_idle_state_table_update()
+ *
+ * Update the default state_table for this CPU-id
+ */
+static void __init mwait_idle_state_table_update(void)
+{
+	switch (boot_cpu_data.x86_model) {
+	case 0x3e: /* IVT */
+		ivt_idle_state_table_update();
+		break;
+	case 0x5e: /* SKL-H */
+		sklh_idle_state_table_update();
+		break;
+ 	}
 }
 
 static int __init mwait_idle_probe(void)
@@ -897,6 +948,14 @@  static int mwait_idle_cpu_init(struct no
 		if (num_substates == 0)
 			continue;
 
+		/* if state marked as disabled, skip it */
+		if (cpuidle_state_table[cstate].flags &
+		    CPUIDLE_FLAG_DISABLED) {
+			printk(XENLOG_DEBUG PREFIX "state %s is disabled",
+			       cpuidle_state_table[cstate].name);
+			continue;
+		}
+
 		if (dev->count >= ACPI_PROCESSOR_MAX_POWER) {
 			printk(PREFIX "max C-state count of %u reached\n",
 			       ACPI_PROCESSOR_MAX_POWER);
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -288,6 +288,7 @@ 
 #define MSR_IA32_PLATFORM_ID		0x00000017
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_IA32_EBC_FREQUENCY_ID	0x0000002c
+#define MSR_IA32_FEATURE_CONTROL	0x0000003a
 #define MSR_IA32_TSC_ADJUST		0x0000003b
 
 #define MSR_IA32_APICBASE		0x0000001b
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -186,6 +186,7 @@ 
 /* Intel-defined CPU features, CPUID level 0x00000007:0.ebx, word 5 */
 XEN_CPUFEATURE(FSGSBASE,      5*32+ 0) /*   {RD,WR}{FS,GS}BASE instructions */
 XEN_CPUFEATURE(TSC_ADJUST,    5*32+ 1) /*   TSC_ADJUST MSR available */
+XEN_CPUFEATURE(SGX,           5*32+ 2) /*   Software Guard extensions */
 XEN_CPUFEATURE(BMI1,          5*32+ 3) /*   1st bit manipulation extensions */
 XEN_CPUFEATURE(HLE,           5*32+ 4) /*   Hardware Lock Elision */
 XEN_CPUFEATURE(AVX2,          5*32+ 5) /*   AVX2 instructions */