diff mbox

[v2,0/5] x86: CPU idle management adjustments

Message ID 0dbf8520-89c7-753e-c10a-13399aec9cd5@suse.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Beulich July 3, 2019, 12:55 p.m. UTC
The first patch is something I had meant to do forever since the introduction
of mwait-idle. The 2nd patch addresses a latent problem (becoming an active
one with patch 3) in C-state selection when actually entering an idle state.
The 3rd patch is my counterproposal to Brian's intended abuse (as I would
call it) of the mwait-idle driver. The final two patches are ones I've been
carrying for about 5 years now, initially waiting for a response to my
proposed edits to Ross'es patches, and eventually sort of forgotten (but
re-based as needed).

1: x86/cpuidle: switch to uniform meaning of "max_cstate="
2: x86/cpuidle: really use C1 for "urgent" CPUs
3: x86/AMD: make C-state handling independent of Dom0
4: x86: allow limiting the max C-state sub-state
5: tools/libxc: allow controlling the max C-state sub-state

Due to mail mangling issues I'm attaching all patches here, alongside the
individual mails in reply to this one.

Jan
x86/cpuidle: switch to uniform meaning of "max_cstate="

While the MWAIT idle driver already takes it to mean an actual C state,
the ACPI idle driver so far used it as a list index. The list index,
however, is an implementation detail of Xen and affected by firmware
settings (i.e. not necessarily uniform for a particular system).

While touching this code also avoid invoking menu_get_trace_data()
when tracing is not active. For consistency do this also for the
MWAIT driver.

Note that I'm intentionally not adding any sorting logic to set_cx():
Before and after this patch we assume entries to arrive in order, so
this would be an orthogonal change.

Take the opportunity and add minimal documentation for the command line
option.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Adjust xenpm output wording a little. Explicitly log "unlimited".
---
TBD: I wonder if we really need struct acpi_processor_cx's idx field
     anymore. It's used in a number of (questionable) places ...
x86/cpuidle: really use C1 for "urgent" CPUs

For one on recent AMD CPUs entering C1 (if available at all) requires
use of MWAIT, while HLT (i.e. default_idle()) would put the processor
into as deep as CC6. And then even on other vendors' CPUs we should
avoid entering default_idle() when the intended state can be reached
by using the active idle driver's facilities.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -582,12 +582,15 @@ static void acpi_processor_idle(void)
     u32 exp = 0, pred = 0;
     u32 irq_traced[4] = { 0 };
 
-    if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
+    if ( max_cstate > 0 && power &&
          (next_state = cpuidle_current_governor->select(power)) > 0 )
     {
+        unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
+                                                         : max_cstate;
+
         do {
             cx = &power->states[next_state];
-        } while ( cx->type > max_cstate && --next_state );
+        } while ( cx->type > max_state && --next_state );
         if ( next_state )
         {
             if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -724,11 +724,14 @@ static void mwait_idle(void)
 	u64 before, after;
 	u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
 
-	if (max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
+	if (max_cstate > 0 && power &&
 	    (next_state = cpuidle_current_governor->select(power)) > 0) {
+		unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
+								 : max_cstate;
+
 		do {
 			cx = &power->states[next_state];
-		} while (cx->type > max_cstate && --next_state);
+		} while (cx->type > max_state && --next_state);
 		if (!next_state)
 			cx = NULL;
 		else if (tb_init_done)
x86/AMD: make C-state handling independent of Dom0

At least for more recent CPUs, following what BKDG / PPR suggest for the
BIOS to surface via ACPI we can make ourselves independent of Dom0
uploading respective data.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Handle Hygon Fam18. Set local_apic_timer_c2_ok (for Fam17 and Hygon
    Fam18 only for now).
---
TBD: We may want to verify that HLT indeed is configured to enter CC6.
TBD: Brian's series specifies .target_residency as 1000 for CC6; may
     want to do so here as well. Question then is whether this value is
     also suitable for the older families.
TBD: I guess we could extend this to families older then Fam15 as well.

--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -110,6 +110,8 @@ boolean_param("lapic_timer_c2_ok", local
 
 struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
 
+static int8_t __read_mostly vendor_override;
+
 struct hw_residencies
 {
     uint64_t mc0;
@@ -1214,6 +1216,9 @@ long set_cx_pminfo(uint32_t acpi_id, str
     if ( pm_idle_save && pm_idle != acpi_processor_idle )
         return 0;
 
+    if ( vendor_override > 0 )
+        return 0;
+
     print_cx_pminfo(acpi_id, power);
 
     cpu_id = get_cpu_id(acpi_id);
@@ -1286,6 +1291,103 @@ long set_cx_pminfo(uint32_t acpi_id, str
     return 0;
 }
 
+static void amd_cpuidle_init(struct acpi_processor_power *power)
+{
+    unsigned int i, nr = 0;
+    const struct cpuinfo_x86 *c = &current_cpu_data;
+    const unsigned int ecx_req = CPUID5_ECX_EXTENSIONS_SUPPORTED |
+                                 CPUID5_ECX_INTERRUPT_BREAK;
+    const struct acpi_processor_cx *cx = NULL;
+    static const struct acpi_processor_cx fam17[] = {
+        {
+            .type = ACPI_STATE_C1,
+            .entry_method = ACPI_CSTATE_EM_FFH,
+            .address = 0,
+            .latency = 1,
+        },
+        {
+            .type = ACPI_STATE_C2,
+            .entry_method = ACPI_CSTATE_EM_HALT,
+            .latency = 400,
+        },
+    };
+
+    if ( pm_idle_save && pm_idle != acpi_processor_idle )
+        return;
+
+    if ( vendor_override < 0 )
+        return;
+
+    switch ( c->x86 )
+    {
+    case 0x18:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_HYGON )
+        {
+    default:
+            vendor_override = -1;
+            return;
+        }
+        /* fall through */
+    case 0x17:
+        if ( cpu_has_monitor && c->cpuid_level >= CPUID_MWAIT_LEAF &&
+             (cpuid_ecx(CPUID_MWAIT_LEAF) & ecx_req) == ecx_req )
+        {
+            cx = fam17;
+            nr = ARRAY_SIZE(fam17);
+            local_apic_timer_c2_ok = true;
+            break;
+        }
+        /* fall through */
+    case 0x15:
+    case 0x16:
+        cx = &fam17[1];
+        nr = ARRAY_SIZE(fam17) - 1;
+        break;
+    }
+
+    power->flags.has_cst = true;
+
+    for ( i = 0; i < nr; ++i )
+    {
+        if ( cx[i].type > max_cstate )
+            break;
+        power->states[i + 1] = cx[i];
+        power->states[i + 1].idx = i + 1;
+        power->states[i + 1].target_residency = cx[i].latency * latency_factor;
+    }
+
+    if ( i )
+    {
+        power->count = i + 1;
+        power->safe_state = &power->states[i];
+
+        if ( !vendor_override )
+        {
+            if ( !boot_cpu_has(X86_FEATURE_ARAT) )
+                hpet_broadcast_init();
+
+            if ( !lapic_timer_init() )
+            {
+                vendor_override = -1;
+                cpuidle_init_cpu(power->cpu);
+                return;
+            }
+
+            if ( !pm_idle_save )
+            {
+                pm_idle_save = pm_idle;
+                pm_idle = acpi_processor_idle;
+            }
+
+            dead_idle = acpi_dead_idle;
+
+            vendor_override = 1;
+        }
+    }
+    else
+        vendor_override = -1;
+}
+
 uint32_t pmstat_get_cx_nr(uint32_t cpuid)
 {
     return processor_powers[cpuid] ? processor_powers[cpuid]->count : 0;
@@ -1432,8 +1534,8 @@ static int cpu_callback(
     int rc = 0;
 
     /*
-     * Only hook on CPU_UP_PREPARE because a dead cpu may utilize the info
-     * to enter deep C-state.
+     * Only hook on CPU_UP_PREPARE / CPU_ONLINE because a dead cpu may utilize
+     * the info to enter deep C-state.
      */
     switch ( action )
     {
@@ -1442,6 +1544,13 @@ static int cpu_callback(
         if ( !rc && cpuidle_current_governor->enable )
             rc = cpuidle_current_governor->enable(processor_powers[cpu]);
         break;
+
+    case CPU_ONLINE:
+        if ( (boot_cpu_data.x86_vendor &
+              (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
+             processor_powers[cpu] )
+            amd_cpuidle_init(processor_powers[cpu]);
+        break;
     }
 
     return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
x86: allow limiting the max C-state sub-state

Allow limiting the max C-state sub-state by appending to the max_cstate
command-line parameter. E.g. max_cstate=1,0
The limit only applies to the highest legal C-state. For example:
 max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
 max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
 max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
 max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Explicitly log "unlimited". Pass NULL in the 2nd simple_strtoul()
    invocation.

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1374,9 +1374,11 @@ Specify the maximum number of CPUs that
 This option is ignored in **pv-shim** mode.
 
 ### max_cstate (x86)
-> `= <integer>`
+> `= <integer>[,<integer>]`
 
-Specify the deepest C-state CPUs are permitted to be placed in.
+Specify the deepest C-state CPUs are permitted to be placed in, and
+optionally the maximum sub C-state to be used used.  The latter only applies
+to the highest permitted C-state.
 
 ### max_gsi_irqs (x86)
 > `= <integer>`
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -104,7 +104,17 @@ bool lapic_timer_init(void)
 
 void (*__read_mostly pm_idle_save)(void);
 unsigned int max_cstate __read_mostly = UINT_MAX;
-integer_param("max_cstate", max_cstate);
+unsigned int max_csubstate __read_mostly = UINT_MAX;
+
+static int __init parse_cstate(const char *s)
+{
+    max_cstate = simple_strtoul(s, &s, 0);
+    if ( *s == ',' )
+        max_csubstate = simple_strtoul(s + 1, NULL, 0);
+    return 0;
+}
+custom_param("max_cstate", parse_cstate);
+
 static bool __read_mostly local_apic_timer_c2_ok;
 boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);
 
@@ -347,7 +357,13 @@ static void dump_cx(unsigned char key)
 
     printk("'%c' pressed -> printing ACPI Cx structures\n", key);
     if ( max_cstate < UINT_MAX )
+    {
         printk("max state: C%u\n", max_cstate);
+        if ( max_csubstate < UINT_MAX )
+            printk("max sub-state: %u\n", max_csubstate);
+        else
+            printk("max sub-state: unlimited\n");
+    }
     else
         printk("max state: unlimited\n");
     for_each_present_cpu ( cpu )
@@ -592,7 +608,13 @@ static void acpi_processor_idle(void)
 
         do {
             cx = &power->states[next_state];
-        } while ( cx->type > max_state && --next_state );
+        } while ( (cx->type > max_state ||
+                   cx->entry_method == ACPI_CSTATE_EM_NONE ||
+                   (cx->entry_method == ACPI_CSTATE_EM_FFH &&
+                    cx->type == max_cstate &&
+                    (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
+                  --next_state );
+            cx = &power->states[next_state];
         if ( next_state )
         {
             if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -731,7 +731,9 @@ static void mwait_idle(void)
 
 		do {
 			cx = &power->states[next_state];
-		} while (cx->type > max_state && --next_state);
+		} while ((cx->type > max_state || (cx->type == max_cstate &&
+			  MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
+			 --next_state);
 		if (!next_state)
 			cx = NULL;
 		else if (tb_init_done)
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -141,13 +141,21 @@ void acpi_unregister_gsi (u32 gsi);
 
 #ifdef	CONFIG_ACPI_CSTATE
 /*
- * Set highest legal C-state
- * 0: C0 okay, but not C1
- * 1: C1 okay, but not C2
- * 2: C2 okay, but not C3 etc.
+ * max_cstate sets the highest legal C-state.
+ * max_cstate = 0: C0 okay, but not C1
+ * max_cstate = 1: C1 okay, but not C2
+ * max_cstate = 2: C2 okay, but not C3 etc.
+
+ * max_csubstate sets the highest legal C-state sub-state. Only applies to the
+ * highest legal C-state.
+ * max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
+ * max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
+ * max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
+ * max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3
  */
 
 extern unsigned int max_cstate;
+extern unsigned int max_csubstate;
 
 static inline unsigned int acpi_get_cstate_limit(void)
 {
tools/libxc: allow controlling the max C-state sub-state

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>

Make handling in do_pm_op() more homogeneous: Before interpreting
op->cpuid as such, handle all operations not acting on a particular
CPU. Also expose the setting via xenpm.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Wei Liu <wei.liu2@citrix.com>
---
v2: Adjust xenpm's usage message. Also adjust its output wording a
    little.

--- a/tools/libxc/xc_pm.c
+++ b/tools/libxc/xc_pm.c
@@ -367,7 +367,7 @@ int xc_set_sched_opt_smt(xc_interface *x
    return rc;
 }
 
-int xc_get_cpuidle_max_cstate(xc_interface *xch, uint32_t *value)
+static int get_max_cstate(xc_interface *xch, uint32_t *value, uint32_t type)
 {
     int rc;
     DECLARE_SYSCTL;
@@ -379,7 +379,7 @@ int xc_get_cpuidle_max_cstate(xc_interfa
     }
     sysctl.cmd = XEN_SYSCTL_pm_op;
     sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_get_max_cstate;
-    sysctl.u.pm_op.cpuid = 0;
+    sysctl.u.pm_op.cpuid = type;
     sysctl.u.pm_op.u.get_max_cstate = 0;
     rc = do_sysctl(xch, &sysctl);
     *value = sysctl.u.pm_op.u.get_max_cstate;
@@ -387,7 +387,17 @@ int xc_get_cpuidle_max_cstate(xc_interfa
     return rc;
 }
 
-int xc_set_cpuidle_max_cstate(xc_interface *xch, uint32_t value)
+int xc_get_cpuidle_max_cstate(xc_interface *xch, uint32_t *value)
+{
+    return get_max_cstate(xch, value, 0);
+}
+
+int xc_get_cpuidle_max_csubstate(xc_interface *xch, uint32_t *value)
+{
+    return get_max_cstate(xch, value, 1);
+}
+
+static int set_max_cstate(xc_interface *xch, uint32_t value, uint32_t type)
 {
     DECLARE_SYSCTL;
 
@@ -398,12 +408,22 @@ int xc_set_cpuidle_max_cstate(xc_interfa
     }
     sysctl.cmd = XEN_SYSCTL_pm_op;
     sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_set_max_cstate;
-    sysctl.u.pm_op.cpuid = 0;
+    sysctl.u.pm_op.cpuid = type;
     sysctl.u.pm_op.u.set_max_cstate = value;
 
     return do_sysctl(xch, &sysctl);
 }
 
+int xc_set_cpuidle_max_cstate(xc_interface *xch, uint32_t value)
+{
+    return set_max_cstate(xch, value, 0);
+}
+
+int xc_set_cpuidle_max_csubstate(xc_interface *xch, uint32_t value)
+{
+    return set_max_cstate(xch, value, 1);
+}
+
 int xc_enable_turbo(xc_interface *xch, int cpuid)
 {
     DECLARE_SYSCTL;
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1906,6 +1906,9 @@ int xc_set_sched_opt_smt(xc_interface *x
 int xc_get_cpuidle_max_cstate(xc_interface *xch, uint32_t *value);
 int xc_set_cpuidle_max_cstate(xc_interface *xch, uint32_t value);
 
+int xc_get_cpuidle_max_csubstate(xc_interface *xch, uint32_t *value);
+int xc_set_cpuidle_max_csubstate(xc_interface *xch, uint32_t value);
+
 int xc_enable_turbo(xc_interface *xch, int cpuid);
 int xc_disable_turbo(xc_interface *xch, int cpuid);
 
--- a/tools/misc/xenpm.c
+++ b/tools/misc/xenpm.c
@@ -64,7 +64,9 @@ void show_help(void)
             " set-sched-smt           enable|disable enable/disable scheduler smt power saving\n"
             " set-vcpu-migration-delay      <num> set scheduler vcpu migration delay in us\n"
             " get-vcpu-migration-delay            get scheduler vcpu migration delay\n"
-            " set-max-cstate        <num>|'unlimited' set the C-State limitation (<num> >= 0)\n"
+            " set-max-cstate        <num>|'unlimited'[,<num2>|'unlimited']\n"
+            "                                     set the C-State limitation (<num> >= 0) and\n"
+            "                                     optionally the C-sub-state limitation (<num2> >= 0)\n"
             " start [seconds]                     start collect Cx/Px statistics,\n"
             "                                     output after CTRL-C or SIGINT or several seconds.\n"
             " enable-turbo-mode     [cpuid]       enable Turbo Mode for processors that support it.\n"
@@ -195,7 +197,15 @@ static int show_max_cstate(xc_interface
         return ret;
 
     if ( value < XEN_SYSCTL_CX_UNLIMITED )
-        printf("Max possible C-state: C%"PRIu32"\n\n", value);
+    {
+        printf("Max possible C-state: C%"PRIu32"\n", value);
+        if ( (ret = xc_get_cpuidle_max_csubstate(xc_handle, &value)) )
+            return ret;
+        if ( value < XEN_SYSCTL_CX_UNLIMITED )
+            printf("Max possible substate: %"PRIu32"\n\n", value);
+        else
+            puts("");
+    }
     else
         printf("All C-states allowed\n\n");
 
@@ -1120,13 +1130,17 @@ void get_vcpu_migration_delay_func(int a
 
 void set_max_cstate_func(int argc, char *argv[])
 {
-    int value;
+    int value, subval = XEN_SYSCTL_CX_UNLIMITED;
     char buf[12];
 
-    if ( argc != 1 ||
+    if ( argc < 1 || argc > 2 ||
          (sscanf(argv[0], "%d", &value) == 1
           ? value < 0
-          : (value = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[0], "unlimited"))) )
+          : (value = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[0], "unlimited"))) ||
+         (argc == 2 &&
+          (sscanf(argv[1], "%d", &subval) == 1
+           ? subval < 0
+           : (subval = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[1], "unlimited")))) )
     {
         fprintf(stderr, "Missing, excess, or invalid argument(s)\n");
         exit(EINVAL);
@@ -1137,8 +1151,23 @@ void set_max_cstate_func(int argc, char
     if ( !xc_set_cpuidle_max_cstate(xc_handle, (uint32_t)value) )
         printf("max C-state set to %s\n", value >= 0 ? buf : argv[0]);
     else
+    {
         fprintf(stderr, "Failed to set max C-state to %s (%d - %s)\n",
                 value >= 0 ? buf : argv[0], errno, strerror(errno));
+        return;
+    }
+
+    if ( value != XEN_SYSCTL_CX_UNLIMITED )
+    {
+        snprintf(buf, ARRAY_SIZE(buf), "%d", subval);
+
+        if ( !xc_set_cpuidle_max_csubstate(xc_handle, (uint32_t)subval) )
+            printf("max C-substate set to %s succeeded\n",
+                   subval >= 0 ? buf : "unlimited");
+        else
+            fprintf(stderr, "Failed to set max C-substate to %s (%d - %s)\n",
+                    subval >= 0 ? buf : "unlimited", errno, strerror(errno));
+    }
 }
 
 void enable_turbo_mode(int argc, char *argv[])
--- a/xen/drivers/acpi/pmstat.c
+++ b/xen/drivers/acpi/pmstat.c
@@ -398,7 +398,40 @@ int do_pm_op(struct xen_sysctl_pm_op *op
     int ret = 0;
     const struct processor_pminfo *pmpt;
 
-    if ( !op || op->cpuid >= nr_cpu_ids || !cpu_online(op->cpuid) )
+    switch ( op->cmd )
+    {
+    case XEN_SYSCTL_pm_op_set_sched_opt_smt:
+    {
+        uint32_t saved_value = sched_smt_power_savings;
+
+        if ( op->cpuid != 0 )
+            return -EINVAL;
+        sched_smt_power_savings = !!op->u.set_sched_opt_smt;
+        op->u.set_sched_opt_smt = saved_value;
+        return 0;
+    }
+
+    case XEN_SYSCTL_pm_op_get_max_cstate:
+        BUILD_BUG_ON(XEN_SYSCTL_CX_UNLIMITED != UINT_MAX);
+        if ( op->cpuid == 0 )
+            op->u.get_max_cstate = acpi_get_cstate_limit();
+        else if ( op->cpuid == 1 )
+            op->u.get_max_cstate = acpi_get_csubstate_limit();
+        else
+            ret = -EINVAL;
+        return ret;
+
+    case XEN_SYSCTL_pm_op_set_max_cstate:
+        if ( op->cpuid == 0 )
+            acpi_set_cstate_limit(op->u.set_max_cstate);
+        else if ( op->cpuid == 1 )
+            acpi_set_csubstate_limit(op->u.set_max_cstate);
+        else
+            ret = -EINVAL;
+        return ret;
+    }
+
+    if ( op->cpuid >= nr_cpu_ids || !cpu_online(op->cpuid) )
         return -EINVAL;
     pmpt = processor_pminfo[op->cpuid];
 
@@ -438,30 +471,6 @@ int do_pm_op(struct xen_sysctl_pm_op *op
         break;
     }
 
-    case XEN_SYSCTL_pm_op_set_sched_opt_smt:
-    {
-        uint32_t saved_value;
-
-        saved_value = sched_smt_power_savings;
-        sched_smt_power_savings = !!op->u.set_sched_opt_smt;
-        op->u.set_sched_opt_smt = saved_value;
-
-        break;
-    }
-
-    case XEN_SYSCTL_pm_op_get_max_cstate:
-    {
-        BUILD_BUG_ON(XEN_SYSCTL_CX_UNLIMITED != UINT_MAX);
-        op->u.get_max_cstate = acpi_get_cstate_limit();
-        break;
-    }
-
-    case XEN_SYSCTL_pm_op_set_max_cstate:
-    {
-        acpi_set_cstate_limit(op->u.set_max_cstate);
-        break;
-    }
-
     case XEN_SYSCTL_pm_op_enable_turbo:
     {
         ret = cpufreq_update_turbo(op->cpuid, CPUFREQ_TURBO_ENABLED);
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -350,7 +350,11 @@ struct xen_sysctl_pm_op {
     /* set/reset scheduler power saving option */
     #define XEN_SYSCTL_pm_op_set_sched_opt_smt    0x21
 
-    /* cpuidle max_cstate access command */
+    /*
+     * cpuidle max C-state and max C-sub-state access command:
+     * Set cpuid to 0 for max C-state.
+     * Set cpuid to 1 for max C-sub-state.
+     */
     #define XEN_SYSCTL_pm_op_get_max_cstate       0x22
     #define XEN_SYSCTL_pm_op_set_max_cstate       0x23
 
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -166,9 +166,22 @@ static inline void acpi_set_cstate_limit
 	max_cstate = new_limit;
 	return;
 }
+
+static inline unsigned int acpi_get_csubstate_limit(void)
+{
+	return max_csubstate;
+}
+
+static inline void acpi_set_csubstate_limit(unsigned int new_limit)
+{
+	max_csubstate = new_limit;
+}
+
 #else
 static inline unsigned int acpi_get_cstate_limit(void) { return 0; }
 static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; }
+static inline unsigned int acpi_get_csubstate_limit(void) { return 0; }
+static inline void acpi_set_csubstate_limit(unsigned int new_limit) { return; }
 #endif
 
 #ifdef XEN_GUEST_HANDLE_PARAM
diff mbox

Patch

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1376,6 +1376,8 @@  This option is ignored in **pv-shim** mo
 ### max_cstate (x86)
 > `= <integer>`
 
+Specify the deepest C-state CPUs are permitted to be placed in.
+
 ### max_gsi_irqs (x86)
 > `= <integer>`
 
--- a/tools/misc/xenpm.c
+++ b/tools/misc/xenpm.c
@@ -64,7 +64,7 @@  void show_help(void)
             " set-sched-smt           enable|disable enable/disable scheduler smt power saving\n"
             " set-vcpu-migration-delay      <num> set scheduler vcpu migration delay in us\n"
             " get-vcpu-migration-delay            get scheduler vcpu migration delay\n"
-            " set-max-cstate        <num>         set the C-State limitation (<num> >= 0)\n"
+            " set-max-cstate        <num>|'unlimited' set the C-State limitation (<num> >= 0)\n"
             " start [seconds]                     start collect Cx/Px statistics,\n"
             "                                     output after CTRL-C or SIGINT or several seconds.\n"
             " enable-turbo-mode     [cpuid]       enable Turbo Mode for processors that support it.\n"
@@ -194,7 +194,11 @@  static int show_max_cstate(xc_interface
     if ( (ret = xc_get_cpuidle_max_cstate(xc_handle, &value)) )
         return ret;
 
-    printf("Max possible C-state: C%d\n\n", value);
+    if ( value < XEN_SYSCTL_CX_UNLIMITED )
+        printf("Max possible C-state: C%"PRIu32"\n\n", value);
+    else
+        printf("All C-states allowed\n\n");
+
     return 0;
 }
 
@@ -1117,18 +1121,24 @@  void get_vcpu_migration_delay_func(int a
 void set_max_cstate_func(int argc, char *argv[])
 {
     int value;
+    char buf[12];
 
-    if ( argc != 1 || sscanf(argv[0], "%d", &value) != 1 || value < 0 )
+    if ( argc != 1 ||
+         (sscanf(argv[0], "%d", &value) == 1
+          ? value < 0
+          : (value = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[0], "unlimited"))) )
     {
-        fprintf(stderr, "Missing or invalid argument(s)\n");
+        fprintf(stderr, "Missing, excess, or invalid argument(s)\n");
         exit(EINVAL);
     }
 
+    snprintf(buf, ARRAY_SIZE(buf), "C%d", value);
+
     if ( !xc_set_cpuidle_max_cstate(xc_handle, (uint32_t)value) )
-        printf("set max_cstate to C%d succeeded\n", value);
+        printf("max C-state set to %s\n", value >= 0 ? buf : argv[0]);
     else
-        fprintf(stderr, "set max_cstate to C%d failed (%d - %s)\n",
-                value, errno, strerror(errno));
+        fprintf(stderr, "Failed to set max C-state to %s (%d - %s)\n",
+                value >= 0 ? buf : argv[0], errno, strerror(errno));
 }
 
 void enable_turbo_mode(int argc, char *argv[])
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -103,7 +103,7 @@  bool lapic_timer_init(void)
 }
 
 void (*__read_mostly pm_idle_save)(void);
-unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1;
+unsigned int max_cstate __read_mostly = UINT_MAX;
 integer_param("max_cstate", max_cstate);
 static bool __read_mostly local_apic_timer_c2_ok;
 boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);
@@ -344,7 +344,10 @@  static void dump_cx(unsigned char key)
     unsigned int cpu;
 
     printk("'%c' pressed -> printing ACPI Cx structures\n", key);
-    printk("max cstate: C%u\n", max_cstate);
+    if ( max_cstate < UINT_MAX )
+        printk("max state: C%u\n", max_cstate);
+    else
+        printk("max state: unlimited\n");
     for_each_present_cpu ( cpu )
     {
         struct acpi_processor_power *power = processor_powers[cpu];
@@ -582,13 +585,19 @@  static void acpi_processor_idle(void)
     if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
          (next_state = cpuidle_current_governor->select(power)) > 0 )
     {
-        cx = &power->states[next_state];
-        if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
-             acpi_idle_bm_check() )
-            cx = power->safe_state;
-        if ( cx->idx > max_cstate )
-            cx = &power->states[max_cstate];
-        menu_get_trace_data(&exp, &pred);
+        do {
+            cx = &power->states[next_state];
+        } while ( cx->type > max_cstate && --next_state );
+        if ( next_state )
+        {
+            if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
+                 acpi_idle_bm_check() )
+                cx = power->safe_state;
+            if ( tb_init_done )
+                menu_get_trace_data(&exp, &pred);
+        }
+        else
+            cx = NULL;
     }
     if ( !cx )
     {
@@ -1396,12 +1405,12 @@  int pmstat_reset_cx_stat(uint32_t cpuid)
 
 void cpuidle_disable_deep_cstate(void)
 {
-    if ( max_cstate > 1 )
+    if ( max_cstate > ACPI_STATE_C1 )
     {
         if ( local_apic_timer_c2_ok )
-            max_cstate = 2;
+            max_cstate = ACPI_STATE_C2;
         else
-            max_cstate = 1;
+            max_cstate = ACPI_STATE_C1;
     }
 
     hpet_disable_legacy_broadcast();
@@ -1409,7 +1418,8 @@  void cpuidle_disable_deep_cstate(void)
 
 bool cpuidle_using_deep_cstate(void)
 {
-    return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? 2 : 1);
+    return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
+                                                               : ACPI_STATE_C1);
 }
 
 static int cpu_callback(
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -731,7 +731,8 @@  static void mwait_idle(void)
 		} while (cx->type > max_cstate && --next_state);
 		if (!next_state)
 			cx = NULL;
-		menu_get_trace_data(&exp, &pred);
+		else if (tb_init_done)
+			menu_get_trace_data(&exp, &pred);
 	}
 	if (!cx) {
 		if (pm_idle_save)
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -2268,7 +2268,7 @@  static void dump_softtsc(unsigned char k
     else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
     {
         printk("TSC has constant rate, ");
-        if (max_cstate <= 2 && tsc_max_warp == 0)
+        if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
             printk("no deep Cstates, passed warp test, deemed reliable, ");
         else
             printk("deep Cstates possible, so not reliable, ");
--- a/xen/drivers/acpi/pmstat.c
+++ b/xen/drivers/acpi/pmstat.c
@@ -451,6 +451,7 @@  int do_pm_op(struct xen_sysctl_pm_op *op
 
     case XEN_SYSCTL_pm_op_get_max_cstate:
     {
+        BUILD_BUG_ON(XEN_SYSCTL_CX_UNLIMITED != UINT_MAX);
         op->u.get_max_cstate = acpi_get_cstate_limit();
         break;
     }
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -370,6 +370,7 @@  struct xen_sysctl_pm_op {
         struct xen_set_cpufreq_para set_para;
         uint64_aligned_t get_avgfreq;
         uint32_t                    set_sched_opt_smt;
+#define XEN_SYSCTL_CX_UNLIMITED 0xffffffff
         uint32_t                    get_max_cstate;
         uint32_t                    set_max_cstate;
     } u;