diff mbox series

[v4,3/3] GHES: Add GHES NMI nice level support

Message ID 1536136099-4880-1-git-send-email-qiuxu.zhuo@intel.com (mailing list archive)
State Changes Requested, archived
Headers show
Series None | expand

Commit Message

Zhuo, Qiuxu Sept. 5, 2018, 8:28 a.m. UTC
Current NMI mechanism is to process all the handlers for each NMI.
Because perf uses NMI, so GHES NMI handler runs unnecessarily for
every perf NMI handling. This will be captured by PMU's PEBS (Precise
Event Based Sampling) and disturb perf result.

GHES NMIs are very rare because they are only used in extreme error
situations or very frequent when machine is dying and error floods
happen. So add a GHES NMI nice level via GHES platform device sysfs
if it's > 0 and any other NMI (e.g. PMU NMI) has been handled for
current NMI, then skip current GHES NMI handler. So next PMU NMI can
be processed early and perf result is not distrubed by GHES NMI handler.

We reply statistically on the property that GHES NMIs are unlikely
to collide with perf NMIs, or they are frequent there will be enough
of them that it doesn't matter. It's a heuristics that is not 100%
correct, but a reasonable one, and it saves a lot of unnecessary
work for every NMI.

Test machines have HEST ACPI table installed and NMI notification
set, test cmds are 'perf mem record -a sleep 1' and 'perf mem report'.

Before applying patch (perf memory profile):

On Intel Broadwell-4S:
0.63%  1  17910  LFB or LFB hit  [k] intel_pstate_update_util
0.59%  1  16960  LFB or LFB hit  [k] intel_pstate_update_util
...
0.30%  1  8722   L1 or L1 hit    [k] ghes_notify_nmi

On Intel Skylake-4S:
3.45%  1  20218  L1 hit          [k] native_read_msr
1.21%  1  7078   LFB hit         [k] intel_pstate_update_util
...
1.21%  1  7077   N/A miss        [k] ghes_notify_nmi

After applying patch and 'echo 1 > /sys/devices/platform/GHES.[0-9]*/nmi_nice':
No GHES was showed up in perf memory profile.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Reported-by: Andi Kleen <andi.kleen@intel.com>
---
 drivers/acpi/apei/ghes.c | 33 +++++++++++++++++++++++++++++++--
 include/acpi/ghes.h      |  2 ++
 2 files changed, 33 insertions(+), 2 deletions(-)

Comments

Rafael J. Wysocki Oct. 3, 2018, 9:37 a.m. UTC | #1
On Wednesday, September 5, 2018 10:28:19 AM CEST Qiuxu Zhuo wrote:
> Current NMI mechanism is to process all the handlers for each NMI.
> Because perf uses NMI, so GHES NMI handler runs unnecessarily for
> every perf NMI handling. This will be captured by PMU's PEBS (Precise
> Event Based Sampling) and disturb perf result.
> 
> GHES NMIs are very rare because they are only used in extreme error
> situations or very frequent when machine is dying and error floods
> happen. So add a GHES NMI nice level via GHES platform device sysfs
> if it's > 0 and any other NMI (e.g. PMU NMI) has been handled for
> current NMI, then skip current GHES NMI handler. So next PMU NMI can
> be processed early and perf result is not distrubed by GHES NMI handler.
> 
> We reply statistically on the property that GHES NMIs are unlikely
> to collide with perf NMIs, or they are frequent there will be enough
> of them that it doesn't matter. It's a heuristics that is not 100%
> correct, but a reasonable one, and it saves a lot of unnecessary
> work for every NMI.
> 
> Test machines have HEST ACPI table installed and NMI notification
> set, test cmds are 'perf mem record -a sleep 1' and 'perf mem report'.
> 
> Before applying patch (perf memory profile):
> 
> On Intel Broadwell-4S:
> 0.63%  1  17910  LFB or LFB hit  [k] intel_pstate_update_util
> 0.59%  1  16960  LFB or LFB hit  [k] intel_pstate_update_util
> ...
> 0.30%  1  8722   L1 or L1 hit    [k] ghes_notify_nmi
> 
> On Intel Skylake-4S:
> 3.45%  1  20218  L1 hit          [k] native_read_msr
> 1.21%  1  7078   LFB hit         [k] intel_pstate_update_util
> ...
> 1.21%  1  7077   N/A miss        [k] ghes_notify_nmi
> 
> After applying patch and 'echo 1 > /sys/devices/platform/GHES.[0-9]*/nmi_nice':
> No GHES was showed up in perf memory profile.
> 
> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> Suggested-by: Ying Huang <ying.huang@intel.com>
> Reported-by: Andi Kleen <andi.kleen@intel.com>

Unless this has been applied already, can you CC the entire series to
linux-acpi, please?

Thanks,
Rafael
Zhuo, Qiuxu Oct. 3, 2018, 12:24 p.m. UTC | #2
> Unless this has been applied already, can you CC the entire series to linux-acpi,
> please?

Sure!    :-)

Thanks!
-Qiuxu
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index afc3814f8a8b..394cb5286fb3 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -47,6 +47,7 @@ 
 #include <linux/sched/clock.h>
 #include <linux/uuid.h>
 #include <linux/ras.h>
+#include <linux/device.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -129,6 +130,8 @@  static atomic_t ghes_estatus_cache_alloced;
 
 static int ghes_panic_timeout __read_mostly = 30;
 
+static int ghes_nmi_nice;
+
 static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
 {
 	phys_addr_t paddr;
@@ -929,11 +932,32 @@  static void __process_error(struct ghes *ghes)
 #endif
 }
 
+static ssize_t nmi_nice_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", ghes_nmi_nice);
+}
+
+static ssize_t nmi_nice_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	if (kstrtoint(buf, 0, &ghes_nmi_nice))
+		return -EINVAL;
+
+	return count;
+}
+static DEVICE_ATTR_RW(nmi_nice);
+
 static int ghes_notify_nmi(unsigned int cmd, int handled, struct pt_regs *regs)
 {
 	struct ghes *ghes;
 	int sev, ret = NMI_DONE;
 
+	if (handled && ghes_nmi_nice > 0)
+		return NMI_DONE;
+
 	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
 		return ret;
 
@@ -994,8 +1018,10 @@  static void ghes_nmi_add(struct ghes *ghes)
 	len = ghes_esource_prealloc_size(ghes->generic);
 	ghes_estatus_pool_expand(len);
 	mutex_lock(&ghes_list_mutex);
-	if (list_empty(&ghes_nmi))
+	if (list_empty(&ghes_nmi)) {
 		register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes");
+		device_create_file(ghes->dev, &dev_attr_nmi_nice);
+	}
 	list_add_rcu(&ghes->list, &ghes_nmi);
 	mutex_unlock(&ghes_list_mutex);
 }
@@ -1006,8 +1032,10 @@  static void ghes_nmi_remove(struct ghes *ghes)
 
 	mutex_lock(&ghes_list_mutex);
 	list_del_rcu(&ghes->list);
-	if (list_empty(&ghes_nmi))
+	if (list_empty(&ghes_nmi)) {
+		device_remove_file(ghes->dev, &dev_attr_nmi_nice);
 		unregister_nmi_handler(NMI_LOCAL, "ghes");
+	}
 	mutex_unlock(&ghes_list_mutex);
 	/*
 	 * To synchronize with NMI handler, ghes can only be
@@ -1086,6 +1114,7 @@  static int ghes_probe(struct platform_device *ghes_dev)
 		ghes = NULL;
 		goto err;
 	}
+	ghes->dev = &ghes_dev->dev;
 
 	switch (generic->notify.type) {
 	case ACPI_HEST_NOTIFY_POLLED:
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 1624e2be485c..936903d3af16 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -29,6 +29,8 @@  struct ghes {
 		struct timer_list timer;
 		unsigned int irq;
 	};
+
+	struct device *dev;
 };
 
 struct ghes_estatus_node {