diff mbox series

ACPI/APEI: Better fix to avoid spamming the console with old error logs

Message ID 20220622170906.33759-1-tony.luck@intel.com (mailing list archive)
State Mainlined, archived
Headers show
Series ACPI/APEI: Better fix to avoid spamming the console with old error logs | expand

Commit Message

Tony Luck June 22, 2022, 5:09 p.m. UTC
The fix in commit 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT
table data") does not work as intended on systems where the BIOS has a
fixed size block of memory for the BERT table, relying on s/w to quit
when it finds a record with estatus->block_status == 0. On these systems
all errors are suppressed because the check:

	if (region_len < ACPI_BERT_PRINT_MAX_LEN)

always fails.

New scheme skips individual CPER records that are too large, and also
limits the total number of records that will be printed to 5.

Fixes: 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT table data")
Cc: stable@vger.kernel.org
Signed-off-by: Tony Luck <tony.luck@intel.com>
---

Now in PATCH format with a real commit comment. This version fixes
the issues seen by Intel's validation team.

 drivers/acpi/apei/bert.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

Comments

Darren Hart June 27, 2022, 9:15 p.m. UTC | #1
On Wed, Jun 22, 2022 at 10:09:06AM -0700, Tony Luck wrote:
> The fix in commit 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT
> table data") does not work as intended on systems where the BIOS has a
> fixed size block of memory for the BERT table, relying on s/w to quit
> when it finds a record with estatus->block_status == 0. On these systems
> all errors are suppressed because the check:
> 
> 	if (region_len < ACPI_BERT_PRINT_MAX_LEN)
> 
> always fails.
> 
> New scheme skips individual CPER records that are too large, and also
> limits the total number of records that will be printed to 5.

Apologies for the delay.

This seems like a reasonable approach. Working to confirm new behavior on Ampere
Altra systems (specifically how region_len and estatus_len are related).
Rafael J. Wysocki June 29, 2022, 5:56 p.m. UTC | #2
On Wed, Jun 22, 2022 at 7:09 PM Tony Luck <tony.luck@intel.com> wrote:
>
> The fix in commit 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT
> table data") does not work as intended on systems where the BIOS has a
> fixed size block of memory for the BERT table, relying on s/w to quit
> when it finds a record with estatus->block_status == 0. On these systems
> all errors are suppressed because the check:
>
>         if (region_len < ACPI_BERT_PRINT_MAX_LEN)
>
> always fails.
>
> New scheme skips individual CPER records that are too large, and also
> limits the total number of records that will be printed to 5.
>
> Fixes: 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT table data")
> Cc: stable@vger.kernel.org
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>
> Now in PATCH format with a real commit comment. This version fixes
> the issues seen by Intel's validation team.
>
>  drivers/acpi/apei/bert.c | 31 +++++++++++++++++++++++--------
>  1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/acpi/apei/bert.c b/drivers/acpi/apei/bert.c
> index 598fd19b65fa..45973aa6e06d 100644
> --- a/drivers/acpi/apei/bert.c
> +++ b/drivers/acpi/apei/bert.c
> @@ -29,16 +29,26 @@
>
>  #undef pr_fmt
>  #define pr_fmt(fmt) "BERT: " fmt
> +
> +#define ACPI_BERT_PRINT_MAX_RECORDS 5
>  #define ACPI_BERT_PRINT_MAX_LEN 1024
>
>  static int bert_disable;
>
> +/*
> + * Print "all" the error records in the BERT table, but avoid huge spam to
> + * the console if the BIOS included oversize records, or too many records.
> + * Skipping some records here does not lose anything because the full
> + * data is available to user tools in:
> + *     /sys/firmware/acpi/tables/data/BERT
> + */
>  static void __init bert_print_all(struct acpi_bert_region *region,
>                                   unsigned int region_len)
>  {
>         struct acpi_hest_generic_status *estatus =
>                 (struct acpi_hest_generic_status *)region;
>         int remain = region_len;
> +       int printed = 0, skipped = 0;
>         u32 estatus_len;
>
>         while (remain >= sizeof(struct acpi_bert_region)) {
> @@ -46,24 +56,26 @@ static void __init bert_print_all(struct acpi_bert_region *region,
>                 if (remain < estatus_len) {
>                         pr_err(FW_BUG "Truncated status block (length: %u).\n",
>                                estatus_len);
> -                       return;
> +                       break;
>                 }
>
>                 /* No more error records. */
>                 if (!estatus->block_status)
> -                       return;
> +                       break;
>
>                 if (cper_estatus_check(estatus)) {
>                         pr_err(FW_BUG "Invalid error record.\n");
> -                       return;
> +                       break;
>                 }
>
> -               pr_info_once("Error records from previous boot:\n");
> -               if (region_len < ACPI_BERT_PRINT_MAX_LEN)
> +               if (estatus_len < ACPI_BERT_PRINT_MAX_LEN &&
> +                   printed < ACPI_BERT_PRINT_MAX_RECORDS) {
> +                       pr_info_once("Error records from previous boot:\n");
>                         cper_estatus_print(KERN_INFO HW_ERR, estatus);
> -               else
> -                       pr_info_once("Max print length exceeded, table data is available at:\n"
> -                                    "/sys/firmware/acpi/tables/data/BERT");
> +                       printed++;
> +               } else {
> +                       skipped++;
> +               }
>
>                 /*
>                  * Because the boot error source is "one-time polled" type,
> @@ -75,6 +87,9 @@ static void __init bert_print_all(struct acpi_bert_region *region,
>                 estatus = (void *)estatus + estatus_len;
>                 remain -= estatus_len;
>         }
> +
> +       if (skipped)
> +               pr_info(HW_ERR "Skipped %d error records\n", skipped);
>  }
>
>  static int __init setup_bert_disable(char *str)
> --

Applied as 5.20 material, thanks!
diff mbox series

Patch

diff --git a/drivers/acpi/apei/bert.c b/drivers/acpi/apei/bert.c
index 598fd19b65fa..45973aa6e06d 100644
--- a/drivers/acpi/apei/bert.c
+++ b/drivers/acpi/apei/bert.c
@@ -29,16 +29,26 @@ 
 
 #undef pr_fmt
 #define pr_fmt(fmt) "BERT: " fmt
+
+#define ACPI_BERT_PRINT_MAX_RECORDS 5
 #define ACPI_BERT_PRINT_MAX_LEN 1024
 
 static int bert_disable;
 
+/*
+ * Print "all" the error records in the BERT table, but avoid huge spam to
+ * the console if the BIOS included oversize records, or too many records.
+ * Skipping some records here does not lose anything because the full
+ * data is available to user tools in:
+ *	/sys/firmware/acpi/tables/data/BERT
+ */
 static void __init bert_print_all(struct acpi_bert_region *region,
 				  unsigned int region_len)
 {
 	struct acpi_hest_generic_status *estatus =
 		(struct acpi_hest_generic_status *)region;
 	int remain = region_len;
+	int printed = 0, skipped = 0;
 	u32 estatus_len;
 
 	while (remain >= sizeof(struct acpi_bert_region)) {
@@ -46,24 +56,26 @@  static void __init bert_print_all(struct acpi_bert_region *region,
 		if (remain < estatus_len) {
 			pr_err(FW_BUG "Truncated status block (length: %u).\n",
 			       estatus_len);
-			return;
+			break;
 		}
 
 		/* No more error records. */
 		if (!estatus->block_status)
-			return;
+			break;
 
 		if (cper_estatus_check(estatus)) {
 			pr_err(FW_BUG "Invalid error record.\n");
-			return;
+			break;
 		}
 
-		pr_info_once("Error records from previous boot:\n");
-		if (region_len < ACPI_BERT_PRINT_MAX_LEN)
+		if (estatus_len < ACPI_BERT_PRINT_MAX_LEN &&
+		    printed < ACPI_BERT_PRINT_MAX_RECORDS) {
+			pr_info_once("Error records from previous boot:\n");
 			cper_estatus_print(KERN_INFO HW_ERR, estatus);
-		else
-			pr_info_once("Max print length exceeded, table data is available at:\n"
-				     "/sys/firmware/acpi/tables/data/BERT");
+			printed++;
+		} else {
+			skipped++;
+		}
 
 		/*
 		 * Because the boot error source is "one-time polled" type,
@@ -75,6 +87,9 @@  static void __init bert_print_all(struct acpi_bert_region *region,
 		estatus = (void *)estatus + estatus_len;
 		remain -= estatus_len;
 	}
+
+	if (skipped)
+		pr_info(HW_ERR "Skipped %d error records\n", skipped);
 }
 
 static int __init setup_bert_disable(char *str)