diff mbox

[V4,01/10] acpi: apei: read ack upon ghes record consumption

Message ID 1477071013-29563-2-git-send-email-tbaicar@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Tyler Baicar Oct. 21, 2016, 5:30 p.m. UTC
A RAS (Reliability, Availability, Serviceability) controller
may be a separate processor running in parallel with OS
execution, and may generate error records for consumption by
the OS. If the RAS controller produces multiple error records,
then they may be overwritten before the OS has consumed them.

The Generic Hardware Error Source (GHES) v2 structure
introduces the capability for the OS to acknowledge the
consumption of the error record generated by the RAS
controller. A RAS controller supporting GHESv2 shall wait for
the acknowledgment before writing a new error record, thus
eliminating the race condition.

Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org>
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
Signed-off-by: Naveen Kaje <nkaje@codeaurora.org>
---
 drivers/acpi/apei/ghes.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/apei/hest.c |  7 +++++--
 include/acpi/ghes.h      |  5 ++++-
 3 files changed, 51 insertions(+), 3 deletions(-)

Comments

Suzuki K Poulose Oct. 24, 2016, 8:51 a.m. UTC | #1
On 21/10/16 18:30, Tyler Baicar wrote:
> A RAS (Reliability, Availability, Serviceability) controller
> may be a separate processor running in parallel with OS
> execution, and may generate error records for consumption by
> the OS. If the RAS controller produces multiple error records,
> then they may be overwritten before the OS has consumed them.
>
> The Generic Hardware Error Source (GHES) v2 structure
> introduces the capability for the OS to acknowledge the
> consumption of the error record generated by the RAS
> controller. A RAS controller supporting GHESv2 shall wait for
> the acknowledgment before writing a new error record, thus
> eliminating the race condition.
>
> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
> Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> Signed-off-by: Naveen Kaje <nkaje@codeaurora.org>
> ---
>  drivers/acpi/apei/ghes.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>  drivers/acpi/apei/hest.c |  7 +++++--
>  include/acpi/ghes.h      |  5 ++++-
>  3 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 60746ef..7d020b0 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -45,6 +45,7 @@
>  #include <linux/aer.h>
>  #include <linux/nmi.h>
>
> +#include <acpi/actbl1.h>
>  #include <acpi/ghes.h>
>  #include <acpi/apei.h>
>  #include <asm/tlbflush.h>
> @@ -79,6 +80,10 @@
>  	((struct acpi_hest_generic_status *)				\
>  	 ((struct ghes_estatus_node *)(estatus_node) + 1))
>
> +#define HEST_TYPE_GENERIC_V2(ghes)				\
> +	((struct acpi_hest_header *)ghes->generic)->type ==	\
> +	 ACPI_HEST_TYPE_GENERIC_ERROR_V2
> +
>  /*
>   * This driver isn't really modular, however for the time being,
>   * continuing to use module_param is the easiest way to remain
> @@ -248,7 +253,15 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>  	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
>  	if (!ghes)
>  		return ERR_PTR(-ENOMEM);
> +
>  	ghes->generic = generic;
> +	if (HEST_TYPE_GENERIC_V2(ghes)) {
> +		rc = apei_map_generic_address(
> +			&ghes->generic_v2->read_ack_register);
> +		if (rc)
> +			goto err_unmap;

I think should be goto err_free, see more below.

> +	}
> +
>  	rc = apei_map_generic_address(&generic->error_status_address);
>  	if (rc)
>  		goto err_free;
> @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>
>  err_unmap:
>  	apei_unmap_generic_address(&generic->error_status_address);
> +	if (HEST_TYPE_GENERIC_V2(ghes))
> +		apei_unmap_generic_address(
> +			&ghes->generic_v2->read_ack_register);

We might end up trying to unmap (error_status_address) which is not mapped
if we hit the error in mapping read_ack_register. The read_ack_register unmap
hunk should be moved below to err_free.


>  err_free:
>  	kfree(ghes);
>  	return ERR_PTR(rc);
> @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes)
>  {
>  	kfree(ghes->estatus);
>  	apei_unmap_generic_address(&ghes->generic->error_status_address);
> +	if (HEST_TYPE_GENERIC_V2(ghes))
> +		apei_unmap_generic_address(
> +			&ghes->generic_v2->read_ack_register);
>  }
>
>  static inline int ghes_severity(int severity)
> @@ -648,6 +667,23 @@ static void ghes_estatus_cache_add(
>  	rcu_read_unlock();
>  }
>

> +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)

nit: We are actually writing something to the read_ack_register. The names
read_ack_register (which may be as per standard) and more importantly the
function name (ghes_do_read_ack) sounds a bit misleading.

Rest looks fine to me.

Suzuki
Tyler Baicar Oct. 24, 2016, 8:28 p.m. UTC | #2
On 10/24/2016 2:51 AM, Suzuki K Poulose wrote:
> On 21/10/16 18:30, Tyler Baicar wrote:
>> A RAS (Reliability, Availability, Serviceability) controller
>> may be a separate processor running in parallel with OS
>> execution, and may generate error records for consumption by
>> the OS. If the RAS controller produces multiple error records,
>> then they may be overwritten before the OS has consumed them.
>>
>> The Generic Hardware Error Source (GHES) v2 structure
>> introduces the capability for the OS to acknowledge the
>> consumption of the error record generated by the RAS
>> controller. A RAS controller supporting GHESv2 shall wait for
>> the acknowledgment before writing a new error record, thus
>> eliminating the race condition.
>>
>> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
>> Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org>
>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>> Signed-off-by: Naveen Kaje <nkaje@codeaurora.org>
>> ---
>>  drivers/acpi/apei/ghes.c | 42 
>> ++++++++++++++++++++++++++++++++++++++++++
>>  drivers/acpi/apei/hest.c |  7 +++++--
>>  include/acpi/ghes.h      |  5 ++++-
>>  3 files changed, 51 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 60746ef..7d020b0 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -45,6 +45,7 @@
>>  #include <linux/aer.h>
>>  #include <linux/nmi.h>
>>
>> +#include <acpi/actbl1.h>
>>  #include <acpi/ghes.h>
>>  #include <acpi/apei.h>
>>  #include <asm/tlbflush.h>
>> @@ -79,6 +80,10 @@
>>      ((struct acpi_hest_generic_status *)                \
>>       ((struct ghes_estatus_node *)(estatus_node) + 1))
>>
>> +#define HEST_TYPE_GENERIC_V2(ghes)                \
>> +    ((struct acpi_hest_header *)ghes->generic)->type ==    \
>> +     ACPI_HEST_TYPE_GENERIC_ERROR_V2
>> +
>>  /*
>>   * This driver isn't really modular, however for the time being,
>>   * continuing to use module_param is the easiest way to remain
>> @@ -248,7 +253,15 @@ static struct ghes *ghes_new(struct 
>> acpi_hest_generic *generic)
>>      ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
>>      if (!ghes)
>>          return ERR_PTR(-ENOMEM);
>> +
>>      ghes->generic = generic;
>> +    if (HEST_TYPE_GENERIC_V2(ghes)) {
>> +        rc = apei_map_generic_address(
>> +            &ghes->generic_v2->read_ack_register);
>> +        if (rc)
>> +            goto err_unmap;
>
> I think should be goto err_free, see more below.
>
>> +    }
>> +
>>      rc = apei_map_generic_address(&generic->error_status_address);
>>      if (rc)
>>          goto err_free;
>> @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct 
>> acpi_hest_generic *generic)
>>
>>  err_unmap:
>> apei_unmap_generic_address(&generic->error_status_address);
>> +    if (HEST_TYPE_GENERIC_V2(ghes))
>> +        apei_unmap_generic_address(
>> +            &ghes->generic_v2->read_ack_register);
>
> We might end up trying to unmap (error_status_address) which is not 
> mapped
> if we hit the error in mapping read_ack_register. The 
> read_ack_register unmap
> hunk should be moved below to err_free.
>
This needs to be changed, I'll add a separate label for unmapping 
read_ack_register and error_status_address for the case that the 
read_ack_register map succeeds but the error_status_address map fails.

err_unmap_status_addr:
apei_unmap_generic_address(&generic->error_status_address);
err_unmap_read_ack_addr:
         if (HEST_TYPE_GENERIC_V2(ghes))
                 apei_unmap_generic_address(
&ghes->generic_v2->read_ack_register);
err_free:
         kfree(ghes);
         return ERR_PTR(rc);

If mapping read_ack_register fails, goto err_free.
If mapping read_ack_register is successful but mapping 
error_status_address fails, goto err_unmap_read_ack_addr.
And if both mappings succeed but the kmalloc fails, then goto 
err_unmap_status_addr.

>
>>  err_free:
>>      kfree(ghes);
>>      return ERR_PTR(rc);
>> @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes)
>>  {
>>      kfree(ghes->estatus);
>> apei_unmap_generic_address(&ghes->generic->error_status_address);
>> +    if (HEST_TYPE_GENERIC_V2(ghes))
>> +        apei_unmap_generic_address(
>> +            &ghes->generic_v2->read_ack_register);
>>  }
>>
>>  static inline int ghes_severity(int severity)
>> @@ -648,6 +667,23 @@ static void ghes_estatus_cache_add(
>>      rcu_read_unlock();
>>  }
>>
>
>> +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)
>
> nit: We are actually writing something to the read_ack_register. The 
> names
> read_ack_register (which may be as per standard) and more importantly the
> function name (ghes_do_read_ack) sounds a bit misleading.
It is called "Read Ack Register" in the spec (ACPI 6.1 table 18-344), 
but I agree the function name can be improved.
Maybe ghes_acknowledge_error or ghes_ack_error.

Thanks,
Tyler
>
> Rest looks fine to me.
>
> Suzuki
>
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 60746ef..7d020b0 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -45,6 +45,7 @@ 
 #include <linux/aer.h>
 #include <linux/nmi.h>
 
+#include <acpi/actbl1.h>
 #include <acpi/ghes.h>
 #include <acpi/apei.h>
 #include <asm/tlbflush.h>
@@ -79,6 +80,10 @@ 
 	((struct acpi_hest_generic_status *)				\
 	 ((struct ghes_estatus_node *)(estatus_node) + 1))
 
+#define HEST_TYPE_GENERIC_V2(ghes)				\
+	((struct acpi_hest_header *)ghes->generic)->type ==	\
+	 ACPI_HEST_TYPE_GENERIC_ERROR_V2
+
 /*
  * This driver isn't really modular, however for the time being,
  * continuing to use module_param is the easiest way to remain
@@ -248,7 +253,15 @@  static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
 	if (!ghes)
 		return ERR_PTR(-ENOMEM);
+
 	ghes->generic = generic;
+	if (HEST_TYPE_GENERIC_V2(ghes)) {
+		rc = apei_map_generic_address(
+			&ghes->generic_v2->read_ack_register);
+		if (rc)
+			goto err_unmap;
+	}
+
 	rc = apei_map_generic_address(&generic->error_status_address);
 	if (rc)
 		goto err_free;
@@ -270,6 +283,9 @@  static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 
 err_unmap:
 	apei_unmap_generic_address(&generic->error_status_address);
+	if (HEST_TYPE_GENERIC_V2(ghes))
+		apei_unmap_generic_address(
+			&ghes->generic_v2->read_ack_register);
 err_free:
 	kfree(ghes);
 	return ERR_PTR(rc);
@@ -279,6 +295,9 @@  static void ghes_fini(struct ghes *ghes)
 {
 	kfree(ghes->estatus);
 	apei_unmap_generic_address(&ghes->generic->error_status_address);
+	if (HEST_TYPE_GENERIC_V2(ghes))
+		apei_unmap_generic_address(
+			&ghes->generic_v2->read_ack_register);
 }
 
 static inline int ghes_severity(int severity)
@@ -648,6 +667,23 @@  static void ghes_estatus_cache_add(
 	rcu_read_unlock();
 }
 
+static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2)
+{
+	int rc;
+	u64 val = 0;
+
+	rc = apei_read(&val, &generic_v2->read_ack_register);
+	if (rc)
+		return rc;
+	val &= generic_v2->read_ack_preserve <<
+		generic_v2->read_ack_register.bit_offset;
+	val |= generic_v2->read_ack_write <<
+		generic_v2->read_ack_register.bit_offset;
+	rc = apei_write(val, &generic_v2->read_ack_register);
+
+	return rc;
+}
+
 static int ghes_proc(struct ghes *ghes)
 {
 	int rc;
@@ -660,6 +696,12 @@  static int ghes_proc(struct ghes *ghes)
 			ghes_estatus_cache_add(ghes->generic, ghes->estatus);
 	}
 	ghes_do_proc(ghes, ghes->estatus);
+
+	if (HEST_TYPE_GENERIC_V2(ghes)) {
+		rc = ghes_do_read_ack(ghes->generic_v2);
+		if (rc)
+			return rc;
+	}
 out:
 	ghes_clear_estatus(ghes);
 	return 0;
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 792a0d9..ef725a9 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -52,6 +52,7 @@  static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
 	[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
 	[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
 	[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+	[ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
 };
 
 static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
@@ -146,7 +147,8 @@  static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void
 {
 	int *count = data;
 
-	if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR)
+	if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
+	    hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2)
 		(*count)++;
 	return 0;
 }
@@ -157,7 +159,8 @@  static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data)
 	struct ghes_arr *ghes_arr = data;
 	int rc, i;
 
-	if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
+	if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR &&
+	    hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2)
 		return 0;
 
 	if (!((struct acpi_hest_generic *)hest_hdr)->enabled)
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 720446c..68f088a 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -13,7 +13,10 @@ 
 #define GHES_EXITING		0x0002
 
 struct ghes {
-	struct acpi_hest_generic *generic;
+	union {
+		struct acpi_hest_generic *generic;
+		struct acpi_hest_generic_v2 *generic_v2;
+	};
 	struct acpi_hest_generic_status *estatus;
 	u64 buffer_paddr;
 	unsigned long flags;