Message ID | 1477071013-29563-2-git-send-email-tbaicar@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 21/10/16 18:30, Tyler Baicar wrote: > A RAS (Reliability, Availability, Serviceability) controller > may be a separate processor running in parallel with OS > execution, and may generate error records for consumption by > the OS. If the RAS controller produces multiple error records, > then they may be overwritten before the OS has consumed them. > > The Generic Hardware Error Source (GHES) v2 structure > introduces the capability for the OS to acknowledge the > consumption of the error record generated by the RAS > controller. A RAS controller supporting GHESv2 shall wait for > the acknowledgment before writing a new error record, thus > eliminating the race condition. > > Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> > Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org> > Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> > Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> > --- > drivers/acpi/apei/ghes.c | 42 ++++++++++++++++++++++++++++++++++++++++++ > drivers/acpi/apei/hest.c | 7 +++++-- > include/acpi/ghes.h | 5 ++++- > 3 files changed, 51 insertions(+), 3 deletions(-) > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > index 60746ef..7d020b0 100644 > --- a/drivers/acpi/apei/ghes.c > +++ b/drivers/acpi/apei/ghes.c > @@ -45,6 +45,7 @@ > #include <linux/aer.h> > #include <linux/nmi.h> > > +#include <acpi/actbl1.h> > #include <acpi/ghes.h> > #include <acpi/apei.h> > #include <asm/tlbflush.h> > @@ -79,6 +80,10 @@ > ((struct acpi_hest_generic_status *) \ > ((struct ghes_estatus_node *)(estatus_node) + 1)) > > +#define HEST_TYPE_GENERIC_V2(ghes) \ > + ((struct acpi_hest_header *)ghes->generic)->type == \ > + ACPI_HEST_TYPE_GENERIC_ERROR_V2 > + > /* > * This driver isn't really modular, however for the time being, > * continuing to use module_param is the easiest way to remain > @@ -248,7 +253,15 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) > ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); > if (!ghes) > return ERR_PTR(-ENOMEM); > + > ghes->generic = generic; > + if (HEST_TYPE_GENERIC_V2(ghes)) { > + rc = apei_map_generic_address( > + &ghes->generic_v2->read_ack_register); > + if (rc) > + goto err_unmap; I think should be goto err_free, see more below. > + } > + > rc = apei_map_generic_address(&generic->error_status_address); > if (rc) > goto err_free; > @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) > > err_unmap: > apei_unmap_generic_address(&generic->error_status_address); > + if (HEST_TYPE_GENERIC_V2(ghes)) > + apei_unmap_generic_address( > + &ghes->generic_v2->read_ack_register); We might end up trying to unmap (error_status_address) which is not mapped if we hit the error in mapping read_ack_register. The read_ack_register unmap hunk should be moved below to err_free. > err_free: > kfree(ghes); > return ERR_PTR(rc); > @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes) > { > kfree(ghes->estatus); > apei_unmap_generic_address(&ghes->generic->error_status_address); > + if (HEST_TYPE_GENERIC_V2(ghes)) > + apei_unmap_generic_address( > + &ghes->generic_v2->read_ack_register); > } > > static inline int ghes_severity(int severity) > @@ -648,6 +667,23 @@ static void ghes_estatus_cache_add( > rcu_read_unlock(); > } > > +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2) nit: We are actually writing something to the read_ack_register. The names read_ack_register (which may be as per standard) and more importantly the function name (ghes_do_read_ack) sounds a bit misleading. Rest looks fine to me. Suzuki
On 10/24/2016 2:51 AM, Suzuki K Poulose wrote: > On 21/10/16 18:30, Tyler Baicar wrote: >> A RAS (Reliability, Availability, Serviceability) controller >> may be a separate processor running in parallel with OS >> execution, and may generate error records for consumption by >> the OS. If the RAS controller produces multiple error records, >> then they may be overwritten before the OS has consumed them. >> >> The Generic Hardware Error Source (GHES) v2 structure >> introduces the capability for the OS to acknowledge the >> consumption of the error record generated by the RAS >> controller. A RAS controller supporting GHESv2 shall wait for >> the acknowledgment before writing a new error record, thus >> eliminating the race condition. >> >> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> >> Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org> >> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> >> Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> >> --- >> drivers/acpi/apei/ghes.c | 42 >> ++++++++++++++++++++++++++++++++++++++++++ >> drivers/acpi/apei/hest.c | 7 +++++-- >> include/acpi/ghes.h | 5 ++++- >> 3 files changed, 51 insertions(+), 3 deletions(-) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >> index 60746ef..7d020b0 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -45,6 +45,7 @@ >> #include <linux/aer.h> >> #include <linux/nmi.h> >> >> +#include <acpi/actbl1.h> >> #include <acpi/ghes.h> >> #include <acpi/apei.h> >> #include <asm/tlbflush.h> >> @@ -79,6 +80,10 @@ >> ((struct acpi_hest_generic_status *) \ >> ((struct ghes_estatus_node *)(estatus_node) + 1)) >> >> +#define HEST_TYPE_GENERIC_V2(ghes) \ >> + ((struct acpi_hest_header *)ghes->generic)->type == \ >> + ACPI_HEST_TYPE_GENERIC_ERROR_V2 >> + >> /* >> * This driver isn't really modular, however for the time being, >> * continuing to use module_param is the easiest way to remain >> @@ -248,7 +253,15 @@ static struct ghes *ghes_new(struct >> acpi_hest_generic *generic) >> ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); >> if (!ghes) >> return ERR_PTR(-ENOMEM); >> + >> ghes->generic = generic; >> + if (HEST_TYPE_GENERIC_V2(ghes)) { >> + rc = apei_map_generic_address( >> + &ghes->generic_v2->read_ack_register); >> + if (rc) >> + goto err_unmap; > > I think should be goto err_free, see more below. > >> + } >> + >> rc = apei_map_generic_address(&generic->error_status_address); >> if (rc) >> goto err_free; >> @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct >> acpi_hest_generic *generic) >> >> err_unmap: >> apei_unmap_generic_address(&generic->error_status_address); >> + if (HEST_TYPE_GENERIC_V2(ghes)) >> + apei_unmap_generic_address( >> + &ghes->generic_v2->read_ack_register); > > We might end up trying to unmap (error_status_address) which is not > mapped > if we hit the error in mapping read_ack_register. The > read_ack_register unmap > hunk should be moved below to err_free. > This needs to be changed, I'll add a separate label for unmapping read_ack_register and error_status_address for the case that the read_ack_register map succeeds but the error_status_address map fails. err_unmap_status_addr: apei_unmap_generic_address(&generic->error_status_address); err_unmap_read_ack_addr: if (HEST_TYPE_GENERIC_V2(ghes)) apei_unmap_generic_address( &ghes->generic_v2->read_ack_register); err_free: kfree(ghes); return ERR_PTR(rc); If mapping read_ack_register fails, goto err_free. If mapping read_ack_register is successful but mapping error_status_address fails, goto err_unmap_read_ack_addr. And if both mappings succeed but the kmalloc fails, then goto err_unmap_status_addr. > >> err_free: >> kfree(ghes); >> return ERR_PTR(rc); >> @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes) >> { >> kfree(ghes->estatus); >> apei_unmap_generic_address(&ghes->generic->error_status_address); >> + if (HEST_TYPE_GENERIC_V2(ghes)) >> + apei_unmap_generic_address( >> + &ghes->generic_v2->read_ack_register); >> } >> >> static inline int ghes_severity(int severity) >> @@ -648,6 +667,23 @@ static void ghes_estatus_cache_add( >> rcu_read_unlock(); >> } >> > >> +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2) > > nit: We are actually writing something to the read_ack_register. The > names > read_ack_register (which may be as per standard) and more importantly the > function name (ghes_do_read_ack) sounds a bit misleading. It is called "Read Ack Register" in the spec (ACPI 6.1 table 18-344), but I agree the function name can be improved. Maybe ghes_acknowledge_error or ghes_ack_error. Thanks, Tyler > > Rest looks fine to me. > > Suzuki >
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 60746ef..7d020b0 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -45,6 +45,7 @@ #include <linux/aer.h> #include <linux/nmi.h> +#include <acpi/actbl1.h> #include <acpi/ghes.h> #include <acpi/apei.h> #include <asm/tlbflush.h> @@ -79,6 +80,10 @@ ((struct acpi_hest_generic_status *) \ ((struct ghes_estatus_node *)(estatus_node) + 1)) +#define HEST_TYPE_GENERIC_V2(ghes) \ + ((struct acpi_hest_header *)ghes->generic)->type == \ + ACPI_HEST_TYPE_GENERIC_ERROR_V2 + /* * This driver isn't really modular, however for the time being, * continuing to use module_param is the easiest way to remain @@ -248,7 +253,15 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); if (!ghes) return ERR_PTR(-ENOMEM); + ghes->generic = generic; + if (HEST_TYPE_GENERIC_V2(ghes)) { + rc = apei_map_generic_address( + &ghes->generic_v2->read_ack_register); + if (rc) + goto err_unmap; + } + rc = apei_map_generic_address(&generic->error_status_address); if (rc) goto err_free; @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) err_unmap: apei_unmap_generic_address(&generic->error_status_address); + if (HEST_TYPE_GENERIC_V2(ghes)) + apei_unmap_generic_address( + &ghes->generic_v2->read_ack_register); err_free: kfree(ghes); return ERR_PTR(rc); @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes) { kfree(ghes->estatus); apei_unmap_generic_address(&ghes->generic->error_status_address); + if (HEST_TYPE_GENERIC_V2(ghes)) + apei_unmap_generic_address( + &ghes->generic_v2->read_ack_register); } static inline int ghes_severity(int severity) @@ -648,6 +667,23 @@ static void ghes_estatus_cache_add( rcu_read_unlock(); } +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2) +{ + int rc; + u64 val = 0; + + rc = apei_read(&val, &generic_v2->read_ack_register); + if (rc) + return rc; + val &= generic_v2->read_ack_preserve << + generic_v2->read_ack_register.bit_offset; + val |= generic_v2->read_ack_write << + generic_v2->read_ack_register.bit_offset; + rc = apei_write(val, &generic_v2->read_ack_register); + + return rc; +} + static int ghes_proc(struct ghes *ghes) { int rc; @@ -660,6 +696,12 @@ static int ghes_proc(struct ghes *ghes) ghes_estatus_cache_add(ghes->generic, ghes->estatus); } ghes_do_proc(ghes, ghes->estatus); + + if (HEST_TYPE_GENERIC_V2(ghes)) { + rc = ghes_do_read_ack(ghes->generic_v2); + if (rc) + return rc; + } out: ghes_clear_estatus(ghes); return 0; diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index 792a0d9..ef725a9 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = { [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer), [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge), [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic), + [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2), }; static int hest_esrc_len(struct acpi_hest_header *hest_hdr) @@ -146,7 +147,8 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void { int *count = data; - if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR) + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR || + hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) (*count)++; return 0; } @@ -157,7 +159,8 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data) struct ghes_arr *ghes_arr = data; int rc, i; - if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR) + if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR && + hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2) return 0; if (!((struct acpi_hest_generic *)hest_hdr)->enabled) diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 720446c..68f088a 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -13,7 +13,10 @@ #define GHES_EXITING 0x0002 struct ghes { - struct acpi_hest_generic *generic; + union { + struct acpi_hest_generic *generic; + struct acpi_hest_generic_v2 *generic_v2; + }; struct acpi_hest_generic_status *estatus; u64 buffer_paddr; unsigned long flags;