Message ID | 1475875882-2604-2-git-send-email-tbaicar@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Tyler, A few comments below. Tyler Baicar <tbaicar@codeaurora.org> writes: > A RAS (Reliability, Availability, Serviceability) controller > may be a separate processor running in parallel with OS > execution, and may generate error records for consumption by > the OS. If the RAS controller produces multiple error records, > then they may be overwritten before the OS has consumed them. > > The Generic Hardware Error Source (GHES) v2 structure > introduces the capability for the OS to acknowledge the > consumption of the error record generated by the RAS > controller. A RAS controller supporting GHESv2 shall wait for > the acknowledgment before writing a new error record, thus > eliminating the race condition. > > Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> > Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org> > Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> > Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> > --- > drivers/acpi/apei/ghes.c | 41 +++++++++++++++++++++++++++++++++++++++++ > drivers/acpi/apei/hest.c | 7 +++++-- > include/acpi/ghes.h | 1 + > 3 files changed, 47 insertions(+), 2 deletions(-) > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > index 60746ef..3021f0e 100644 > --- a/drivers/acpi/apei/ghes.c > +++ b/drivers/acpi/apei/ghes.c > @@ -45,6 +45,7 @@ > #include <linux/aer.h> > #include <linux/nmi.h> > > +#include <acpi/actbl1.h> > #include <acpi/ghes.h> > #include <acpi/apei.h> > #include <asm/tlbflush.h> > @@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) > struct ghes *ghes; > unsigned int error_block_length; > int rc; > + struct acpi_hest_header *hest_hdr; > > ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); > if (!ghes) > return ERR_PTR(-ENOMEM); > + > + hest_hdr = (struct acpi_hest_header *)generic; > + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) { > + ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic; > + rc = apei_map_generic_address( > + &ghes->generic_v2->read_ack_register); > + if (rc) > + goto err_unmap; > + } else > + ghes->generic_v2 = NULL; Since you kzalloc ghes, shouldn't ghes->generic_v2 be NULL already? > + > ghes->generic = generic; > rc = apei_map_generic_address(&generic->error_status_address); > if (rc) > @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) > > err_unmap: > apei_unmap_generic_address(&generic->error_status_address); > + if (ghes->generic_v2) > + apei_unmap_generic_address( > + &ghes->generic_v2->read_ack_register); > err_free: > kfree(ghes); > return ERR_PTR(rc); > @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes) > { > kfree(ghes->estatus); > apei_unmap_generic_address(&ghes->generic->error_status_address); > + if (ghes->generic_v2) > + apei_unmap_generic_address( > + &ghes->generic_v2->read_ack_register); > } > > static inline int ghes_severity(int severity) > @@ -648,6 +667,22 @@ static void ghes_estatus_cache_add( > rcu_read_unlock(); > } > > +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2) > +{ > + int rc; > + u64 val = 0; > + > + rc = apei_read(&val, &generic_v2->read_ack_register); > + if (rc) > + return rc; > + val &= generic_v2->read_ack_preserve << > + generic_v2->read_ack_register.bit_offset; > + val |= generic_v2->read_ack_write; Reading the spec, it is not clear whether you need the left shift above. Having said that, if you do it for read_ack_preserve, do you also need to left shift read_ack_write by read_ack_register.bit_offset? > + rc = apei_write(val, &generic_v2->read_ack_register); > + > + return rc; > +} > + > static int ghes_proc(struct ghes *ghes) > { > int rc; > @@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes) > ghes_estatus_cache_add(ghes->generic, ghes->estatus); > } > ghes_do_proc(ghes, ghes->estatus); > + > + if (ghes->generic_v2) { > + rc = ghes_do_read_ack(ghes->generic_v2); > + if (rc) > + return rc; > + } > out: > ghes_clear_estatus(ghes); > return 0; > diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c > index 792a0d9..ef725a9 100644 > --- a/drivers/acpi/apei/hest.c > +++ b/drivers/acpi/apei/hest.c > @@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = { > [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer), > [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge), > [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic), > + [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2), > }; > > static int hest_esrc_len(struct acpi_hest_header *hest_hdr) > @@ -146,7 +147,8 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void > { > int *count = data; > > - if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR) > + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR || > + hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) > (*count)++; > return 0; > } > @@ -157,7 +159,8 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data) > struct ghes_arr *ghes_arr = data; > int rc, i; > > - if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR) > + if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR && > + hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2) > return 0; > > if (!((struct acpi_hest_generic *)hest_hdr)->enabled) > diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h > index 720446c..d0108b6 100644 > --- a/include/acpi/ghes.h > +++ b/include/acpi/ghes.h > @@ -14,6 +14,7 @@ > > struct ghes { > struct acpi_hest_generic *generic; > + struct acpi_hest_generic_v2 *generic_v2; You either have a GHES or a GHESv2 structure. Instead of duplication, could this be represented as a union? Thanks, Punit > struct acpi_hest_generic_status *estatus; > u64 buffer_paddr; > unsigned long flags;
Hello Punit, Thank you for the feedback! Responses below On 10/12/2016 9:39 AM, Punit Agrawal wrote: > Hi Tyler, > > A few comments below. > > Tyler Baicar <tbaicar@codeaurora.org> writes: > >> A RAS (Reliability, Availability, Serviceability) controller >> may be a separate processor running in parallel with OS >> execution, and may generate error records for consumption by >> the OS. If the RAS controller produces multiple error records, >> then they may be overwritten before the OS has consumed them. >> >> The Generic Hardware Error Source (GHES) v2 structure >> introduces the capability for the OS to acknowledge the >> consumption of the error record generated by the RAS >> controller. A RAS controller supporting GHESv2 shall wait for >> the acknowledgment before writing a new error record, thus >> eliminating the race condition. >> >> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> >> Signed-off-by: Richard Ruigrok <rruigrok@codeaurora.org> >> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> >> Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> >> --- >> drivers/acpi/apei/ghes.c | 41 +++++++++++++++++++++++++++++++++++++++++ >> drivers/acpi/apei/hest.c | 7 +++++-- >> include/acpi/ghes.h | 1 + >> 3 files changed, 47 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >> index 60746ef..3021f0e 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -45,6 +45,7 @@ >> #include <linux/aer.h> >> #include <linux/nmi.h> >> >> +#include <acpi/actbl1.h> >> #include <acpi/ghes.h> >> #include <acpi/apei.h> >> #include <asm/tlbflush.h> >> @@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) >> struct ghes *ghes; >> unsigned int error_block_length; >> int rc; >> + struct acpi_hest_header *hest_hdr; >> >> ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); >> if (!ghes) >> return ERR_PTR(-ENOMEM); >> + >> + hest_hdr = (struct acpi_hest_header *)generic; >> + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) { >> + ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic; >> + rc = apei_map_generic_address( >> + &ghes->generic_v2->read_ack_register); >> + if (rc) >> + goto err_unmap; >> + } else >> + ghes->generic_v2 = NULL; > Since you kzalloc ghes, shouldn't ghes->generic_v2 be NULL already? Yes, the documentation says kzalloc returns memory set to zero, so I will remove this else statement. >> + >> ghes->generic = generic; >> rc = apei_map_generic_address(&generic->error_status_address); >> if (rc) >> @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) >> >> err_unmap: >> apei_unmap_generic_address(&generic->error_status_address); >> + if (ghes->generic_v2) >> + apei_unmap_generic_address( >> + &ghes->generic_v2->read_ack_register); >> err_free: >> kfree(ghes); >> return ERR_PTR(rc); >> @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes) >> { >> kfree(ghes->estatus); >> apei_unmap_generic_address(&ghes->generic->error_status_address); >> + if (ghes->generic_v2) >> + apei_unmap_generic_address( >> + &ghes->generic_v2->read_ack_register); >> } >> >> static inline int ghes_severity(int severity) >> @@ -648,6 +667,22 @@ static void ghes_estatus_cache_add( >> rcu_read_unlock(); >> } >> >> +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2) >> +{ >> + int rc; >> + u64 val = 0; >> + >> + rc = apei_read(&val, &generic_v2->read_ack_register); >> + if (rc) >> + return rc; >> + val &= generic_v2->read_ack_preserve << >> + generic_v2->read_ack_register.bit_offset; >> + val |= generic_v2->read_ack_write; > Reading the spec, it is not clear whether you need the left shift > above. > > Having said that, if you do it for read_ack_preserve, do you also need > to left shift read_ack_write by read_ack_register.bit_offset? Good catch, it looks like the read_ack_write should also get this shift. I'm using a shift of 0 so I didn't catch this in testing :) >> + rc = apei_write(val, &generic_v2->read_ack_register); >> + >> + return rc; >> +} >> + >> static int ghes_proc(struct ghes *ghes) >> { >> int rc; >> @@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes) >> ghes_estatus_cache_add(ghes->generic, ghes->estatus); >> } >> ghes_do_proc(ghes, ghes->estatus); >> + >> + if (ghes->generic_v2) { >> + rc = ghes_do_read_ack(ghes->generic_v2); >> + if (rc) >> + return rc; >> + } >> out: >> ghes_clear_estatus(ghes); >> return 0; >> diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c >> index 792a0d9..ef725a9 100644 >> --- a/drivers/acpi/apei/hest.c >> +++ b/drivers/acpi/apei/hest.c >> @@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = { >> [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer), >> [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge), >> [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic), >> + [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2), >> }; >> >> static int hest_esrc_len(struct acpi_hest_header *hest_hdr) >> @@ -146,7 +147,8 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void >> { >> int *count = data; >> >> - if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR) >> + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR || >> + hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) >> (*count)++; >> return 0; >> } >> @@ -157,7 +159,8 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data) >> struct ghes_arr *ghes_arr = data; >> int rc, i; >> >> - if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR) >> + if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR && >> + hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2) >> return 0; >> >> if (!((struct acpi_hest_generic *)hest_hdr)->enabled) >> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h >> index 720446c..d0108b6 100644 >> --- a/include/acpi/ghes.h >> +++ b/include/acpi/ghes.h >> @@ -14,6 +14,7 @@ >> >> struct ghes { >> struct acpi_hest_generic *generic; >> + struct acpi_hest_generic_v2 *generic_v2; > You either have a GHES or a GHESv2 structure. Instead of duplication, > could this be represented as a union? > > Thanks, > Punit I think that should be doable. I'll make these changes in the next version. Thanks, Tyler >> struct acpi_hest_generic_status *estatus; >> u64 buffer_paddr; >> unsigned long flags;
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 60746ef..3021f0e 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -45,6 +45,7 @@ #include <linux/aer.h> #include <linux/nmi.h> +#include <acpi/actbl1.h> #include <acpi/ghes.h> #include <acpi/apei.h> #include <asm/tlbflush.h> @@ -244,10 +245,22 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) struct ghes *ghes; unsigned int error_block_length; int rc; + struct acpi_hest_header *hest_hdr; ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); if (!ghes) return ERR_PTR(-ENOMEM); + + hest_hdr = (struct acpi_hest_header *)generic; + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) { + ghes->generic_v2 = (struct acpi_hest_generic_v2 *)generic; + rc = apei_map_generic_address( + &ghes->generic_v2->read_ack_register); + if (rc) + goto err_unmap; + } else + ghes->generic_v2 = NULL; + ghes->generic = generic; rc = apei_map_generic_address(&generic->error_status_address); if (rc) @@ -270,6 +283,9 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) err_unmap: apei_unmap_generic_address(&generic->error_status_address); + if (ghes->generic_v2) + apei_unmap_generic_address( + &ghes->generic_v2->read_ack_register); err_free: kfree(ghes); return ERR_PTR(rc); @@ -279,6 +295,9 @@ static void ghes_fini(struct ghes *ghes) { kfree(ghes->estatus); apei_unmap_generic_address(&ghes->generic->error_status_address); + if (ghes->generic_v2) + apei_unmap_generic_address( + &ghes->generic_v2->read_ack_register); } static inline int ghes_severity(int severity) @@ -648,6 +667,22 @@ static void ghes_estatus_cache_add( rcu_read_unlock(); } +static int ghes_do_read_ack(struct acpi_hest_generic_v2 *generic_v2) +{ + int rc; + u64 val = 0; + + rc = apei_read(&val, &generic_v2->read_ack_register); + if (rc) + return rc; + val &= generic_v2->read_ack_preserve << + generic_v2->read_ack_register.bit_offset; + val |= generic_v2->read_ack_write; + rc = apei_write(val, &generic_v2->read_ack_register); + + return rc; +} + static int ghes_proc(struct ghes *ghes) { int rc; @@ -660,6 +695,12 @@ static int ghes_proc(struct ghes *ghes) ghes_estatus_cache_add(ghes->generic, ghes->estatus); } ghes_do_proc(ghes, ghes->estatus); + + if (ghes->generic_v2) { + rc = ghes_do_read_ack(ghes->generic_v2); + if (rc) + return rc; + } out: ghes_clear_estatus(ghes); return 0; diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index 792a0d9..ef725a9 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -52,6 +52,7 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = { [ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer), [ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge), [ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic), + [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2), }; static int hest_esrc_len(struct acpi_hest_header *hest_hdr) @@ -146,7 +147,8 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void { int *count = data; - if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR) + if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR || + hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) (*count)++; return 0; } @@ -157,7 +159,8 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data) struct ghes_arr *ghes_arr = data; int rc, i; - if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR) + if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR && + hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2) return 0; if (!((struct acpi_hest_generic *)hest_hdr)->enabled) diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 720446c..d0108b6 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -14,6 +14,7 @@ struct ghes { struct acpi_hest_generic *generic; + struct acpi_hest_generic_v2 *generic_v2; struct acpi_hest_generic_status *estatus; u64 buffer_paddr; unsigned long flags;