Message ID | 86258A5CC0A3704780874CF6004BA8A62DCAB57E@lhreml502-mbs (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
[[RFC PATCH V1 v4.10-rc3 1/1] acpi: apei: handle GSIV notification type] On 27/02/2017 (Mon 18:19) Shiju Jose wrote: > Add a new GHES error source handling function for > GSIV(Global System Interrupt Vector). > If an error source's notification type is GSIV, > then this handling function can be registered > into the GSIV handler and it can parse > and report error information when they occur. > > Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com> > Signed-off-by: Shiju Jose <shiju.jose@huawei.com> > --- > arch/arm64/Kconfig | 1 + > drivers/acpi/apei/Kconfig | 9 +++++++ > drivers/acpi/apei/ghes.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 76 insertions(+) > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 1117421..e41fdcf 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -53,6 +53,7 @@ config ARM64 > select HANDLE_DOMAIN_IRQ > select HARDIRQS_SW_RESEND > select HAVE_ACPI_APEI if (ACPI && EFI) > + select HAVE_ACPI_APEI_GSIV if (ACPI && EFI) > select HAVE_ALIGNED_STRUCT_PAGE if SLUB > select HAVE_ARCH_AUDITSYSCALL > select HAVE_ARCH_BITREVERSE > diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig > index b0140c8..f45ddb5 100644 > --- a/drivers/acpi/apei/Kconfig > +++ b/drivers/acpi/apei/Kconfig > @@ -1,6 +1,15 @@ > config HAVE_ACPI_APEI > bool > > +config HAVE_ACPI_APEI_GSIV > + bool "APEI GSIV(Global System Interrupt) logging/recovering support" > + depends on ACPI_APEI && ACPI_APEI_GHES > + help > + This option should be enabled if the system supports > + firmware first handling of GSIV (Global System Interrupt) > + for the hardware errors and allows OS to do error > + recovery/logging. > + > config HAVE_ACPI_APEI_NMI > bool A "config HAVE_<foo>" option normally doesn't have a "help" section since it isn't meant to be user selected. You can see that in the other two options that exist right here in the context of your patch. Also your Cc list here seems rather large; not sure how you came up with such a large list... P. -- > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > index e53bef6..e1611d2 100644 > --- a/drivers/acpi/apei/ghes.c > +++ b/drivers/acpi/apei/ghes.c > @@ -721,6 +721,58 @@ static struct notifier_block ghes_notifier_sci = { > .notifier_call = ghes_notify_sci, > }; > > +#ifdef CONFIG_HAVE_ACPI_APEI_GSIV > +static LIST_HEAD(ghes_gsiv); > + > +static int ghes_notify_gsiv(struct notifier_block *this, > + unsigned long event, void *data) > +{ > + struct ghes *ghes; > + int ret = NOTIFY_DONE; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(ghes, &ghes_gsiv, list) { > + if (!ghes_proc(ghes)) > + ret = NOTIFY_OK; > + } > + rcu_read_unlock(); > + return ret; > +} > +static struct notifier_block ghes_notifier_gsiv = { > + .notifier_call = ghes_notify_gsiv, > +}; > + > +static void ghes_gsiv_add(struct ghes *ghes) > +{ > + mutex_lock(&ghes_list_mutex); > + if (list_empty(&ghes_gsiv)) > + register_acpi_hed_notifier(&ghes_notifier_gsiv); > + list_add_rcu(&ghes->list, &ghes_gsiv); > + mutex_unlock(&ghes_list_mutex); > +} > + > +static void ghes_gsiv_remove(struct ghes *ghes) > +{ > + mutex_lock(&ghes_list_mutex); > + list_del_rcu(&ghes->list); > + if (list_empty(&ghes_gsiv)) > + unregister_acpi_hed_notifier(&ghes_notifier_gsiv); > + mutex_unlock(&ghes_list_mutex); > +} > +#else /* CONFIG_HAVE_ACPI_APEI_GSIV */ > +static inline void ghes_gsiv_add(struct ghes *ghes) > +{ > + pr_err(GHES_PFX "ID: %d, trying to add GSIV notification which is not supported\n", > + ghes->generic->header.source_id); > +} > + > +static inline void ghes_gsiv_remove(struct ghes *ghes) > +{ > + pr_err(GHES_PFX "ID: %d, trying to remove GSIV notification which is not supported\n", > + ghes->generic->header.source_id); > +} > +#endif /* CONFIG_HAVE_ACPI_APEI_GSIV */ > + > #ifdef CONFIG_HAVE_ACPI_APEI_NMI > /* > * printk is not safe in NMI context. So in NMI handler, we allocate > @@ -973,6 +1025,14 @@ static int ghes_probe(struct platform_device *ghes_dev) > goto err; > } > break; > + case ACPI_HEST_NOTIFY_GSIV: > + if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_GSIV)) { > + pr_warn(GHES_PFX "Generic hardware error source: %d notified via notification GSIV is not supported\n", > + generic->header.source_id); > + rc = -ENOTSUPP; > + goto err; > + } > + break; > case ACPI_HEST_NOTIFY_LOCAL: > pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", > generic->header.source_id); > @@ -1034,6 +1094,9 @@ static int ghes_probe(struct platform_device *ghes_dev) > case ACPI_HEST_NOTIFY_NMI: > ghes_nmi_add(ghes); > break; > + case ACPI_HEST_NOTIFY_GSIV: > + ghes_gsiv_add(ghes); > + break; > default: > BUG(); > } > @@ -1076,6 +1139,9 @@ static int ghes_remove(struct platform_device *ghes_dev) > case ACPI_HEST_NOTIFY_NMI: > ghes_nmi_remove(ghes); > break; > + case ACPI_HEST_NOTIFY_GSIV: > + ghes_gsiv_remove(ghes); > + break; > default: > BUG(); > break; > -- > 2.1.4 >
On Mon, 27 Feb 2017 13:58:19 -0500 Paul Gortmaker <paul.gortmaker@windriver.com> wrote: > > --- a/drivers/acpi/apei/Kconfig > > +++ b/drivers/acpi/apei/Kconfig > > @@ -1,6 +1,15 @@ > > config HAVE_ACPI_APEI > > bool > > > > +config HAVE_ACPI_APEI_GSIV > > + bool "APEI GSIV(Global System Interrupt) logging/recovering support" > > + depends on ACPI_APEI && ACPI_APEI_GHES > > + help > > + This option should be enabled if the system supports > > + firmware first handling of GSIV (Global System Interrupt) > > + for the hardware errors and allows OS to do error > > + recovery/logging. > > + > > config HAVE_ACPI_APEI_NMI > > bool > > A "config HAVE_<foo>" option normally doesn't have a "help" section > since it isn't meant to be user selected. You can see that in the other > two options that exist right here in the context of your patch. I will argue that having "help" messages for even configs that do not get selected are useful. It helps reviewers of the code to know why one of these events should be enabled or not. > > Also your Cc list here seems rather large; not sure how you came up with > such a large list... +1 -- Steve
Hi Shiju, On 27/02/17 18:19, Shiju Jose wrote: > Add a new GHES error source handling function for > GSIV(Global System Interrupt Vector). > If an error source's notification type is GSIV, > then this handling function can be registered > into the GSIV handler and it can parse > and report error information when they occur. I'm missing some of the story here, but how is GSIV different from 'External Interrupt'? I'm guessing something other than the CPU takes this 'interrupt'... The GHES GSIV code below is identical to the behaviour of the SCI Notification type... are these two names for the same thing? (I'm confused!) Thanks, James > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > index e53bef6..e1611d2 100644 > --- a/drivers/acpi/apei/ghes.c > +++ b/drivers/acpi/apei/ghes.c > @@ -721,6 +721,58 @@ static struct notifier_block ghes_notifier_sci = { > .notifier_call = ghes_notify_sci, > }; > > +#ifdef CONFIG_HAVE_ACPI_APEI_GSIV > +static LIST_HEAD(ghes_gsiv); > + > +static int ghes_notify_gsiv(struct notifier_block *this, > + unsigned long event, void *data) > +{ > + struct ghes *ghes; > + int ret = NOTIFY_DONE; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(ghes, &ghes_gsiv, list) { > + if (!ghes_proc(ghes)) > + ret = NOTIFY_OK; > + } > + rcu_read_unlock(); > + return ret; > +} > +static struct notifier_block ghes_notifier_gsiv = { > + .notifier_call = ghes_notify_gsiv, > +}; > + > +static void ghes_gsiv_add(struct ghes *ghes) > +{ > + mutex_lock(&ghes_list_mutex); > + if (list_empty(&ghes_gsiv)) > + register_acpi_hed_notifier(&ghes_notifier_gsiv); > + list_add_rcu(&ghes->list, &ghes_gsiv); > + mutex_unlock(&ghes_list_mutex); > +} > + > +static void ghes_gsiv_remove(struct ghes *ghes) > +{ > + mutex_lock(&ghes_list_mutex); > + list_del_rcu(&ghes->list); > + if (list_empty(&ghes_gsiv)) > + unregister_acpi_hed_notifier(&ghes_notifier_gsiv); > + mutex_unlock(&ghes_list_mutex); > +} > +#else /* CONFIG_HAVE_ACPI_APEI_GSIV */ > +static inline void ghes_gsiv_add(struct ghes *ghes) > +{ > + pr_err(GHES_PFX "ID: %d, trying to add GSIV notification which is not supported\n", > + ghes->generic->header.source_id); > +} > + > +static inline void ghes_gsiv_remove(struct ghes *ghes) > +{ > + pr_err(GHES_PFX "ID: %d, trying to remove GSIV notification which is not supported\n", > + ghes->generic->header.source_id); > +} > +#endif /* CONFIG_HAVE_ACPI_APEI_GSIV */ > + > #ifdef CONFIG_HAVE_ACPI_APEI_NMI > /* > * printk is not safe in NMI context. So in NMI handler, we allocate > @@ -973,6 +1025,14 @@ static int ghes_probe(struct platform_device *ghes_dev) > goto err; > } > break; > + case ACPI_HEST_NOTIFY_GSIV: > + if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_GSIV)) { > + pr_warn(GHES_PFX "Generic hardware error source: %d notified via notification GSIV is not supported\n", > + generic->header.source_id); > + rc = -ENOTSUPP; > + goto err; > + } > + break; > case ACPI_HEST_NOTIFY_LOCAL: > pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", > generic->header.source_id); > @@ -1034,6 +1094,9 @@ static int ghes_probe(struct platform_device *ghes_dev) > case ACPI_HEST_NOTIFY_NMI: > ghes_nmi_add(ghes); > break; > + case ACPI_HEST_NOTIFY_GSIV: > + ghes_gsiv_add(ghes); > + break; > default: > BUG(); > } > @@ -1076,6 +1139,9 @@ static int ghes_remove(struct platform_device *ghes_dev) > case ACPI_HEST_NOTIFY_NMI: > ghes_nmi_remove(ghes); > break; > + case ACPI_HEST_NOTIFY_GSIV: > + ghes_gsiv_remove(ghes); > + break; > default: > BUG(); > break; >
Hi James, On 2017/2/28 21:22, James Morse wrote: > Hi Shiju, > > On 27/02/17 18:19, Shiju Jose wrote: >> Add a new GHES error source handling function for >> GSIV(Global System Interrupt Vector). >> If an error source's notification type is GSIV, >> then this handling function can be registered >> into the GSIV handler and it can parse >> and report error information when they occur. > > I'm missing some of the story here, but how is GSIV different from 'External > Interrupt'? I'm guessing something other than the CPU takes this 'interrupt'... Yes, it's the same from CPU side (they are interrupts!), but there is history behind them and the usage is different. I think External Interrupt was introduced before ACPI is available on ARM (hardware reduced platforms), so I guess it was used for errors reported to OS which were not using SCI mechanism, for example, some IO error reporting. For External Interrupt, we don't use the ACPI event system, so for the firmware, it just report the errors associate with the interrupt number, the kernel map the interrupt number and install the irq handler for it. For GSIV based event, it was introduced for hardware reduced platform in recent ACPI revision, and ARM64 is one of its consumers. When errors are reported via GSIV, ACPI event notification needs to be implemented and requires the platform to define a hardware error device (PNP0C33) in ACPI namespace, and also a generic event device ACPI0013. For example, if we are using SPI (ARM GIC) 100 to report errors, there is a ACPI0013 driver in drivers/acpi/evged.c to register the irq, when error happened and trigger the interrupt, ACPI0013 driver will notify the error device (PNP0C33), then error device driver (drivers/acpi/hed.c) will process the error data form APEI table... So GSIV is quite different from 'External interrupt' in the way of working from both firmware and kernel side. > > > The GHES GSIV code below is identical to the behaviour of the SCI Notification > type... are these two names for the same thing? (I'm confused!) SCI is also an 'interrupt' but it's a special irq number for ACPI event, and it has GPE (general purpose event) registers behind it, which is used only on Intel platforms. SCI based event use Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV is used for HW-reduced platform which has no GPEs. Hope it can clarify something :) Thanks Hanjun
Hi Hanjun, On 01/03/17 08:27, Hanjun Guo wrote: > On 2017/2/28 21:22, James Morse wrote: >> On 27/02/17 18:19, Shiju Jose wrote: >>> Add a new GHES error source handling function for >>> GSIV(Global System Interrupt Vector). >>> If an error source's notification type is GSIV, >>> then this handling function can be registered >>> into the GSIV handler and it can parse >>> and report error information when they occur. >> >> I'm missing some of the story here, but how is GSIV different from 'External >> Interrupt'? I'm guessing something other than the CPU takes this 'interrupt'... > > Yes, it's the same from CPU side (they are interrupts!), but there > is history behind them and the usage is different. > > I think External Interrupt was introduced before ACPI is available on > ARM (hardware reduced platforms), so I guess it was used for errors > reported to OS which were not using SCI mechanism, for example, some > IO error reporting. > > For External Interrupt, we don't use the ACPI event system, so for > the firmware, it just report the errors associate with the interrupt > number, the kernel map the interrupt number and install the > irq handler for it. > > For GSIV based event, it was introduced for hardware reduced platform > in recent ACPI revision, and ARM64 is one of its consumers. When > errors are reported via GSIV, ACPI event notification needs to be > implemented and requires the platform to define a hardware error device > (PNP0C33) in ACPI namespace, and also a generic event device ACPI0013. Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI' means the same but the route they take to get into APEI is different. > For example, if we are using SPI (ARM GIC) 100 to report errors, there > is a ACPI0013 driver in drivers/acpi/evged.c to register the irq, when Aha, this is where the interrupt-magic happens. > error happened and trigger the interrupt, ACPI0013 driver will > notify the error device (PNP0C33), then error device driver > (drivers/acpi/hed.c) will process the error data form APEI table... >> The GHES GSIV code below is identical to the behaviour of the SCI Notification >> type... are these two names for the same thing? (I'm confused!) > > SCI is also an 'interrupt' but it's a special irq number for ACPI > event, and it has GPE (general purpose event) registers behind it, > which is used only on Intel platforms. SCI based event use > Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV > is used for HW-reduced platform which has no GPEs. > Hope it can clarify something :) Yes thanks! (the mist is slowly clearing...) If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive notification from PNP0C33', is there any point having separate lists and add/remove functions for them? Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci list to describe 'hed' instead, then group the two case statements together? There would be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as SCI is already built-in and this way the code added is tiny. The only thing we would lose is the name 'GSIV' in the not-supported error message which we don't need if its always supported. Thanks, James
Hi James, > -----Original Message----- > From: James Morse [mailto:james.morse@arm.com] > Sent: 01 March 2017 17:04 > To: Hanjun Guo; Shiju Jose > Cc: christoffer.dall@linaro.org; marc.zyngier@arm.com; > pbonzini@redhat.com; rkrcmar@redhat.com; linux@armlinux.org.uk; > catalin.marinas@arm.com; will.deacon@arm.com; rjw@rjwysocki.net; > lenb@kernel.org; matt@codeblueprint.co.uk; robert.moore@intel.com; > lv.zheng@intel.com; nkaje@codeaurora.org; zjzhang@codeaurora.org; > mark.rutland@arm.com; akpm@linux-foundation.org; > eun.taik.lee@samsung.com; sandeepa.s.prabhu@gmail.com; > labbott@redhat.com; shijie.huang@arm.com; rruigrok@codeaurora.org; > paul.gortmaker@windriver.com; tn@semihalf.com; fu.wei@linaro.org; > rostedt@goodmis.org; bristot@redhat.com; linux-arm- > kernel@lists.infradead.org; kvmarm@lists.cs.columbia.edu; > kvm@vger.kernel.org; linux-kernel@vger.kernel.org; linux- > acpi@vger.kernel.org; linux-efi@vger.kernel.org; devel@acpica.org; > Suzuki.Poulose@arm.com; punit.agrawal@arm.com; astone@redhat.com; > harba@codeaurora.org; Tyler Baicar; joe@perches.com; John Garry; > Gabriele Paoloni; Guohanjun (Hanjun Guo); wangxiongfeng (C); Zhengqiang > (turing) > Subject: Re: [RFC PATCH V1 v4.10-rc3 1/1] acpi: apei: handle GSIV > notification type > > Hi Hanjun, > > On 01/03/17 08:27, Hanjun Guo wrote: > > On 2017/2/28 21:22, James Morse wrote: > >> On 27/02/17 18:19, Shiju Jose wrote: > >>> Add a new GHES error source handling function for GSIV(Global > System > >>> Interrupt Vector). > >>> If an error source's notification type is GSIV, then this handling > >>> function can be registered into the GSIV handler and it can parse > >>> and report error information when they occur. > >> > >> I'm missing some of the story here, but how is GSIV different from > >> 'External Interrupt'? I'm guessing something other than the CPU > takes this 'interrupt'... > > > > Yes, it's the same from CPU side (they are interrupts!), but there is > > history behind them and the usage is different. > > > > I think External Interrupt was introduced before ACPI is available on > > ARM (hardware reduced platforms), so I guess it was used for errors > > reported to OS which were not using SCI mechanism, for example, some > > IO error reporting. > > > > For External Interrupt, we don't use the ACPI event system, so for > the > > firmware, it just report the errors associate with the interrupt > > number, the kernel map the interrupt number and install the irq > > handler for it. > > > > For GSIV based event, it was introduced for hardware reduced platform > > in recent ACPI revision, and ARM64 is one of its consumers. When > > errors are reported via GSIV, ACPI event notification needs to be > > implemented and requires the platform to define a hardware error > > device > > (PNP0C33) in ACPI namespace, and also a generic event device ACPI0013. > > Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI' means > the same but the route they take to get into APEI is different. > > > > For example, if we are using SPI (ARM GIC) 100 to report errors, > there > > is a ACPI0013 driver in drivers/acpi/evged.c to register the irq, > when > > Aha, this is where the interrupt-magic happens. > > > > error happened and trigger the interrupt, ACPI0013 driver will notify > > the error device (PNP0C33), then error device driver > > (drivers/acpi/hed.c) will process the error data form APEI table... > > > >> The GHES GSIV code below is identical to the behaviour of the SCI > >> Notification type... are these two names for the same thing? (I'm > >> confused!) > > > > SCI is also an 'interrupt' but it's a special irq number for ACPI > > event, and it has GPE (general purpose event) registers behind it, > > which is used only on Intel platforms. SCI based event use > > Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV is > > used for HW-reduced platform which has no GPEs. > > > Hope it can clarify something :) > > Yes thanks! (the mist is slowly clearing...) > > If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive > notification from PNP0C33', is there any point having separate lists > and add/remove functions for them? > > Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci list > to describe 'hed' instead, then group the two case statements together? > There would be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as > SCI is already built-in and this way the code added is tiny. The only > thing we would lose is the name 'GSIV' in the not-supported error > message which we don't need if its always supported. This method was tested ok. However we were not sure about reusing/changing the existing ghes_notifier_sci() for gsiv will be accepted. Thus added a separate handling function ghes_notifier_gsiv() for gsiv. Thanks, Shiju > > > Thanks, > > James
On 2017/3/2 21:45, Shiju Jose wrote: > Hi James, > >> >> Hi Hanjun, >> >> On 01/03/17 08:27, Hanjun Guo wrote: >>> On 2017/2/28 21:22, James Morse wrote: >>>> On 27/02/17 18:19, Shiju Jose wrote: >>>>> Add a new GHES error source handling function for GSIV(Global >> System >>>>> Interrupt Vector). >>>>> If an error source's notification type is GSIV, then this handling >>>>> function can be registered into the GSIV handler and it can parse >>>>> and report error information when they occur. >>>> >>>> I'm missing some of the story here, but how is GSIV different from >>>> 'External Interrupt'? I'm guessing something other than the CPU >> takes this 'interrupt'... >>> >>> Yes, it's the same from CPU side (they are interrupts!), but there is >>> history behind them and the usage is different. >>> >>> I think External Interrupt was introduced before ACPI is available on >>> ARM (hardware reduced platforms), so I guess it was used for errors >>> reported to OS which were not using SCI mechanism, for example, some >>> IO error reporting. >>> >>> For External Interrupt, we don't use the ACPI event system, so for >> the >>> firmware, it just report the errors associate with the interrupt >>> number, the kernel map the interrupt number and install the irq >>> handler for it. >>> >>> For GSIV based event, it was introduced for hardware reduced platform >>> in recent ACPI revision, and ARM64 is one of its consumers. When >>> errors are reported via GSIV, ACPI event notification needs to be >>> implemented and requires the platform to define a hardware error >>> device >>> (PNP0C33) in ACPI namespace, and also a generic event device ACPI0013. >> >> Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI' means >> the same but the route they take to get into APEI is different. >> >> >>> For example, if we are using SPI (ARM GIC) 100 to report errors, >> there >>> is a ACPI0013 driver in drivers/acpi/evged.c to register the irq, >> when >> >> Aha, this is where the interrupt-magic happens. >> >> >>> error happened and trigger the interrupt, ACPI0013 driver will notify >>> the error device (PNP0C33), then error device driver >>> (drivers/acpi/hed.c) will process the error data form APEI table... >> >> >>>> The GHES GSIV code below is identical to the behaviour of the SCI >>>> Notification type... are these two names for the same thing? (I'm >>>> confused!) >>> >>> SCI is also an 'interrupt' but it's a special irq number for ACPI >>> event, and it has GPE (general purpose event) registers behind it, >>> which is used only on Intel platforms. SCI based event use >>> Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV is >>> used for HW-reduced platform which has no GPEs. >> >>> Hope it can clarify something :) >> >> Yes thanks! (the mist is slowly clearing...) >> >> If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive >> notification from PNP0C33', is there any point having separate lists >> and add/remove functions for them? >> >> Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci list >> to describe 'hed' instead, then group the two case statements together? >> There would be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as >> SCI is already built-in and this way the code added is tiny. The only >> thing we would lose is the name 'GSIV' in the not-supported error >> message which we don't need if its always supported. > > This method was tested ok. However we were not sure about reusing/changing the > existing ghes_notifier_sci() for gsiv will be accepted. For now a notify (SCI/GSIV/GPIO) will trigger the process of all the ghes data even they are on different list, so add them on a single list and process them will have the same effect. > Thus added a separate handling function ghes_notifier_gsiv() for gsiv. I think we can prepare the patch and send out for review. Thanks Hanjun
Hi Hanjun, > -----Original Message----- > From: Hanjun Guo [mailto:hanjun.guo@linaro.org] > Sent: 03 March 2017 04:20 > To: Shiju Jose; James Morse > Cc: christoffer.dall@linaro.org; marc.zyngier@arm.com; > pbonzini@redhat.com; rkrcmar@redhat.com; linux@armlinux.org.uk; > catalin.marinas@arm.com; will.deacon@arm.com; rjw@rjwysocki.net; > lenb@kernel.org; matt@codeblueprint.co.uk; robert.moore@intel.com; > lv.zheng@intel.com; nkaje@codeaurora.org; zjzhang@codeaurora.org; > mark.rutland@arm.com; akpm@linux-foundation.org; > eun.taik.lee@samsung.com; sandeepa.s.prabhu@gmail.com; > labbott@redhat.com; shijie.huang@arm.com; rruigrok@codeaurora.org; > paul.gortmaker@windriver.com; tn@semihalf.com; fu.wei@linaro.org; > rostedt@goodmis.org; bristot@redhat.com; linux-arm- > kernel@lists.infradead.org; kvmarm@lists.cs.columbia.edu; > kvm@vger.kernel.org; linux-kernel@vger.kernel.org; linux- > acpi@vger.kernel.org; linux-efi@vger.kernel.org; devel@acpica.org; > Suzuki.Poulose@arm.com; punit.agrawal@arm.com; astone@redhat.com; > harba@codeaurora.org; Tyler Baicar; joe@perches.com; John Garry; > Gabriele Paoloni; Guohanjun (Hanjun Guo); wangxiongfeng (C); Zhengqiang > (turing) > Subject: Re: [RFC PATCH V1 v4.10-rc3 1/1] acpi: apei: handle GSIV > notification type > > On 2017/3/2 21:45, Shiju Jose wrote: > > Hi James, > > > >> > >> Hi Hanjun, > >> > >> On 01/03/17 08:27, Hanjun Guo wrote: > >>> On 2017/2/28 21:22, James Morse wrote: > >>>> On 27/02/17 18:19, Shiju Jose wrote: > >>>>> Add a new GHES error source handling function for GSIV(Global > >> System > >>>>> Interrupt Vector). > >>>>> If an error source's notification type is GSIV, then this > handling > >>>>> function can be registered into the GSIV handler and it can parse > >>>>> and report error information when they occur. > >>>> > >>>> I'm missing some of the story here, but how is GSIV different from > >>>> 'External Interrupt'? I'm guessing something other than the CPU > >> takes this 'interrupt'... > >>> > >>> Yes, it's the same from CPU side (they are interrupts!), but there > >>> is history behind them and the usage is different. > >>> > >>> I think External Interrupt was introduced before ACPI is available > >>> on ARM (hardware reduced platforms), so I guess it was used for > >>> errors reported to OS which were not using SCI mechanism, for > >>> example, some IO error reporting. > >>> > >>> For External Interrupt, we don't use the ACPI event system, so for > >> the > >>> firmware, it just report the errors associate with the interrupt > >>> number, the kernel map the interrupt number and install the irq > >>> handler for it. > >>> > >>> For GSIV based event, it was introduced for hardware reduced > >>> platform in recent ACPI revision, and ARM64 is one of its consumers. > >>> When errors are reported via GSIV, ACPI event notification needs to > >>> be implemented and requires the platform to define a hardware error > >>> device > >>> (PNP0C33) in ACPI namespace, and also a generic event device > ACPI0013. > >> > >> Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI' > >> means the same but the route they take to get into APEI is different. > >> > >> > >>> For example, if we are using SPI (ARM GIC) 100 to report errors, > >> there > >>> is a ACPI0013 driver in drivers/acpi/evged.c to register the irq, > >> when > >> > >> Aha, this is where the interrupt-magic happens. > >> > >> > >>> error happened and trigger the interrupt, ACPI0013 driver will > >>> notify the error device (PNP0C33), then error device driver > >>> (drivers/acpi/hed.c) will process the error data form APEI table... > >> > >> > >>>> The GHES GSIV code below is identical to the behaviour of the SCI > >>>> Notification type... are these two names for the same thing? (I'm > >>>> confused!) > >>> > >>> SCI is also an 'interrupt' but it's a special irq number for ACPI > >>> event, and it has GPE (general purpose event) registers behind it, > >>> which is used only on Intel platforms. SCI based event use > >>> Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV > is > >>> used for HW-reduced platform which has no GPEs. > >> > >>> Hope it can clarify something :) > >> > >> Yes thanks! (the mist is slowly clearing...) > >> > >> If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive > >> notification from PNP0C33', is there any point having separate lists > >> and add/remove functions for them? > >> > >> Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci > >> list to describe 'hed' instead, then group the two case statements > together? > >> There would be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as > >> SCI is already built-in and this way the code added is tiny. The > only > >> thing we would lose is the name 'GSIV' in the not-supported error > >> message which we don't need if its always supported. > > > > This method was tested ok. However we were not sure about > > reusing/changing the existing ghes_notifier_sci() for gsiv will be > accepted. > > For now a notify (SCI/GSIV/GPIO) will trigger the process of all the > ghes data even they are on different list, so add them on a single list > and process them will have the same effect. > > > Thus added a separate handling function ghes_notifier_gsiv() for gsiv. > > I think we can prepare the patch and send out for review. Ok. I will do this. > > Thanks > Hanjun
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1117421..e41fdcf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -53,6 +53,7 @@ config ARM64 select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND select HAVE_ACPI_APEI if (ACPI && EFI) + select HAVE_ACPI_APEI_GSIV if (ACPI && EFI) select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index b0140c8..f45ddb5 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig @@ -1,6 +1,15 @@ config HAVE_ACPI_APEI bool +config HAVE_ACPI_APEI_GSIV + bool "APEI GSIV(Global System Interrupt) logging/recovering support" + depends on ACPI_APEI && ACPI_APEI_GHES + help + This option should be enabled if the system supports + firmware first handling of GSIV (Global System Interrupt) + for the hardware errors and allows OS to do error + recovery/logging. + config HAVE_ACPI_APEI_NMI bool diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index e53bef6..e1611d2 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -721,6 +721,58 @@ static struct notifier_block ghes_notifier_sci = { .notifier_call = ghes_notify_sci, }; +#ifdef CONFIG_HAVE_ACPI_APEI_GSIV +static LIST_HEAD(ghes_gsiv); + +static int ghes_notify_gsiv(struct notifier_block *this, + unsigned long event, void *data) +{ + struct ghes *ghes; + int ret = NOTIFY_DONE; + + rcu_read_lock(); + list_for_each_entry_rcu(ghes, &ghes_gsiv, list) { + if (!ghes_proc(ghes)) + ret = NOTIFY_OK; + } + rcu_read_unlock(); + return ret; +} +static struct notifier_block ghes_notifier_gsiv = { + .notifier_call = ghes_notify_gsiv, +}; + +static void ghes_gsiv_add(struct ghes *ghes) +{ + mutex_lock(&ghes_list_mutex); + if (list_empty(&ghes_gsiv)) + register_acpi_hed_notifier(&ghes_notifier_gsiv); + list_add_rcu(&ghes->list, &ghes_gsiv); + mutex_unlock(&ghes_list_mutex); +} + +static void ghes_gsiv_remove(struct ghes *ghes) +{ + mutex_lock(&ghes_list_mutex); + list_del_rcu(&ghes->list); + if (list_empty(&ghes_gsiv)) + unregister_acpi_hed_notifier(&ghes_notifier_gsiv); + mutex_unlock(&ghes_list_mutex); +} +#else /* CONFIG_HAVE_ACPI_APEI_GSIV */ +static inline void ghes_gsiv_add(struct ghes *ghes) +{ + pr_err(GHES_PFX "ID: %d, trying to add GSIV notification which is not supported\n", + ghes->generic->header.source_id); +} + +static inline void ghes_gsiv_remove(struct ghes *ghes) +{ + pr_err(GHES_PFX "ID: %d, trying to remove GSIV notification which is not supported\n", + ghes->generic->header.source_id); +} +#endif /* CONFIG_HAVE_ACPI_APEI_GSIV */ + #ifdef CONFIG_HAVE_ACPI_APEI_NMI /* * printk is not safe in NMI context. So in NMI handler, we allocate @@ -973,6 +1025,14 @@ static int ghes_probe(struct platform_device *ghes_dev) goto err; } break; + case ACPI_HEST_NOTIFY_GSIV: + if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_GSIV)) { + pr_warn(GHES_PFX "Generic hardware error source: %d notified via notification GSIV is not supported\n", + generic->header.source_id); + rc = -ENOTSUPP; + goto err; + } + break; case ACPI_HEST_NOTIFY_LOCAL: pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", generic->header.source_id); @@ -1034,6 +1094,9 @@ static int ghes_probe(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_NMI: ghes_nmi_add(ghes); break; + case ACPI_HEST_NOTIFY_GSIV: + ghes_gsiv_add(ghes); + break; default: BUG(); } @@ -1076,6 +1139,9 @@ static int ghes_remove(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_NMI: ghes_nmi_remove(ghes); break; + case ACPI_HEST_NOTIFY_GSIV: + ghes_gsiv_remove(ghes); + break; default: BUG(); break;