diff mbox

[RFC,V1,v4.10-rc3,1/1] acpi: apei: handle GSIV notification type

Message ID 86258A5CC0A3704780874CF6004BA8A62DCAB57E@lhreml502-mbs (mailing list archive)
State New, archived
Headers show

Commit Message

Shiju Jose Feb. 27, 2017, 6:19 p.m. UTC
Add a new GHES error source handling function for
GSIV (Global System Interrupt Vector).
If an error source's notification type is GSIV,
then this handling function can be registered
with the GSIV handler so that it can parse
and report error information when errors occur.

Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 arch/arm64/Kconfig        |  1 +
 drivers/acpi/apei/Kconfig |  9 +++++++
 drivers/acpi/apei/ghes.c  | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+)

Comments

Paul Gortmaker Feb. 27, 2017, 6:58 p.m. UTC | #1
[[RFC PATCH V1 v4.10-rc3 1/1] acpi: apei: handle GSIV notification type] On 27/02/2017 (Mon 18:19) Shiju Jose wrote:

> Add a new GHES error source handling function for
> GSIV(Global System Interrupt Vector).
> If an error source's notification type is GSIV,
> then this handling function can be registered
> into the GSIV handler and it can parse
> and report error information when they occur.
> 
> Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
> ---
>  arch/arm64/Kconfig        |  1 +
>  drivers/acpi/apei/Kconfig |  9 +++++++
>  drivers/acpi/apei/ghes.c  | 66 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 76 insertions(+)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1117421..e41fdcf 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -53,6 +53,7 @@ config ARM64
>  	select HANDLE_DOMAIN_IRQ
>  	select HARDIRQS_SW_RESEND
>  	select HAVE_ACPI_APEI if (ACPI && EFI)
> +	select HAVE_ACPI_APEI_GSIV if (ACPI && EFI)
>  	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
>  	select HAVE_ARCH_AUDITSYSCALL
>  	select HAVE_ARCH_BITREVERSE
> diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
> index b0140c8..f45ddb5 100644
> --- a/drivers/acpi/apei/Kconfig
> +++ b/drivers/acpi/apei/Kconfig
> @@ -1,6 +1,15 @@
>  config HAVE_ACPI_APEI
>  	bool
>  
> +config HAVE_ACPI_APEI_GSIV
> +        bool "APEI GSIV(Global System Interrupt) logging/recovering support"
> +        depends on ACPI_APEI && ACPI_APEI_GHES
> +        help
> +          This option should be enabled if the system supports
> +          firmware first handling of GSIV (Global System Interrupt)
> +	  for the hardware errors and allows OS to do error
> +          recovery/logging.
> +
>  config HAVE_ACPI_APEI_NMI
>  	bool

A "config HAVE_<foo>" option normally doesn't have a "help" section
since it isn't meant to be user selected.  You can see that in the other
two options that exist right here in the context of your patch.

Also your Cc list here seems rather large; not sure how you came up with
such a large list...

P.
--

>  
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index e53bef6..e1611d2 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -721,6 +721,58 @@ static struct notifier_block ghes_notifier_sci = {
>  	.notifier_call = ghes_notify_sci,
>  };
>  
> +#ifdef CONFIG_HAVE_ACPI_APEI_GSIV
> +static LIST_HEAD(ghes_gsiv);
> +
> +static int ghes_notify_gsiv(struct notifier_block *this,
> +				unsigned long event, void *data)
> +{
> +	struct ghes *ghes;
> +	int ret = NOTIFY_DONE;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ghes, &ghes_gsiv, list) {
> +	if (!ghes_proc(ghes))
> +		ret = NOTIFY_OK;
> +	}
> +	rcu_read_unlock();
> +	return ret;
> +}
> +static struct notifier_block ghes_notifier_gsiv = {
> +	.notifier_call = ghes_notify_gsiv,
> +};
> +
> +static void ghes_gsiv_add(struct ghes *ghes)
> +{
> +	mutex_lock(&ghes_list_mutex);
> +	if (list_empty(&ghes_gsiv))
> +		register_acpi_hed_notifier(&ghes_notifier_gsiv);
> +	list_add_rcu(&ghes->list, &ghes_gsiv);
> +	mutex_unlock(&ghes_list_mutex);
> +}
> +
> +static void ghes_gsiv_remove(struct ghes *ghes)
> +{
> +	mutex_lock(&ghes_list_mutex);
> +	list_del_rcu(&ghes->list);
> +	if (list_empty(&ghes_gsiv))
> +		unregister_acpi_hed_notifier(&ghes_notifier_gsiv);
> +	mutex_unlock(&ghes_list_mutex);
> +}
> +#else /* CONFIG_HAVE_ACPI_APEI_GSIV */
> +static inline void ghes_gsiv_add(struct ghes *ghes)
> +{
> +	pr_err(GHES_PFX "ID: %d, trying to add GSIV notification which is not supported\n",
> +	       ghes->generic->header.source_id);
> +}
> +
> +static inline void ghes_gsiv_remove(struct ghes *ghes)
> +{
> +	pr_err(GHES_PFX "ID: %d, trying to remove GSIV notification which is not supported\n",
> +	       ghes->generic->header.source_id);
> +}
> +#endif /* CONFIG_HAVE_ACPI_APEI_GSIV */
> +
>  #ifdef CONFIG_HAVE_ACPI_APEI_NMI
>  /*
>   * printk is not safe in NMI context.  So in NMI handler, we allocate
> @@ -973,6 +1025,14 @@ static int ghes_probe(struct platform_device *ghes_dev)
>  			goto err;
>  		}
>  		break;
> +	case ACPI_HEST_NOTIFY_GSIV:
> +		if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_GSIV)) {
> +			pr_warn(GHES_PFX "Generic hardware error source: %d notified via notification GSIV is not supported\n",
> +				generic->header.source_id);
> +			rc = -ENOTSUPP;
> +			goto err;
> +		}
> +		break;
>  	case ACPI_HEST_NOTIFY_LOCAL:
>  		pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n",
>  			   generic->header.source_id);
> @@ -1034,6 +1094,9 @@ static int ghes_probe(struct platform_device *ghes_dev)
>  	case ACPI_HEST_NOTIFY_NMI:
>  		ghes_nmi_add(ghes);
>  		break;
> +	case ACPI_HEST_NOTIFY_GSIV:
> +		ghes_gsiv_add(ghes);
> +		break;
>  	default:
>  		BUG();
>  	}
> @@ -1076,6 +1139,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
>  	case ACPI_HEST_NOTIFY_NMI:
>  		ghes_nmi_remove(ghes);
>  		break;
> +	case ACPI_HEST_NOTIFY_GSIV:
> +		ghes_gsiv_remove(ghes);
> +		break;
>  	default:
>  		BUG();
>  		break;
> -- 
> 2.1.4
>
Steven Rostedt Feb. 27, 2017, 8:39 p.m. UTC | #2
On Mon, 27 Feb 2017 13:58:19 -0500
Paul Gortmaker <paul.gortmaker@windriver.com> wrote:


> > --- a/drivers/acpi/apei/Kconfig
> > +++ b/drivers/acpi/apei/Kconfig
> > @@ -1,6 +1,15 @@
> >  config HAVE_ACPI_APEI
> >  	bool
> >  
> > +config HAVE_ACPI_APEI_GSIV
> > +        bool "APEI GSIV(Global System Interrupt) logging/recovering support"
> > +        depends on ACPI_APEI && ACPI_APEI_GHES
> > +        help
> > +          This option should be enabled if the system supports
> > +          firmware first handling of GSIV (Global System Interrupt)
> > +	  for the hardware errors and allows OS to do error
> > +          recovery/logging.
> > +
> >  config HAVE_ACPI_APEI_NMI
> >  	bool  
> 
> A "config HAVE_<foo>" option normally doesn't have a "help" section
> since it isn't meant to be user selected.  You can see that in the other
> two options that exist right here in the context of your patch.

I will argue that having "help" messages even for configs that do not
get selected is useful. It helps reviewers of the code to know why one
of these events should be enabled or not.


> 
> Also your Cc list here seems rather large; not sure how you came up with
> such a large list...

+1

-- Steve
James Morse Feb. 28, 2017, 1:22 p.m. UTC | #3
Hi Shiju,

On 27/02/17 18:19, Shiju Jose wrote:
> Add a new GHES error source handling function for
> GSIV(Global System Interrupt Vector).
> If an error source's notification type is GSIV,
> then this handling function can be registered
> into the GSIV handler and it can parse
> and report error information when they occur.

I'm missing some of the story here, but how is GSIV different from 'External
Interrupt'? I'm guessing something other than the CPU takes this 'interrupt'...


The GHES GSIV code below is identical to the behaviour of the SCI Notification
type... are these two names for the same thing? (I'm confused!)


Thanks,

James


> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index e53bef6..e1611d2 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -721,6 +721,58 @@ static struct notifier_block ghes_notifier_sci = {
>  	.notifier_call = ghes_notify_sci,
>  };
>  
> +#ifdef CONFIG_HAVE_ACPI_APEI_GSIV
> +static LIST_HEAD(ghes_gsiv);
> +
> +static int ghes_notify_gsiv(struct notifier_block *this,
> +				unsigned long event, void *data)
> +{
> +	struct ghes *ghes;
> +	int ret = NOTIFY_DONE;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ghes, &ghes_gsiv, list) {
> +	if (!ghes_proc(ghes))
> +		ret = NOTIFY_OK;
> +	}
> +	rcu_read_unlock();
> +	return ret;
> +}
> +static struct notifier_block ghes_notifier_gsiv = {
> +	.notifier_call = ghes_notify_gsiv,
> +};
> +
> +static void ghes_gsiv_add(struct ghes *ghes)
> +{
> +	mutex_lock(&ghes_list_mutex);
> +	if (list_empty(&ghes_gsiv))
> +		register_acpi_hed_notifier(&ghes_notifier_gsiv);
> +	list_add_rcu(&ghes->list, &ghes_gsiv);
> +	mutex_unlock(&ghes_list_mutex);
> +}
> +
> +static void ghes_gsiv_remove(struct ghes *ghes)
> +{
> +	mutex_lock(&ghes_list_mutex);
> +	list_del_rcu(&ghes->list);
> +	if (list_empty(&ghes_gsiv))
> +		unregister_acpi_hed_notifier(&ghes_notifier_gsiv);
> +	mutex_unlock(&ghes_list_mutex);
> +}
> +#else /* CONFIG_HAVE_ACPI_APEI_GSIV */
> +static inline void ghes_gsiv_add(struct ghes *ghes)
> +{
> +	pr_err(GHES_PFX "ID: %d, trying to add GSIV notification which is not supported\n",
> +	       ghes->generic->header.source_id);
> +}
> +
> +static inline void ghes_gsiv_remove(struct ghes *ghes)
> +{
> +	pr_err(GHES_PFX "ID: %d, trying to remove GSIV notification which is not supported\n",
> +	       ghes->generic->header.source_id);
> +}
> +#endif /* CONFIG_HAVE_ACPI_APEI_GSIV */
> +
>  #ifdef CONFIG_HAVE_ACPI_APEI_NMI
>  /*
>   * printk is not safe in NMI context.  So in NMI handler, we allocate
> @@ -973,6 +1025,14 @@ static int ghes_probe(struct platform_device *ghes_dev)
>  			goto err;
>  		}
>  		break;
> +	case ACPI_HEST_NOTIFY_GSIV:
> +		if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_GSIV)) {
> +			pr_warn(GHES_PFX "Generic hardware error source: %d notified via notification GSIV is not supported\n",
> +				generic->header.source_id);
> +			rc = -ENOTSUPP;
> +			goto err;
> +		}
> +		break;
>  	case ACPI_HEST_NOTIFY_LOCAL:
>  		pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n",
>  			   generic->header.source_id);
> @@ -1034,6 +1094,9 @@ static int ghes_probe(struct platform_device *ghes_dev)
>  	case ACPI_HEST_NOTIFY_NMI:
>  		ghes_nmi_add(ghes);
>  		break;
> +	case ACPI_HEST_NOTIFY_GSIV:
> +		ghes_gsiv_add(ghes);
> +		break;
>  	default:
>  		BUG();
>  	}
> @@ -1076,6 +1139,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
>  	case ACPI_HEST_NOTIFY_NMI:
>  		ghes_nmi_remove(ghes);
>  		break;
> +	case ACPI_HEST_NOTIFY_GSIV:
> +		ghes_gsiv_remove(ghes);
> +		break;
>  	default:
>  		BUG();
>  		break;
>
Hanjun Guo March 1, 2017, 8:27 a.m. UTC | #4
Hi James,

On 2017/2/28 21:22, James Morse wrote:
> Hi Shiju,
>
> On 27/02/17 18:19, Shiju Jose wrote:
>> Add a new GHES error source handling function for
>> GSIV(Global System Interrupt Vector).
>> If an error source's notification type is GSIV,
>> then this handling function can be registered
>> into the GSIV handler and it can parse
>> and report error information when they occur.
>
> I'm missing some of the story here, but how is GSIV different from 'External
> Interrupt'? I'm guessing something other than the CPU takes this 'interrupt'...

Yes, it's the same from CPU side (they are interrupts!), but there
is history behind them and the usage is different.

I think External Interrupt was introduced before ACPI was available on
ARM (hardware reduced platforms), so I guess it was used for errors
reported to the OS which were not using the SCI mechanism, for example,
some IO error reporting.

For External Interrupt, we don't use the ACPI event system, so the
firmware just reports the errors associated with the interrupt
number, and the kernel maps the interrupt number and installs the
irq handler for it.

The GSIV-based event was introduced for hardware-reduced platforms
in a recent ACPI revision, and ARM64 is one of its consumers. When
errors are reported via GSIV, ACPI event notification needs to be
implemented and requires the platform to define a hardware error device
(PNP0C33) in the ACPI namespace, and also a generic event device ACPI0013.

For example, if we are using SPI (ARM GIC) 100 to report errors, there
is an ACPI0013 driver in drivers/acpi/evged.c to register the irq; when
an error happens and triggers the interrupt, the ACPI0013 driver will
notify the error device (PNP0C33), then the error device driver
(drivers/acpi/hed.c) will process the error data from the APEI table...

So GSIV is quite different from 'External interrupt' in the way of
working from both firmware and kernel side.

>
>
> The GHES GSIV code below is identical to the behaviour of the SCI Notification
> type... are these two names for the same thing? (I'm confused!)

SCI is also an 'interrupt' but it's a special irq number for ACPI
events, and it has GPE (general purpose event) registers behind it,
which is used only on Intel platforms. SCI-based events use
Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV
is used for HW-reduced platforms, which have no GPEs.

Hope it can clarify something :)

Thanks
Hanjun
James Morse March 1, 2017, 5:04 p.m. UTC | #5
Hi Hanjun,

On 01/03/17 08:27, Hanjun Guo wrote:
> On 2017/2/28 21:22, James Morse wrote:
>> On 27/02/17 18:19, Shiju Jose wrote:
>>> Add a new GHES error source handling function for
>>> GSIV(Global System Interrupt Vector).
>>> If an error source's notification type is GSIV,
>>> then this handling function can be registered
>>> into the GSIV handler and it can parse
>>> and report error information when they occur.
>>
>> I'm missing some of the story here, but how is GSIV different from 'External
>> Interrupt'? I'm guessing something other than the CPU takes this 'interrupt'...
> 
> Yes, it's the same from CPU side (they are interrupts!), but there
> is history behind them and the usage is different.
> 
> I think External Interrupt was introduced before ACPI is available on
> ARM (hardware reduced platforms), so I guess it was used for errors
> reported to OS which were not using SCI mechanism, for example, some
> IO error reporting.
> 
> For External Interrupt, we don't use the ACPI event system, so for
> the firmware, it just report the errors associate with the interrupt
> number, the kernel map the interrupt number and install the
> irq handler for it.
> 
> For GSIV based event, it was introduced for hardware reduced platform
> in recent ACPI revision, and ARM64 is one of its consumers. When
> errors are reported via GSIV, ACPI event notification needs to be
> implemented and requires the platform to define a hardware error device
> (PNP0C33) in ACPI namespace, and also a generic event device ACPI0013.

Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI' means the same
but the route they take to get into APEI is different.


> For example, if we are using SPI (ARM GIC) 100 to report errors, there
> is a ACPI0013 driver in drivers/acpi/evged.c to register the irq, when

Aha, this is where the interrupt-magic happens.


> error happened and trigger the interrupt, ACPI0013 driver will
> notify the error device (PNP0C33), then error device driver
> (drivers/acpi/hed.c) will process the error data form APEI table...


>> The GHES GSIV code below is identical to the behaviour of the SCI Notification
>> type... are these two names for the same thing? (I'm confused!)
> 
> SCI is also an 'interrupt' but it's a special irq number for ACPI
> event, and it has GPE (general purpose event) registers behind it,
> which is used only on Intel platforms. SCI based event use
> Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV
> is used for HW-reduced platform which has no GPEs.

> Hope it can clarify something :)

Yes thanks! (the mist is slowly clearing...)

If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive
notification from PNP0C33', is there any point having separate lists and
add/remove functions for them?

Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci list to
describe 'hed' instead, then group the two case statements together? There would
be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as SCI is already built-in
and this way the code added is tiny. The only thing we would lose is the name
'GSIV' in the not-supported error message which we don't need if it's always
supported.


Thanks,

James
Shiju Jose March 2, 2017, 1:45 p.m. UTC | #6
Hi James,

> -----Original Message-----
> From: James Morse [mailto:james.morse@arm.com]
> Sent: 01 March 2017 17:04
> To: Hanjun Guo; Shiju Jose
> Cc: christoffer.dall@linaro.org; marc.zyngier@arm.com;
> pbonzini@redhat.com; rkrcmar@redhat.com; linux@armlinux.org.uk;
> catalin.marinas@arm.com; will.deacon@arm.com; rjw@rjwysocki.net;
> lenb@kernel.org; matt@codeblueprint.co.uk; robert.moore@intel.com;
> lv.zheng@intel.com; nkaje@codeaurora.org; zjzhang@codeaurora.org;
> mark.rutland@arm.com; akpm@linux-foundation.org;
> eun.taik.lee@samsung.com; sandeepa.s.prabhu@gmail.com;
> labbott@redhat.com; shijie.huang@arm.com; rruigrok@codeaurora.org;
> paul.gortmaker@windriver.com; tn@semihalf.com; fu.wei@linaro.org;
> rostedt@goodmis.org; bristot@redhat.com; linux-arm-
> kernel@lists.infradead.org; kvmarm@lists.cs.columbia.edu;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
> acpi@vger.kernel.org; linux-efi@vger.kernel.org; devel@acpica.org;
> Suzuki.Poulose@arm.com; punit.agrawal@arm.com; astone@redhat.com;
> harba@codeaurora.org; Tyler Baicar; joe@perches.com; John Garry;
> Gabriele Paoloni; Guohanjun (Hanjun Guo); wangxiongfeng (C); Zhengqiang
> (turing)
> Subject: Re: [RFC PATCH V1 v4.10-rc3 1/1] acpi: apei: handle GSIV
> notification type
> 
> Hi Hanjun,
> 
> On 01/03/17 08:27, Hanjun Guo wrote:
> > On 2017/2/28 21:22, James Morse wrote:
> >> On 27/02/17 18:19, Shiju Jose wrote:
> >>> Add a new GHES error source handling function for GSIV(Global
> System
> >>> Interrupt Vector).
> >>> If an error source's notification type is GSIV, then this handling
> >>> function can be registered into the GSIV handler and it can parse
> >>> and report error information when they occur.
> >>
> >> I'm missing some of the story here, but how is GSIV different from
> >> 'External Interrupt'? I'm guessing something other than the CPU
> takes this 'interrupt'...
> >
> > Yes, it's the same from CPU side (they are interrupts!), but there is
> > history behind them and the usage is different.
> >
> > I think External Interrupt was introduced before ACPI is available on
> > ARM (hardware reduced platforms), so I guess it was used for errors
> > reported to OS which were not using SCI mechanism, for example, some
> > IO error reporting.
> >
> > For External Interrupt, we don't use the ACPI event system, so for
> the
> > firmware, it just report the errors associate with the interrupt
> > number, the kernel map the interrupt number and install the irq
> > handler for it.
> >
> > For GSIV based event, it was introduced for hardware reduced platform
> > in recent ACPI revision, and ARM64 is one of its consumers. When
> > errors are reported via GSIV, ACPI event notification needs to be
> > implemented and requires the platform to define a hardware error
> > device
> > (PNP0C33) in ACPI namespace, and also a generic event device ACPI0013.
> 
> Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI' means
> the same but the route they take to get into APEI is different.
> 
> 
> > For example, if we are using SPI (ARM GIC) 100 to report errors,
> there
> > is a ACPI0013 driver in drivers/acpi/evged.c to register the irq,
> when
> 
> Aha, this is where the interrupt-magic happens.
> 
> 
> > error happened and trigger the interrupt, ACPI0013 driver will notify
> > the error device (PNP0C33), then error device driver
> > (drivers/acpi/hed.c) will process the error data form APEI table...
> 
> 
> >> The GHES GSIV code below is identical to the behaviour of the SCI
> >> Notification type... are these two names for the same thing? (I'm
> >> confused!)
> >
> > SCI is also an 'interrupt' but it's a special irq number for ACPI
> > event, and it has GPE (general purpose event) registers behind it,
> > which is used only on Intel platforms. SCI based event use
> > Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV is
> > used for HW-reduced platform which has no GPEs.
> 
> > Hope it can clarify something :)
> 
> Yes thanks! (the mist is slowly clearing...)
> 
> If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive
> notification from PNP0C33', is there any point having separate lists
> and add/remove functions for them?
> 
> Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci list
> to describe 'hed' instead, then group the two case statements together?
> There would be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as
> SCI is already built-in and this way the code added is tiny. The only
> thing we would lose is the name 'GSIV' in the not-supported error
> message which we don't need if its always supported.

This method was tested and works OK. However, we were not sure whether
reusing/changing the existing ghes_notifier_sci() for GSIV would be accepted.
Thus we added a separate handling function, ghes_notifier_gsiv(), for GSIV.

Thanks,
Shiju
> 
> 
> Thanks,
> 
> James
Hanjun Guo March 3, 2017, 4:20 a.m. UTC | #7
On 2017/3/2 21:45, Shiju Jose wrote:
> Hi James,
>
>>
>> Hi Hanjun,
>>
>> On 01/03/17 08:27, Hanjun Guo wrote:
>>> On 2017/2/28 21:22, James Morse wrote:
>>>> On 27/02/17 18:19, Shiju Jose wrote:
>>>>> Add a new GHES error source handling function for GSIV(Global
>> System
>>>>> Interrupt Vector).
>>>>> If an error source's notification type is GSIV, then this handling
>>>>> function can be registered into the GSIV handler and it can parse
>>>>> and report error information when they occur.
>>>>
>>>> I'm missing some of the story here, but how is GSIV different from
>>>> 'External Interrupt'? I'm guessing something other than the CPU
>> takes this 'interrupt'...
>>>
>>> Yes, it's the same from CPU side (they are interrupts!), but there is
>>> history behind them and the usage is different.
>>>
>>> I think External Interrupt was introduced before ACPI is available on
>>> ARM (hardware reduced platforms), so I guess it was used for errors
>>> reported to OS which were not using SCI mechanism, for example, some
>>> IO error reporting.
>>>
>>> For External Interrupt, we don't use the ACPI event system, so for
>> the
>>> firmware, it just report the errors associate with the interrupt
>>> number, the kernel map the interrupt number and install the irq
>>> handler for it.
>>>
>>> For GSIV based event, it was introduced for hardware reduced platform
>>> in recent ACPI revision, and ARM64 is one of its consumers. When
>>> errors are reported via GSIV, ACPI event notification needs to be
>>> implemented and requires the platform to define a hardware error
>>> device
>>> (PNP0C33) in ACPI namespace, and also a generic event device ACPI0013.
>>
>> Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI' means
>> the same but the route they take to get into APEI is different.
>>
>>
>>> For example, if we are using SPI (ARM GIC) 100 to report errors,
>> there
>>> is a ACPI0013 driver in drivers/acpi/evged.c to register the irq,
>> when
>>
>> Aha, this is where the interrupt-magic happens.
>>
>>
>>> error happened and trigger the interrupt, ACPI0013 driver will notify
>>> the error device (PNP0C33), then error device driver
>>> (drivers/acpi/hed.c) will process the error data form APEI table...
>>
>>
>>>> The GHES GSIV code below is identical to the behaviour of the SCI
>>>> Notification type... are these two names for the same thing? (I'm
>>>> confused!)
>>>
>>> SCI is also an 'interrupt' but it's a special irq number for ACPI
>>> event, and it has GPE (general purpose event) registers behind it,
>>> which is used only on Intel platforms. SCI based event use
>>> Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV is
>>> used for HW-reduced platform which has no GPEs.
>>
>>> Hope it can clarify something :)
>>
>> Yes thanks! (the mist is slowly clearing...)
>>
>> If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive
>> notification from PNP0C33', is there any point having separate lists
>> and add/remove functions for them?
>>
>> Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci list
>> to describe 'hed' instead, then group the two case statements together?
>> There would be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as
>> SCI is already built-in and this way the code added is tiny. The only
>> thing we would lose is the name 'GSIV' in the not-supported error
>> message which we don't need if its always supported.
>
> This method was tested ok. However we were not sure about reusing/changing the
> existing ghes_notifier_sci() for gsiv will be accepted.

For now a notify (SCI/GSIV/GPIO) will trigger the processing of all the
ghes data even if they are on different lists, so adding them to a single
list and processing them will have the same effect.

> Thus added a separate handling function ghes_notifier_gsiv() for gsiv.

I think we can prepare the patch and send out for review.

Thanks
Hanjun
Shiju Jose March 3, 2017, 12:48 p.m. UTC | #8
Hi Hanjun,

> -----Original Message-----
> From: Hanjun Guo [mailto:hanjun.guo@linaro.org]
> Sent: 03 March 2017 04:20
> To: Shiju Jose; James Morse
> Cc: christoffer.dall@linaro.org; marc.zyngier@arm.com;
> pbonzini@redhat.com; rkrcmar@redhat.com; linux@armlinux.org.uk;
> catalin.marinas@arm.com; will.deacon@arm.com; rjw@rjwysocki.net;
> lenb@kernel.org; matt@codeblueprint.co.uk; robert.moore@intel.com;
> lv.zheng@intel.com; nkaje@codeaurora.org; zjzhang@codeaurora.org;
> mark.rutland@arm.com; akpm@linux-foundation.org;
> eun.taik.lee@samsung.com; sandeepa.s.prabhu@gmail.com;
> labbott@redhat.com; shijie.huang@arm.com; rruigrok@codeaurora.org;
> paul.gortmaker@windriver.com; tn@semihalf.com; fu.wei@linaro.org;
> rostedt@goodmis.org; bristot@redhat.com; linux-arm-
> kernel@lists.infradead.org; kvmarm@lists.cs.columbia.edu;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
> acpi@vger.kernel.org; linux-efi@vger.kernel.org; devel@acpica.org;
> Suzuki.Poulose@arm.com; punit.agrawal@arm.com; astone@redhat.com;
> harba@codeaurora.org; Tyler Baicar; joe@perches.com; John Garry;
> Gabriele Paoloni; Guohanjun (Hanjun Guo); wangxiongfeng (C); Zhengqiang
> (turing)
> Subject: Re: [RFC PATCH V1 v4.10-rc3 1/1] acpi: apei: handle GSIV
> notification type
> 
> On 2017/3/2 21:45, Shiju Jose wrote:
> > Hi James,
> >
> >>
> >> Hi Hanjun,
> >>
> >> On 01/03/17 08:27, Hanjun Guo wrote:
> >>> On 2017/2/28 21:22, James Morse wrote:
> >>>> On 27/02/17 18:19, Shiju Jose wrote:
> >>>>> Add a new GHES error source handling function for GSIV(Global
> >> System
> >>>>> Interrupt Vector).
> >>>>> If an error source's notification type is GSIV, then this
> handling
> >>>>> function can be registered into the GSIV handler and it can parse
> >>>>> and report error information when they occur.
> >>>>
> >>>> I'm missing some of the story here, but how is GSIV different from
> >>>> 'External Interrupt'? I'm guessing something other than the CPU
> >> takes this 'interrupt'...
> >>>
> >>> Yes, it's the same from CPU side (they are interrupts!), but there
> >>> is history behind them and the usage is different.
> >>>
> >>> I think External Interrupt was introduced before ACPI is available
> >>> on ARM (hardware reduced platforms), so I guess it was used for
> >>> errors reported to OS which were not using SCI mechanism, for
> >>> example, some IO error reporting.
> >>>
> >>> For External Interrupt, we don't use the ACPI event system, so for
> >> the
> >>> firmware, it just report the errors associate with the interrupt
> >>> number, the kernel map the interrupt number and install the irq
> >>> handler for it.
> >>>
> >>> For GSIV based event, it was introduced for hardware reduced
> >>> platform in recent ACPI revision, and ARM64 is one of its consumers.
> >>> When errors are reported via GSIV, ACPI event notification needs to
> >>> be implemented and requires the platform to define a hardware error
> >>> device
> >>> (PNP0C33) in ACPI namespace, and also a generic event device
> ACPI0013.
> >>
> >> Okay, so for APEI this really means PNP0C33 was Notify()d. 'SCI'
> >> means the same but the route they take to get into APEI is different.
> >>
> >>
> >>> For example, if we are using SPI (ARM GIC) 100 to report errors,
> >> there
> >>> is a ACPI0013 driver in drivers/acpi/evged.c to register the irq,
> >> when
> >>
> >> Aha, this is where the interrupt-magic happens.
> >>
> >>
> >>> error happened and trigger the interrupt, ACPI0013 driver will
> >>> notify the error device (PNP0C33), then error device driver
> >>> (drivers/acpi/hed.c) will process the error data form APEI table...
> >>
> >>
> >>>> The GHES GSIV code below is identical to the behaviour of the SCI
> >>>> Notification type... are these two names for the same thing? (I'm
> >>>> confused!)
> >>>
> >>> SCI is also an 'interrupt' but it's a special irq number for ACPI
> >>> event, and it has GPE (general purpose event) registers behind it,
> >>> which is used only on Intel platforms. SCI based event use
> >>> Method(\_GPE._L0x) to notify the error device (PNP0C33), but GSIV
> is
> >>> used for HW-reduced platform which has no GPEs.
> >>
> >>> Hope it can clarify something :)
> >>
> >> Yes thanks! (the mist is slowly clearing...)
> >>
> >> If ACPI_HEST_NOTIFY_SCI and ACPI_HEST_NOTIFY_GSIV both mean 'receive
> >> notification from PNP0C33', is there any point having separate lists
> >> and add/remove functions for them?
> >>
> >> Instead, could we rename Linux's ghes_notifier_sci() and ghes_sci
> >> list to describe 'hed' instead, then group the two case statements
> together?
> >> There would be no need to add a selectable CONFIG_ACPI_APEI_GSIV, as
> >> SCI is already built-in and this way the code added is tiny. The
> only
> >> thing we would lose is the name 'GSIV' in the not-supported error
> >> message which we don't need if its always supported.
> >
> > This method was tested ok. However we were not sure about
> > reusing/changing the existing ghes_notifier_sci() for gsiv will be
> accepted.
> 
> For now a notify (SCI/GSIV/GPIO) will trigger the process of all the
> ghes data even they are on different list, so add them on a single list
> and process them will have the same effect.
> 
> > Thus added a separate handling function ghes_notifier_gsiv() for gsiv.
> 
> I think we can prepare the patch and send out for review.
Ok. I will do this.
> 
> Thanks
> Hanjun
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1117421..e41fdcf 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -53,6 +53,7 @@  config ARM64
 	select HANDLE_DOMAIN_IRQ
 	select HARDIRQS_SW_RESEND
 	select HAVE_ACPI_APEI if (ACPI && EFI)
+	select HAVE_ACPI_APEI_GSIV if (ACPI && EFI)
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_BITREVERSE
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index b0140c8..f45ddb5 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -1,6 +1,15 @@ 
 config HAVE_ACPI_APEI
 	bool
 
+config HAVE_ACPI_APEI_GSIV
+        bool "APEI GSIV(Global System Interrupt) logging/recovering support"
+        depends on ACPI_APEI && ACPI_APEI_GHES
+        help
+          This option should be enabled if the system supports
+          firmware first handling of GSIV (Global System Interrupt)
+	  for the hardware errors and allows OS to do error
+          recovery/logging.
+
 config HAVE_ACPI_APEI_NMI
 	bool
 
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e53bef6..e1611d2 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -721,6 +721,58 @@  static struct notifier_block ghes_notifier_sci = {
 	.notifier_call = ghes_notify_sci,
 };
 
+#ifdef CONFIG_HAVE_ACPI_APEI_GSIV
+static LIST_HEAD(ghes_gsiv);
+
+static int ghes_notify_gsiv(struct notifier_block *this,
+				unsigned long event, void *data)
+{
+	struct ghes *ghes;
+	int ret = NOTIFY_DONE;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ghes, &ghes_gsiv, list) {
+	if (!ghes_proc(ghes))
+		ret = NOTIFY_OK;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+static struct notifier_block ghes_notifier_gsiv = {
+	.notifier_call = ghes_notify_gsiv,
+};
+
+static void ghes_gsiv_add(struct ghes *ghes)
+{
+	mutex_lock(&ghes_list_mutex);
+	if (list_empty(&ghes_gsiv))
+		register_acpi_hed_notifier(&ghes_notifier_gsiv);
+	list_add_rcu(&ghes->list, &ghes_gsiv);
+	mutex_unlock(&ghes_list_mutex);
+}
+
+static void ghes_gsiv_remove(struct ghes *ghes)
+{
+	mutex_lock(&ghes_list_mutex);
+	list_del_rcu(&ghes->list);
+	if (list_empty(&ghes_gsiv))
+		unregister_acpi_hed_notifier(&ghes_notifier_gsiv);
+	mutex_unlock(&ghes_list_mutex);
+}
+#else /* CONFIG_HAVE_ACPI_APEI_GSIV */
+static inline void ghes_gsiv_add(struct ghes *ghes)
+{
+	pr_err(GHES_PFX "ID: %d, trying to add GSIV notification which is not supported\n",
+	       ghes->generic->header.source_id);
+}
+
+static inline void ghes_gsiv_remove(struct ghes *ghes)
+{
+	pr_err(GHES_PFX "ID: %d, trying to remove GSIV notification which is not supported\n",
+	       ghes->generic->header.source_id);
+}
+#endif /* CONFIG_HAVE_ACPI_APEI_GSIV */
+
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 /*
  * printk is not safe in NMI context.  So in NMI handler, we allocate
@@ -973,6 +1025,14 @@  static int ghes_probe(struct platform_device *ghes_dev)
 			goto err;
 		}
 		break;
+	case ACPI_HEST_NOTIFY_GSIV:
+		if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_GSIV)) {
+			pr_warn(GHES_PFX "Generic hardware error source: %d notified via notification GSIV is not supported\n",
+				generic->header.source_id);
+			rc = -ENOTSUPP;
+			goto err;
+		}
+		break;
 	case ACPI_HEST_NOTIFY_LOCAL:
 		pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n",
 			   generic->header.source_id);
@@ -1034,6 +1094,9 @@  static int ghes_probe(struct platform_device *ghes_dev)
 	case ACPI_HEST_NOTIFY_NMI:
 		ghes_nmi_add(ghes);
 		break;
+	case ACPI_HEST_NOTIFY_GSIV:
+		ghes_gsiv_add(ghes);
+		break;
 	default:
 		BUG();
 	}
@@ -1076,6 +1139,9 @@  static int ghes_remove(struct platform_device *ghes_dev)
 	case ACPI_HEST_NOTIFY_NMI:
 		ghes_nmi_remove(ghes);
 		break;
+	case ACPI_HEST_NOTIFY_GSIV:
+		ghes_gsiv_remove(ghes);
+		break;
 	default:
 		BUG();
 		break;