diff mbox

[02/11] ACPI / APEI: Generalise the estatus queue's add/remove and notify code

Message ID 20180215185606.26736-3-james.morse@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

James Morse Feb. 15, 2018, 6:55 p.m. UTC
To support asynchronous NMI-like notifications on arm64 we need to use
the estatus-queue. These patches refactor it to allow multiple APEI
notification types to use it.

Refactor the estatus queue's pool grow/shrink code and notification
routine from NOTIFY_NMI's handlers. This will allow another notification
method to use the estatus queue without duplicating this code.

This patch adds rcu_read_lock()/rcu_read_unlock() around the list
list_for_each_entry_rcu() walker. These aren't strictly necessary as
the whole nmi_enter/nmi_exit() window is a spooky RCU read-side
critical section.

Keep the oops_begin() call for x86, arm64 doesn't have one of these,
and APEI is the only thing outside arch code calling this..

The existing ghes_estatus_pool_shrink() is folded into the new
ghes_estatus_queue_shrink_pool() as only the queue uses it.

_in_nmi_notify_one() is separate from the rcu-list walker for a later
caller that doesn't need to walk a list.

Signed-off-by: James Morse <james.morse@arm.com>
---
 drivers/acpi/apei/ghes.c | 103 +++++++++++++++++++++++++++++++----------------
 1 file changed, 68 insertions(+), 35 deletions(-)

Comments

Punit Agrawal Feb. 20, 2018, 6:26 p.m. UTC | #1
A few of typos and comments below.

James Morse <james.morse@arm.com> writes:

> To support asynchronous NMI-like notifications on arm64 we need to use
> the estatus-queue. These patches refactor it to allow multiple APEI
> notification types to use it.
>
> Refactor the estatus queue's pool grow/shrink code and notification
> routine from NOTIFY_NMI's handlers. This will allow another notification
> method to use the estatus queue without duplicating this code.
>
> This patch adds rcu_read_lock()/rcu_read_unlock() around the list
> list_for_each_entry_rcu() walker. These aren't strictly necessary as
> the whole nmi_enter/nmi_exit() window is a spooky RCU read-side
> critical section.
>
> Keep the oops_begin() call for x86, arm64 doesn't have one of these,
> and APEI is the only thing outside arch code calling this..
>
> The existing ghes_estatus_pool_shrink() is folded into the new
> ghes_estatus_queue_shrink_pool() as only the queue uses it.
>
> _in_nmi_notify_one() is separate from the rcu-list walker for a later
> caller that doesn't need to walk a list.
>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>  drivers/acpi/apei/ghes.c | 103 +++++++++++++++++++++++++++++++----------------
>  1 file changed, 68 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index e42b587c509b..d3cc5bd5b496 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -749,6 +749,54 @@ static void __process_error(struct ghes *ghes)
>  #endif
>  }
>  
> +static int _in_nmi_notify_one(struct ghes *ghes)
> +{
> +	int sev;
> +	int ret = -ENOENT;

If ret is initialised to 0 ...

> +
> +	if (ghes_read_estatus(ghes, 1)) {
> +		ghes_clear_estatus(ghes);
> +		return ret;

and return -ENOENT here...

> +	} else {
> +		ret = 0;
> +	}

... then the else block can be dropped.

> +
> +	sev = ghes_severity(ghes->estatus->error_severity);
> +	if (sev >= GHES_SEV_PANIC) {
> +#ifdef CONFIG_X86
> +		oops_begin();
> +#endif

Can you use IS_ENABLED() here as well?

> +		ghes_print_queued_estatus();
> +		__ghes_panic(ghes);
> +	}
> +
> +	if (!(ghes->flags & GHES_TO_CLEAR))
> +		return ret;
> +
> +	__process_error(ghes);
> +	ghes_clear_estatus(ghes);
> +
> +	return ret;
> +}
> +
> +static int ghes_estatus_queue_notified(struct list_head *rcu_list)
> +{
> +	int ret = -ENOENT;
> +	struct ghes *ghes;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ghes, rcu_list, list) {
> +		if (!_in_nmi_notify_one(ghes))
> +			ret = 0;
> +	}
> +	rcu_read_unlock();
> +
> +	if (IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && ret == 0)
> +		irq_work_queue(&ghes_proc_irq_work);
> +
> +	return ret;
> +}
> +
>  static unsigned long ghes_esource_prealloc_size(
>  	const struct acpi_hest_generic *generic)
>  {
> @@ -764,11 +812,24 @@ static unsigned long ghes_esource_prealloc_size(
>  	return prealloc_size;
>  }
>  
> -static void ghes_estatus_pool_shrink(unsigned long len)
> +/* After removing a queue user, we can shrink to pool */
                                                 ^
                                                 the

Thanks,
Punit

> +static void ghes_estatus_queue_shrink_pool(struct ghes *ghes)
>  {
> +	unsigned long len;
> +
> +	len = ghes_esource_prealloc_size(ghes->generic);
>  	ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
>  }
>  

[...]
James Morse Feb. 23, 2018, 6:21 p.m. UTC | #2
Hi Punit,

On 20/02/18 18:26, Punit Agrawal wrote:
> James Morse <james.morse@arm.com> writes:
> 
>> To support asynchronous NMI-like notifications on arm64 we need to use
>> the estatus-queue. These patches refactor it to allow multiple APEI
>> notification types to use it.
>>
>> Refactor the estatus queue's pool grow/shrink code and notification
>> routine from NOTIFY_NMI's handlers. This will allow another notification
>> method to use the estatus queue without duplicating this code.
>>
>> This patch adds rcu_read_lock()/rcu_read_unlock() around the list
>> list_for_each_entry_rcu() walker. These aren't strictly necessary as
>> the whole nmi_enter/nmi_exit() window is a spooky RCU read-side
>> critical section.
>>
>> Keep the oops_begin() call for x86, arm64 doesn't have one of these,
>> and APEI is the only thing outside arch code calling this..
>>
>> The existing ghes_estatus_pool_shrink() is folded into the new
>> ghes_estatus_queue_shrink_pool() as only the queue uses it.
>>
>> _in_nmi_notify_one() is separate from the rcu-list walker for a later
>> caller that doesn't need to walk a list.

>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index e42b587c509b..d3cc5bd5b496 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -749,6 +749,54 @@ static void __process_error(struct ghes *ghes)
>>  #endif
>>  }
>>  
>> +static int _in_nmi_notify_one(struct ghes *ghes)
>> +{
>> +	int sev;
>> +	int ret = -ENOENT;
> 
> If ret is initialised to 0 ...
> 
>> +
>> +	if (ghes_read_estatus(ghes, 1)) {
>> +		ghes_clear_estatus(ghes);
>> +		return ret;
> 
> and return -ENOENT here...
> 
>> +	} else {
>> +		ret = 0;
>> +	}
> 
> ... then the else block can be dropped.


Good point, this happened because I was trying to keep the same shape as the
existing notify_nmi() code as far as possible.


>> +
>> +	sev = ghes_severity(ghes->estatus->error_severity);
>> +	if (sev >= GHES_SEV_PANIC) {
>> +#ifdef CONFIG_X86
>> +		oops_begin();
>> +#endif
> 
> Can you use IS_ENABLED() here as well?

I didn't think that would build without an empty declaration for arm64, I
assumed it would generate an implicit-declaration-of warning. But, I've tried
it, and evidently today's toolchain does dead-code elimination before generating
implicit-declaration-of warnings...
I'd prefer to leave this (ugly as it is), to avoid warnings on a different
version of the compiler.


Thanks,

James
Borislav Petkov March 1, 2018, 3:01 p.m. UTC | #3
On Thu, Feb 15, 2018 at 06:55:57PM +0000, James Morse wrote:
> Keep the oops_begin() call for x86,

That oops_begin() in generic code is such a layering violation, grrr.

> arm64 doesn't have one of these,
> and APEI is the only thing outside arch code calling this..

So looking at:

arch/arm/kernel/traps.c:die()

it does call oops_begin() ... oops_end() just like the x86 version of
die().

I'm wondering if we could move the code to do die() in a prepatch? My
assumption is that all the arches should have die()... A quick grep does
show a bunch of other arches having die()...
Punit Agrawal March 1, 2018, 6:06 p.m. UTC | #4
Hi Borislav,

Borislav Petkov <bp@alien8.de> writes:

> On Thu, Feb 15, 2018 at 06:55:57PM +0000, James Morse wrote:
>> Keep the oops_begin() call for x86,
>
> That oops_begin() in generic code is such a layering violation, grrr.
>
>> arm64 doesn't have one of these,
>> and APEI is the only thing outside arch code calling this..
>
> So looking at:
>
> arch/arm/kernel/traps.c:die()
>
> it does call oops_begin() ... oops_end() just like the x86 version of
> die().

You're looking at support for the 32-bit ARM systems. The 64-bit support
lives in arch/arm64 and the die() there doesn't contain an
oops_begin()/oops_end(). But the lack of oops_begin() on arm64 doesn't
really matter here.

>
> I'm wondering if we could move the code to do die() in a prepatch? My
> assumption is that all the arches should have die()... A quick grep does
> show a bunch of other arches having die()...

One issue I see with calling die() is that it is defined in different
includes across various architectures, (e.g., include/asm/kdebug.h for
x86, include/asm/system_misc.h in arm64, etc.)
Borislav Petkov March 1, 2018, 10:35 p.m. UTC | #5
On Thu, Mar 01, 2018 at 06:06:59PM +0000, Punit Agrawal wrote:
> You're looking at support for the 32-bit ARM systems.

I know. That's why I'm asking.

> The 64-bit support lives in arch/arm64 and the die() there doesn't
> contain an oops_begin()/oops_end(). But the lack of oops_begin() on
> arm64 doesn't really matter here.

Yap.

> One issue I see with calling die() is that it is defined in different
> includes across various architectures, (e.g., include/asm/kdebug.h for
> x86, include/asm/system_misc.h in arm64, etc.)

I don't think that's insurmountable.

The more important question is, can we do the same set of calls when
panic severity on all architectures which support APEI or should we have
arch-specific ghes_panic() callbacks or so.

As it is now, it would turn into a mess if we start with the ifdeffery
and the different requirements architectures might have...

Thx.
James Morse March 7, 2018, 6:15 p.m. UTC | #6
Hi Borislav, Punit,

On 01/03/18 22:35, Borislav Petkov wrote:
> On Thu, Mar 01, 2018 at 06:06:59PM +0000, Punit Agrawal wrote:
>> The 64-bit support lives in arch/arm64 and the die() there doesn't
>> contain an oops_begin()/oops_end(). But the lack of oops_begin() on
>> arm64 doesn't really matter here.

>> One issue I see with calling die() is that it is defined in different
>> includes across various architectures, (e.g., include/asm/kdebug.h for
>> x86, include/asm/system_misc.h in arm64, etc.)
> 
> I don't think that's insurmountable.

I don't think die() helps us, its not quite the same as oops_begin()/panic(),
which means we're interpreting the APEI notification's severity differently,
depending on when we took it.


> The more important question is, can we do the same set of calls when
> panic severity on all architectures which support APEI or should we have
> arch-specific ghes_panic() callbacks or so.

I think the purpose of this oops_begin() is to ensure two CPUs calling
oops_begin() at the same time don't have their traces interleaved, unblanks the
screen and 'busts' any spinlocks printk() may need (console etc).

This code is called in_nmi(), printk() now supports this so it doesn't need its
locks busting.
When called in_nmi(), printk batches the messages into its per-cpu
printk_safe_seq_buf, which in our case is dumped by panic() using
printk_safe_flush_on_panic(). So provided we call panic(), the in_nmi() messages
from ghes.c are already batched, and printed behind panic()'s atomic_cmpxchg()
exclusion thing.

If your arm64 system has one of these futuristic 'screens', they get unblanked
when panic() calls console_verbose() and bust_spinlocks(1).


> As it is now, it would turn into a mess if we start with the ifdeffery
> and the different requirements architectures might have...

Today its just x86 and arm64. arm64 doesn't have a hook to do this. I'm happy to
add an empty declaration or leave it under an ifdef until someone complains
about any behaviour I missed!


Thanks,

James
Borislav Petkov March 8, 2018, 10:44 a.m. UTC | #7
On Wed, Mar 07, 2018 at 06:15:02PM +0000, James Morse wrote:
> Today its just x86 and arm64. arm64 doesn't have a hook to do this. I'm happy to
> add an empty declaration or leave it under an ifdef until someone complains
> about any behaviour I missed!

So I did some more staring at the code and I think oops_begin() is
needed mainly, as you point out, to prevent two oops messages from
interleaving. And yap, the other stuff with printk() is not true anymore
because the commit which added oops_begin():

  81e88fdc432a ("ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support")

still saw an NMI-unsafe printk. Which is long taken care of now.

So only the interleaving issue remains.

Which begs the question: how are you guys preventing the interleaving on
arm64? Because arch/arm64/kernel/traps.c:200 grabs the die_lock too, so
interleaving can happen on arm64 too, AFAICT.

And by that logic, you should technically grab that lock here too in
_in_nmi_notify_one().

Or?
James Morse March 19, 2018, 2:29 p.m. UTC | #8
Hi Borislav,

On 08/03/18 10:44, Borislav Petkov wrote:
> On Wed, Mar 07, 2018 at 06:15:02PM +0000, James Morse wrote:
>> Today its just x86 and arm64. arm64 doesn't have a hook to do this. I'm happy to
>> add an empty declaration or leave it under an ifdef until someone complains
>> about any behaviour I missed!
> 
> So I did some more staring at the code and I think oops_begin() is
> needed mainly, as you point out, to prevent two oops messages from
> interleaving. And yap, the other stuff with printk() is not true anymore
> because the commit which added oops_begin():
> 
>   81e88fdc432a ("ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support")
> 
> still saw an NMI-unsafe printk. Which is long taken care of now.
> 
> So only the interleaving issue remains.
> 
> Which begs the question: how are you guys preventing the interleaving on
> arm64? Because arch/arm64/kernel/traps.c:200 grabs the die_lock too, so
> interleaving can happen on arm64 too, AFAICT.

die() messages are stopped from interleaving with each other by that die_lock.
panic()s atomic_cmpxchg() then panic_smp_self_stop() means panic() is
first-past-the-post.

So our problem is interleaving of the two. The sequence is roughly:
1. oops_begin(); // I'm going to panic()
2. printk(some stuff);
3. panic();

Everything we print at (2) gets batched up by vprintk_nmi(), and is only printed
from (3) when we call printk_safe_flush_on_panic().

... and now I spot there are two calls to printk_safe_flush_on_panic(), one of
which happens before any smp_send_stop() calls.

This means we can get interleaving with panic() as we flush the printk_safe
buffer before smp_send_stop(), and even if we change that a remote CPU may
refuse to die. (Both x86 and arm64 have timeouts in their smp_send_stop() code).


> And by that logic, you should technically grab that lock here too in
> _in_nmi_notify_one().

I don't think the die_lock really helps here, do we really want to wait for a
remote CPU to finish printing an OOPs about user-space's bad memory accesses,
before we bring the machine down due to this system-wide fatal RAS error? The
presence of firmware-first means we know this error, and any other oops are
unrelated.

Grabbing the die_lock doesn't stop remote CPUs printing messages via a mechanism
other than die()/_in_nmi_notify_one(). I think oops_begin() is just plastering
over a problem. (how come this exclusion isn't done by oops_enter()/oops_exit()?)

Isn't oops_begin() trying to guarantee any messages printk()d by this CPU appear
'with' the subsequent panic()? I can't see any way to stop a remote CPU from
messing this up by printk()ing in a loop with interrupts masked, preventing us
from smp_send_stop()ing it, and making it difficult to take the lock.

I'd like to leave this under the x86-ifdef for now. For arm64 it would be an
APEI specific arch hook to stop the arch code from printing some messages,
meanwhile the rest of the kernel is unaffected. I suspect this sort of thing
really needs support from printk(). (maybe some printk() severity that mutes
other CPUs, or redirects them to the printk_safe buffer).


Thanks,

James
Borislav Petkov March 27, 2018, 5:25 p.m. UTC | #9
Hi James,

On Mon, Mar 19, 2018 at 02:29:13PM +0000, James Morse wrote:
> I don't think the die_lock really helps here, do we really want to wait for a
> remote CPU to finish printing an OOPs about user-space's bad memory accesses,
> before we bring the machine down due to this system-wide fatal RAS error? The
> presence of firmware-first means we know this error, and any other oops are
> unrelated.

Hmm, now that you put it this way...

> I'd like to leave this under the x86-ifdef for now. For arm64 it would be an
> APEI specific arch hook to stop the arch code from printing some messages,

... I'm thinking we should ignore the whole serializing of oopses and
really dump that hw error ASAP. If it really is a fatal error, our main
and only goal is to get it out as fast as possible so that it has the
highest chance to appear on some screen or logging facility and thus the
system can be serviced successfully.

And the other oopses have lower prio.

Hmmm?
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e42b587c509b..d3cc5bd5b496 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -749,6 +749,54 @@  static void __process_error(struct ghes *ghes)
 #endif
 }
 
+static int _in_nmi_notify_one(struct ghes *ghes)
+{
+	int sev;
+	int ret = -ENOENT;
+
+	if (ghes_read_estatus(ghes, 1)) {
+		ghes_clear_estatus(ghes);
+		return ret;
+	} else {
+		ret = 0;
+	}
+
+	sev = ghes_severity(ghes->estatus->error_severity);
+	if (sev >= GHES_SEV_PANIC) {
+#ifdef CONFIG_X86
+		oops_begin();
+#endif
+		ghes_print_queued_estatus();
+		__ghes_panic(ghes);
+	}
+
+	if (!(ghes->flags & GHES_TO_CLEAR))
+		return ret;
+
+	__process_error(ghes);
+	ghes_clear_estatus(ghes);
+
+	return ret;
+}
+
+static int ghes_estatus_queue_notified(struct list_head *rcu_list)
+{
+	int ret = -ENOENT;
+	struct ghes *ghes;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ghes, rcu_list, list) {
+		if (!_in_nmi_notify_one(ghes))
+			ret = 0;
+	}
+	rcu_read_unlock();
+
+	if (IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && ret == 0)
+		irq_work_queue(&ghes_proc_irq_work);
+
+	return ret;
+}
+
 static unsigned long ghes_esource_prealloc_size(
 	const struct acpi_hest_generic *generic)
 {
@@ -764,11 +812,24 @@  static unsigned long ghes_esource_prealloc_size(
 	return prealloc_size;
 }
 
-static void ghes_estatus_pool_shrink(unsigned long len)
+/* After removing a queue user, we can shrink to pool */
+static void ghes_estatus_queue_shrink_pool(struct ghes *ghes)
 {
+	unsigned long len;
+
+	len = ghes_esource_prealloc_size(ghes->generic);
 	ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
 }
 
+/* Before adding a queue user, grow the pool */
+static void ghes_estatus_queue_grow_pool(struct ghes *ghes)
+{
+	unsigned long len;
+
+	len = ghes_esource_prealloc_size(ghes->generic);
+	ghes_estatus_pool_expand(len);
+}
+
 static void ghes_proc_in_irq(struct irq_work *irq_work)
 {
 	struct llist_node *llnode, *next;
@@ -967,48 +1028,22 @@  static LIST_HEAD(ghes_nmi);
 
 static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 {
-	struct ghes *ghes;
-	int sev, ret = NMI_DONE;
+	int ret = NMI_DONE;
 
 	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
 		return ret;
 
-	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
-		if (ghes_read_estatus(ghes, 1)) {
-			ghes_clear_estatus(ghes);
-			continue;
-		} else {
-			ret = NMI_HANDLED;
-		}
-
-		sev = ghes_severity(ghes->estatus->error_severity);
-		if (sev >= GHES_SEV_PANIC) {
-			oops_begin();
-			ghes_print_queued_estatus();
-			__ghes_panic(ghes);
-		}
-
-		if (!(ghes->flags & GHES_TO_CLEAR))
-			continue;
-
-		__process_error(ghes);
-		ghes_clear_estatus(ghes);
-	}
+	if (!ghes_estatus_queue_notified(&ghes_nmi))
+		ret = NMI_HANDLED;
 
-#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-	if (ret == NMI_HANDLED)
-		irq_work_queue(&ghes_proc_irq_work);
-#endif
 	atomic_dec(&ghes_in_nmi);
 	return ret;
 }
 
 static void ghes_nmi_add(struct ghes *ghes)
 {
-	unsigned long len;
+	ghes_estatus_queue_grow_pool(ghes);
 
-	len = ghes_esource_prealloc_size(ghes->generic);
-	ghes_estatus_pool_expand(len);
 	mutex_lock(&ghes_list_mutex);
 	if (list_empty(&ghes_nmi))
 		register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes");
@@ -1018,8 +1053,6 @@  static void ghes_nmi_add(struct ghes *ghes)
 
 static void ghes_nmi_remove(struct ghes *ghes)
 {
-	unsigned long len;
-
 	mutex_lock(&ghes_list_mutex);
 	list_del_rcu(&ghes->list);
 	if (list_empty(&ghes_nmi))
@@ -1030,8 +1063,8 @@  static void ghes_nmi_remove(struct ghes *ghes)
 	 * freed after NMI handler finishes.
 	 */
 	synchronize_rcu();
-	len = ghes_esource_prealloc_size(ghes->generic);
-	ghes_estatus_pool_shrink(len);
+
+	ghes_estatus_queue_shrink_pool(ghes);
 }
 
 #else /* CONFIG_HAVE_ACPI_APEI_NMI */