diff mbox

[v3,7/8] arm64: exception: handle asynchronous SError interrupt

Message ID 1490869877-118713-8-git-send-email-xiexiuqi@huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xie XiuQi March 30, 2017, 10:31 a.m. UTC
Error Synchronization Barrier (ESB; part of the ARMv8.2 Extensions)
is used to synchronize Unrecoverable errors. That is, containable errors
architecturally consumed by the PE and not silently propagated.

With ESB it is generally possible to isolate an unrecoverable error
between two ESB instructions. So, it's possible to recovery from
unrecoverable errors reported by asynchronous SError interrupt.

If ARMv8.2 RAS Extension is not support, ESB is treated as a NOP.

Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Wang Xiongfeng <wangxiongfengi2@huawei.com>
---
 arch/arm64/Kconfig           | 16 ++++++++++
 arch/arm64/include/asm/esr.h | 14 +++++++++
 arch/arm64/kernel/entry.S    | 70 ++++++++++++++++++++++++++++++++++++++++++--
 arch/arm64/kernel/traps.c    | 54 ++++++++++++++++++++++++++++++++--
 4 files changed, 150 insertions(+), 4 deletions(-)

Comments

Xiongfeng Wang April 13, 2017, 8:44 a.m. UTC | #1
Hi Xiuqi,

On 2017/3/30 18:31, Xie XiuQi wrote:
> Error Synchronization Barrier (ESB; part of the ARMv8.2 Extensions)
> is used to synchronize Unrecoverable errors. That is, containable errors
> architecturally consumed by the PE and not silently propagated.
> 
> With ESB it is generally possible to isolate an unrecoverable error
> between two ESB instructions. So, it's possible to recovery from

>  /* ISS field definitions for exceptions taken in to Hyp */
>  #define ESR_ELx_CV		(UL(1) << 24)
>  #define ESR_ELx_COND_SHIFT	(20)
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 43512d4..d8a7306 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -69,7 +69,14 @@
>  #define BAD_FIQ		2
>  #define BAD_ERROR	3
>  
> +	.arch_extension ras
> +
>  	.macro	kernel_entry, el, regsize = 64
> +#ifdef CONFIG_ARM64_ESB
> +	.if	\el == 0
> +	esb
> +	.endif
> +#endif
>  	sub	sp, sp, #S_FRAME_SIZE
>  	.if	\regsize == 32
>  	mov	w0, w0				// zero upper 32 bits of x0
> @@ -208,6 +215,7 @@ alternative_else_nop_endif
>  #endif
>  
>  	.if	\el == 0
> +	msr	daifset, #0xF			// Set flags
>  	ldr	x23, [sp, #S_SP]		// load return stack pointer
>  	msr	sp_el0, x23
>  #ifdef CONFIG_ARM64_ERRATUM_845719
> @@ -226,6 +234,15 @@ alternative_else_nop_endif
>  
>  	msr	elr_el1, x21			// set up the return data
>  	msr	spsr_el1, x22
> +
> +#ifdef CONFIG_ARM64_ESB
> +	.if \el == 0
> +	esb					// Error Synchronization Barrier
> +	mrs	x21, disr_el1			// Check for deferred error
> +	tbnz	x21, #31, el1_sei

We may need to clear disr_el1.A after reading it because the hardware won't clear it.

> +	.endif
> +#endif
> +
>  	ldp	x0, x1, [sp, #16 * 0]
>  	ldp	x2, x3, [sp, #16 * 1]
>  	ldp	x4, x5, [sp, #16 * 2]
> @@ -318,7 +335,7 @@ ENTRY(vectors)
>  	ventry	el1_sync_invalid		// Synchronous EL1t
>  	ventry	el1_irq_invalid			// IRQ EL1t
>  	ventry	el1_fiq_invalid			// FIQ EL1t
> -	ventry	el1_error_invalid		// Error EL1t
> +	ventry	el1_error			// Error EL1t
>  
>  	ventry	el1_sync			// Synchronous EL1h
>  	ventry	el1_irq				// IRQ EL1h
> @@ -328,7 +345,7 @@ ENTRY(vectors)
>  	ventry	el0_sync			// Synchronous 64-bit EL0
>  	ventry	el0_irq				// IRQ 64-bit EL0
>  	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
> -	ventry	el0_error_invalid		// Error 64-bit EL0
> +	ventry	el0_error			// Error 64-bit EL0
>  
>  #ifdef CONFIG_COMPAT
>  	ventry	el0_sync_compat			// Synchronous 32-bit EL0
> @@ -508,12 +525,31 @@ el1_preempt:
>  	ret	x24
>  #endif
>  
> +	.align	6
> +el1_error:
> +	kernel_entry 1
> +el1_sei:
> +	/*
> +	 * asynchronous SError interrupt from kernel
> +	 */
> +	mov	x0, sp
> +	mrs	x1, esr_el1
> +	mov	x2, #1				// exception level of SEI generated
> +	b	do_sei
> +ENDPROC(el1_error)
> +
> +
>  /*
>   * EL0 mode handlers.
>   */
>  	.align	6
>  el0_sync:
>  	kernel_entry 0
> +#ifdef CONFIG_ARM64_ESB
> +	mrs     x26, disr_el1
> +	tbnz    x26, #31, el0_sei		// check DISR.A
> +	msr	daifclr, #0x4			// unmask SEI
> +#endif
>  	mrs	x25, esr_el1			// read the syndrome register
>  	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
>  	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state
> @@ -688,8 +724,38 @@ el0_inv:
>  ENDPROC(el0_sync)
>  
>  	.align	6
> +el0_error:
> +	kernel_entry 0
> +el0_sei:
> +	/*
> +	 * asynchronous SError interrupt from userspace
> +	 */
> +	ct_user_exit
> +	mov	x0, sp
> +	mrs	x1, esr_el1
> +	mov	x2, #0
> +	bl	do_sei
> +	b	ret_to_user
> +ENDPROC(el0_error)
> +
> +	.align	6
>  el0_irq:
>  	kernel_entry 0
> +#ifdef CONFIG_ARM64_ESB
> +	mrs     x26, disr_el1
> +	tbz     x26, #31, el0_irq_naked          // check DISR.A
> +
> +	mov	x0, sp
> +	mrs	x1, esr_el1
> +	mov	x2, 0
> +
> +	/*
> +	 * The SEI generated at EL0 is not affect this irq context,
> +	 * so after sei handler, we continue process this irq.
> +	 */
> +	bl	do_sei
> +	msr     daifclr, #0x4                   // unmask SEI
> +#endif
>  el0_irq_naked:
>  	enable_dbg
>  #ifdef CONFIG_TRACE_IRQFLAGS

Thanks,
Wang Xiongfeng
Mark Rutland April 13, 2017, 10:51 a.m. UTC | #2
Hi,

On Thu, Mar 30, 2017 at 06:31:07PM +0800, Xie XiuQi wrote:
> diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
> index f20c64a..22f9c90 100644
> --- a/arch/arm64/include/asm/esr.h
> +++ b/arch/arm64/include/asm/esr.h
> @@ -106,6 +106,20 @@
>  #define ESR_ELx_AR 		(UL(1) << 14)
>  #define ESR_ELx_CM 		(UL(1) << 8)
>  
> +#define ESR_Elx_DFSC_SEI	(0x11)

We should probably have a definition for the uncategorized DFSC value,
too.

[...]

> index 43512d4..d8a7306 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -69,7 +69,14 @@
>  #define BAD_FIQ		2
>  #define BAD_ERROR	3
>  
> +	.arch_extension ras

Generally, arch_extension is a warning sign that code isn't going to
work with contemporary assemblers, which we likely need to support.

> +
>  	.macro	kernel_entry, el, regsize = 64
> +#ifdef CONFIG_ARM64_ESB
> +	.if	\el == 0
> +	esb

Here, I think that we'll need to macro this such that we can build with
existing toolchains.

e.g. in <asm/assembler.h> we need something like:

	#define HINT_IMM_ESB	16

	.macro ESB
	hint	#HINT_IMM_ESB
	.endm

> +	.endif
> +#endif
>  	sub	sp, sp, #S_FRAME_SIZE
>  	.if	\regsize == 32
>  	mov	w0, w0				// zero upper 32 bits of x0
> @@ -208,6 +215,7 @@ alternative_else_nop_endif
>  #endif
>  
>  	.if	\el == 0
> +	msr	daifset, #0xF			// Set flags

Elsewhere in head.S we use helpers to fiddle with DAIF bits.

Please be consistent with that. Add an enable_all macro if we need one.

>  	ldr	x23, [sp, #S_SP]		// load return stack pointer
>  	msr	sp_el0, x23
>  #ifdef CONFIG_ARM64_ERRATUM_845719
> @@ -226,6 +234,15 @@ alternative_else_nop_endif
>  
>  	msr	elr_el1, x21			// set up the return data
>  	msr	spsr_el1, x22
> +
> +#ifdef CONFIG_ARM64_ESB
> +	.if \el == 0
> +	esb					// Error Synchronization Barrier
> +	mrs	x21, disr_el1			// Check for deferred error

We'll need an <asm/sysreg.h> definition for this register. With that, we
can use mrs_s here.

> +	tbnz	x21, #31, el1_sei
> +	.endif
> +#endif
> +
>  	ldp	x0, x1, [sp, #16 * 0]
>  	ldp	x2, x3, [sp, #16 * 1]
>  	ldp	x4, x5, [sp, #16 * 2]
> @@ -318,7 +335,7 @@ ENTRY(vectors)
>  	ventry	el1_sync_invalid		// Synchronous EL1t
>  	ventry	el1_irq_invalid			// IRQ EL1t
>  	ventry	el1_fiq_invalid			// FIQ EL1t
> -	ventry	el1_error_invalid		// Error EL1t
> +	ventry	el1_error			// Error EL1t
>  
>  	ventry	el1_sync			// Synchronous EL1h
>  	ventry	el1_irq				// IRQ EL1h
> @@ -328,7 +345,7 @@ ENTRY(vectors)
>  	ventry	el0_sync			// Synchronous 64-bit EL0
>  	ventry	el0_irq				// IRQ 64-bit EL0
>  	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
> -	ventry	el0_error_invalid		// Error 64-bit EL0
> +	ventry	el0_error			// Error 64-bit EL0
>  
>  #ifdef CONFIG_COMPAT
>  	ventry	el0_sync_compat			// Synchronous 32-bit EL0
> @@ -508,12 +525,31 @@ el1_preempt:
>  	ret	x24
>  #endif
>  
> +	.align	6
> +el1_error:
> +	kernel_entry 1
> +el1_sei:
> +	/*
> +	 * asynchronous SError interrupt from kernel
> +	 */
> +	mov	x0, sp
> +	mrs	x1, esr_el1

I don't think this is correct if we branched here from kernel_exit.
Surely we want the DISR_EL1 value, and ESR_EL1 is unrelated?

> +	mov	x2, #1				// exception level of SEI generated
> +	b	do_sei

You don't need to figure out the EL here. In do_sei() we can determine
the exception level from the regs (e.g. using user_mode(regs)).

> +ENDPROC(el1_error)
> +
> +
>  /*
>   * EL0 mode handlers.
>   */
>  	.align	6
>  el0_sync:
>  	kernel_entry 0
> +#ifdef CONFIG_ARM64_ESB
> +	mrs     x26, disr_el1
> +	tbnz    x26, #31, el0_sei		// check DISR.A
> +	msr	daifclr, #0x4			// unmask SEI
> +#endif

Why do we duplicate this across the EL0 handlers, rather than making it
common in the el0 kernel_entry code?

>  	mrs	x25, esr_el1			// read the syndrome register
>  	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
>  	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state
> @@ -688,8 +724,38 @@ el0_inv:
>  ENDPROC(el0_sync)
>  
>  	.align	6
> +el0_error:
> +	kernel_entry 0
> +el0_sei:
> +	/*
> +	 * asynchronous SError interrupt from userspace
> +	 */
> +	ct_user_exit
> +	mov	x0, sp
> +	mrs	x1, esr_el1

As with el1_sei, I don't think this is correct if we branched to
el0_sei. As far as I am aware, ESR_EL1 will contain whatever exception
we took, and the value we want is in DISR_EL1.

> +	mov	x2, #0

This EL parameter can go.

> +	bl	do_sei
> +	b	ret_to_user
> +ENDPROC(el0_error)
> +
> +	.align	6
>  el0_irq:
>  	kernel_entry 0
> +#ifdef CONFIG_ARM64_ESB
> +	mrs     x26, disr_el1
> +	tbz     x26, #31, el0_irq_naked          // check DISR.A
> +
> +	mov	x0, sp
> +	mrs	x1, esr_el1
> +	mov	x2, 0
> +
> +	/*
> +	 * The SEI generated at EL0 is not affect this irq context,
> +	 * so after sei handler, we continue process this irq.
> +	 */
> +	bl	do_sei
> +	msr     daifclr, #0x4                   // unmask SEI

This rough pattern is duplicated several times across the EL0 entry
paths. I think it should be made common.

> +#endif
>  el0_irq_naked:
>  	enable_dbg
>  #ifdef CONFIG_TRACE_IRQFLAGS
> diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> index b6d6727..99be6d8 100644
> --- a/arch/arm64/kernel/traps.c
> +++ b/arch/arm64/kernel/traps.c
> @@ -643,6 +643,34 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
>  		handler[reason], smp_processor_id(), esr,
>  		esr_get_class_string(esr));
>  
> +	die("Oops - bad mode", regs, 0);
> +	local_irq_disable();
> +	panic("bad mode");
> +}
> +
> +static const char *sei_context[] = {
> +	"userspace",			/* EL0 */
> +	"kernel",			/* EL1 */
> +};

This should go. It's only used in one place, and would be clearer with
the strings inline. More on that below.

> +
> +static const char *sei_severity[] = {

Please name this for what it actually represents:

static const char *esr_aet_str[] = {

> +	[0 ... ESR_ELx_AET_MAX] =	"Unknown",

For consistency with esr_class_str, please make this:

	[0 ... ESR_ELx_AET_MAX] =	"UNRECOGNIZED AET",

... which makes it clear that this isn't some AET value which reports an
"Unknown" status.

> +	[ESR_ELx_AET_UC]	=	"Uncontainable",
> +	[ESR_ELx_AET_UEU]	=	"Unrecoverable",
> +	[ESR_ELx_AET_UEO]	=	"Restartable",
> +	[ESR_ELx_AET_UER]	=	"Recoverable",
> +	[ESR_ELx_AET_CE]	=	"Corrected",
> +};
> +
> +DEFINE_PER_CPU(int, sei_in_process);

A previous patch added definition of this.

> +asmlinkage void do_sei(struct pt_regs *regs, unsigned int esr, int el)
> +{
> +	int aet = ESR_ELx_AET(esr);

The AET field is only valid when the DFSC is 0b010001, so we need to
check that before we interpret AET.

> +	console_verbose();
> +
> +	pr_crit("Asynchronous SError interrupt detected on CPU%d, %s, %s\n",
> +		smp_processor_id(), sei_context[el], sei_severity[aet]);

We should dump the full ESR_ELx value, regardless of what automated
decoding we do, so that we have a chance of debugging issues in the
field.

It would also be nice to align with how bad_mode reports this today.
Please make this:

	pr_crit("SError detected on CPU%d while in %s mode: code: 0x%08x -- %s\n",
		smp_processor_id(), user_mode(regs) ? "user" : "kernel",
		esr, esr_aet_str[aet]);

... though it might be best to dump the raw SPSR rather than trying to
say user/kernel, so that we can distinguish EL1/EL2 with VHE, etc.

> +
>  	/*
>  	 * In firmware first mode, we could assume firmware will only generate one
>  	 * of cper records at a time. There is no risk for one cpu to parse ghes table.
> @@ -653,9 +681,31 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
>  		this_cpu_dec(sei_in_process);
>  	}
>  
> -	die("Oops - bad mode", regs, 0);
> +	if (el == 0 && IS_ENABLED(CONFIG_ARM64_ESB) &&

Please use user_mode(regs), and get rid of the el parameter to this
function entirely.

> +	    cpus_have_cap(ARM64_HAS_RAS_EXTN)) {
> +		siginfo_t info;
> +		void __user *pc = (void __user *)instruction_pointer(regs);
> +
> +		if (aet >= ESR_ELx_AET_UEO)
> +			return;

We need to check the DFSC first, and 0b111 is a reserved value (which
the ARM ARM doesn't define the recoverability of), so I don't think this
is correct.

We should probably test the DSFC, then switch on the AET value, so as to
handle only the cases we are aware of.

> +
> +		if (aet == ESR_ELx_AET_UEU) {
> +			info.si_signo = SIGILL;
> +			info.si_errno = 0;
> +			info.si_code  = ILL_ILLOPC;
> +			info.si_addr  = pc;

An unrecoverable error is not necessarily a particular bad instruction,
so I'm not sure this makes sense.

Thanks,
Mark.
Xie XiuQi April 14, 2017, 7:03 a.m. UTC | #3
Hi Mark,

Thanks for your comments.

On 2017/4/13 18:51, Mark Rutland wrote:
> Hi,
>
> On Thu, Mar 30, 2017 at 06:31:07PM +0800, Xie XiuQi wrote:
>> diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
>> index f20c64a..22f9c90 100644
>> --- a/arch/arm64/include/asm/esr.h
>> +++ b/arch/arm64/include/asm/esr.h
>> @@ -106,6 +106,20 @@
>>  #define ESR_ELx_AR 		(UL(1) << 14)
>>  #define ESR_ELx_CM 		(UL(1) << 8)
>>
>> +#define ESR_Elx_DFSC_SEI	(0x11)
>
> We should probably have a definition for the uncategorized DFSC value,
> too.
>

Will do, thanks.

How about "#define ESR_Elx_DFSC_UNCATEGORIZED	(0)" ?

> [...]
>
>> index 43512d4..d8a7306 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -69,7 +69,14 @@
>>  #define BAD_FIQ		2
>>  #define BAD_ERROR	3
>>
>> +	.arch_extension ras
>
> Generally, arch_extension is a warning sign that code isn't going to
> work with contemporary assemblers, which we likely need to support.
>
>> +
>>  	.macro	kernel_entry, el, regsize = 64
>> +#ifdef CONFIG_ARM64_ESB
>> +	.if	\el == 0
>> +	esb
>
> Here, I think that we'll need to macro this such that we can build with
> existing toolchains.
>
> e.g. in <asm/assembler.h> we need something like:
>
> 	#define HINT_IMM_ESB	16
>
> 	.macro ESB
> 	hint	#HINT_IMM_ESB
> 	.endm
>

Good, thanks for your suggestion. I'll use this macro in next versin.

>> +	.endif
>> +#endif
>>  	sub	sp, sp, #S_FRAME_SIZE
>>  	.if	\regsize == 32
>>  	mov	w0, w0				// zero upper 32 bits of x0
>> @@ -208,6 +215,7 @@ alternative_else_nop_endif
>>  #endif
>>
>>  	.if	\el == 0
>> +	msr	daifset, #0xF			// Set flags
>
> Elsewhere in head.S we use helpers to fiddle with DAIF bits.
>
> Please be consistent with that. Add an enable_all macro if we need one.

OK, I'll do it refer to head.S.

>
>>  	ldr	x23, [sp, #S_SP]		// load return stack pointer
>>  	msr	sp_el0, x23
>>  #ifdef CONFIG_ARM64_ERRATUM_845719
>> @@ -226,6 +234,15 @@ alternative_else_nop_endif
>>
>>  	msr	elr_el1, x21			// set up the return data
>>  	msr	spsr_el1, x22
>> +
>> +#ifdef CONFIG_ARM64_ESB
>> +	.if \el == 0
>> +	esb					// Error Synchronization Barrier
>> +	mrs	x21, disr_el1			// Check for deferred error
>
> We'll need an <asm/sysreg.h> definition for this register. With that, we
> can use mrs_s here.

OK, thanks.

>
>> +	tbnz	x21, #31, el1_sei
>> +	.endif
>> +#endif
>> +
>>  	ldp	x0, x1, [sp, #16 * 0]
>>  	ldp	x2, x3, [sp, #16 * 1]
>>  	ldp	x4, x5, [sp, #16 * 2]
>> @@ -318,7 +335,7 @@ ENTRY(vectors)
>>  	ventry	el1_sync_invalid		// Synchronous EL1t
>>  	ventry	el1_irq_invalid			// IRQ EL1t
>>  	ventry	el1_fiq_invalid			// FIQ EL1t
>> -	ventry	el1_error_invalid		// Error EL1t
>> +	ventry	el1_error			// Error EL1t
>>
>>  	ventry	el1_sync			// Synchronous EL1h
>>  	ventry	el1_irq				// IRQ EL1h
>> @@ -328,7 +345,7 @@ ENTRY(vectors)
>>  	ventry	el0_sync			// Synchronous 64-bit EL0
>>  	ventry	el0_irq				// IRQ 64-bit EL0
>>  	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
>> -	ventry	el0_error_invalid		// Error 64-bit EL0
>> +	ventry	el0_error			// Error 64-bit EL0
>>
>>  #ifdef CONFIG_COMPAT
>>  	ventry	el0_sync_compat			// Synchronous 32-bit EL0
>> @@ -508,12 +525,31 @@ el1_preempt:
>>  	ret	x24
>>  #endif
>>
>> +	.align	6
>> +el1_error:
>> +	kernel_entry 1
>> +el1_sei:
>> +	/*
>> +	 * asynchronous SError interrupt from kernel
>> +	 */
>> +	mov	x0, sp
>> +	mrs	x1, esr_el1
>
> I don't think this is correct if we branched here from kernel_exit.
> Surely we want the DISR_EL1 value, and ESR_EL1 is unrelated?

Yes, indeed. I'll change it in next version.

>
>> +	mov	x2, #1				// exception level of SEI generated
>> +	b	do_sei
>
> You don't need to figure out the EL here. In do_sei() we can determine
> the exception level from the regs (e.g. using user_mode(regs)).

Yes, you're right. I'll fix it.

>
>> +ENDPROC(el1_error)
>> +
>> +
>>  /*
>>   * EL0 mode handlers.
>>   */
>>  	.align	6
>>  el0_sync:
>>  	kernel_entry 0
>> +#ifdef CONFIG_ARM64_ESB
>> +	mrs     x26, disr_el1
>> +	tbnz    x26, #31, el0_sei		// check DISR.A
>> +	msr	daifclr, #0x4			// unmask SEI
>> +#endif
>
> Why do we duplicate this across the EL0 handlers, rather than making it
> common in the el0 kernel_entry code?

It's different between el0_sync and el0_irq. If sei come from el0_sync,
the context is the same process, so we could just return to user space
after do_sei, instead of continue the syscall. The process may be killed
very likely, it's not necessary to continue the system call.

However, if sei come from el0_irq, in the irq context which is not related
the current process, we should continue the irq handler after do_sei.

So I think we should handle these two situation differently. If I'm wrong,
please correct me, Thanks.

>
>>  	mrs	x25, esr_el1			// read the syndrome register
>>  	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
>>  	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state
>> @@ -688,8 +724,38 @@ el0_inv:
>>  ENDPROC(el0_sync)
>>
>>  	.align	6
>> +el0_error:
>> +	kernel_entry 0
>> +el0_sei:
>> +	/*
>> +	 * asynchronous SError interrupt from userspace
>> +	 */
>> +	ct_user_exit
>> +	mov	x0, sp
>> +	mrs	x1, esr_el1
>
> As with el1_sei, I don't think this is correct if we branched to
> el0_sei. As far as I am aware, ESR_EL1 will contain whatever exception
> we took, and the value we want is in DISR_EL1.

Will fix, thanks.

>
>> +	mov	x2, #0
>
> This EL parameter can go.
>
>> +	bl	do_sei
>> +	b	ret_to_user
>> +ENDPROC(el0_error)
>> +
>> +	.align	6
>>  el0_irq:
>>  	kernel_entry 0
>> +#ifdef CONFIG_ARM64_ESB
>> +	mrs     x26, disr_el1
>> +	tbz     x26, #31, el0_irq_naked          // check DISR.A
>> +
>> +	mov	x0, sp
>> +	mrs	x1, esr_el1
>> +	mov	x2, 0
>> +
>> +	/*
>> +	 * The SEI generated at EL0 is not affect this irq context,
>> +	 * so after sei handler, we continue process this irq.
>> +	 */
>> +	bl	do_sei
>> +	msr     daifclr, #0x4                   // unmask SEI
>
> This rough pattern is duplicated several times across the EL0 entry
> paths. I think it should be made common.

I agree, I'll do it in next version, thanks.

>
>> +#endif
>>  el0_irq_naked:
>>  	enable_dbg
>>  #ifdef CONFIG_TRACE_IRQFLAGS
>> diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
>> index b6d6727..99be6d8 100644
>> --- a/arch/arm64/kernel/traps.c
>> +++ b/arch/arm64/kernel/traps.c
>> @@ -643,6 +643,34 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
>>  		handler[reason], smp_processor_id(), esr,
>>  		esr_get_class_string(esr));
>>
>> +	die("Oops - bad mode", regs, 0);
>> +	local_irq_disable();
>> +	panic("bad mode");
>> +}
>> +
>> +static const char *sei_context[] = {
>> +	"userspace",			/* EL0 */
>> +	"kernel",			/* EL1 */
>> +};
>
> This should go. It's only used in one place, and would be clearer with
> the strings inline. More on that below.
>

OK, thanks.

>> +
>> +static const char *sei_severity[] = {
>
> Please name this for what it actually represents:
>
> static const char *esr_aet_str[] = {
>
>> +	[0 ... ESR_ELx_AET_MAX] =	"Unknown",
>
> For consistency with esr_class_str, please make this:
>
> 	[0 ... ESR_ELx_AET_MAX] =	"UNRECOGNIZED AET",
>
> ... which makes it clear that this isn't some AET value which reports an
> "Unknown" status.
>

OK, thanks.

>> +	[ESR_ELx_AET_UC]	=	"Uncontainable",
>> +	[ESR_ELx_AET_UEU]	=	"Unrecoverable",
>> +	[ESR_ELx_AET_UEO]	=	"Restartable",
>> +	[ESR_ELx_AET_UER]	=	"Recoverable",
>> +	[ESR_ELx_AET_CE]	=	"Corrected",
>> +};
>> +
>> +DEFINE_PER_CPU(int, sei_in_process);
>
> A previous patch added definition of this.
>

I'll remote it, thanks.

>> +asmlinkage void do_sei(struct pt_regs *regs, unsigned int esr, int el)
>> +{
>> +	int aet = ESR_ELx_AET(esr);
>
> The AET field is only valid when the DFSC is 0b010001, so we need to
> check that before we interpret AET.
>

Will fix.

>> +	console_verbose();
>> +
>> +	pr_crit("Asynchronous SError interrupt detected on CPU%d, %s, %s\n",
>> +		smp_processor_id(), sei_context[el], sei_severity[aet]);
>
> We should dump the full ESR_ELx value, regardless of what automated
> decoding we do, so that we have a chance of debugging issues in the
> field.
>
> It would also be nice to align with how bad_mode reports this today.
> Please make this:
>
> 	pr_crit("SError detected on CPU%d while in %s mode: code: 0x%08x -- %s\n",
> 		smp_processor_id(), user_mode(regs) ? "user" : "kernel",
> 		esr, esr_aet_str[aet]);
>
> ... though it might be best to dump the raw SPSR rather than trying to
> say user/kernel, so that we can distinguish EL1/EL2 with VHE, etc.
>

OK, I'll modify in next version.

>> +
>>  	/*
>>  	 * In firmware first mode, we could assume firmware will only generate one
>>  	 * of cper records at a time. There is no risk for one cpu to parse ghes table.
>> @@ -653,9 +681,31 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
>>  		this_cpu_dec(sei_in_process);
>>  	}
>>
>> -	die("Oops - bad mode", regs, 0);
>> +	if (el == 0 && IS_ENABLED(CONFIG_ARM64_ESB) &&
>
> Please use user_mode(regs), and get rid of the el parameter to this
> function entirely.
>

Will fix.

>> +	    cpus_have_cap(ARM64_HAS_RAS_EXTN)) {
>> +		siginfo_t info;
>> +		void __user *pc = (void __user *)instruction_pointer(regs);
>> +
>> +		if (aet >= ESR_ELx_AET_UEO)
>> +			return;
>
> We need to check the DFSC first, and 0b111 is a reserved value (which
> the ARM ARM doesn't define the recoverability of), so I don't think this
> is correct.
>
> We should probably test the DSFC, then switch on the AET value, so as to
> handle only the cases we are aware of.

Will fix.

>
>> +
>> +		if (aet == ESR_ELx_AET_UEU) {
>> +			info.si_signo = SIGILL;
>> +			info.si_errno = 0;
>> +			info.si_code  = ILL_ILLOPC;
>> +			info.si_addr  = pc;
>
> An unrecoverable error is not necessarily a particular bad instruction,
> so I'm not sure this makes sense.
>

Generally, a SEI is generated when PE consumes an uncorrectable error.
So, may be we could send a SIGBUS instead.

I'm not sure too. Any other suggestion?

> Thanks,
> Mark.
>
> .
>
Xiongfeng Wang April 18, 2017, 1:09 a.m. UTC | #4
Hi Mark,

I have some confusion about the RAS feature when VHE is enabled. Does RAS spec support
the situation when VHE is enabled. When VHE is disabled, the hyperviosr delegates the error
exception to EL1 by setting HCR_EL2.VSE to 1, and this will inject a virtual SEI into OS.
My understanding is that HCR_EL2.VSE is only used to inject a virtual SEI into EL1.
But when VHE is enabled, the host OS will run at EL2. We can't inject a virtual SEI into
host OS. I don't know if RAS spec can handle this situation.

Thanks,
Wang Xiongfeng

On 2017/4/13 18:51, Mark Rutland wrote:
> Hi,
> 
> On Thu, Mar 30, 2017 at 06:31:07PM +0800, Xie XiuQi wrote:
>> diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
>> index f20c64a..22f9c90 100644
>> --- a/arch/arm64/include/asm/esr.h
>> +++ b/arch/arm64/include/asm/esr.h
>> @@ -106,6 +106,20 @@
>>  #define ESR_ELx_AR 		(UL(1) << 14)
>>  #define ESR_ELx_CM 		(UL(1) << 8)
>>  
>> +#define ESR_Elx_DFSC_SEI	(0x11)
> 
> We should probably have a definition for the uncategorized DFSC value,
> too.
> 
> [...]
> 
>> index 43512d4..d8a7306 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -69,7 +69,14 @@
>>  #define BAD_FIQ		2
>>  #define BAD_ERROR	3
>>  
>> +	.arch_extension ras
> 
> Generally, arch_extension is a warning sign that code isn't going to
> work with contemporary assemblers, which we likely need to support.
> 
>> +
>>  	.macro	kernel_entry, el, regsize = 64
>> +#ifdef CONFIG_ARM64_ESB
>> +	.if	\el == 0
>> +	esb
> 
> Here, I think that we'll need to macro this such that we can build with
> existing toolchains.
> 
> e.g. in <asm/assembler.h> we need something like:
> 
> 	#define HINT_IMM_ESB	16
> 
> 	.macro ESB
> 	hint	#HINT_IMM_ESB
> 	.endm
> 
>> +	.endif
>> +#endif
>>  	sub	sp, sp, #S_FRAME_SIZE
>>  	.if	\regsize == 32
>>  	mov	w0, w0				// zero upper 32 bits of x0
>> @@ -208,6 +215,7 @@ alternative_else_nop_endif
>>  #endif
>>  
>>  	.if	\el == 0
>> +	msr	daifset, #0xF			// Set flags
> 
> Elsewhere in head.S we use helpers to fiddle with DAIF bits.
> 
> Please be consistent with that. Add an enable_all macro if we need one.
> 
>>  	ldr	x23, [sp, #S_SP]		// load return stack pointer
>>  	msr	sp_el0, x23
>>  #ifdef CONFIG_ARM64_ERRATUM_845719
>> @@ -226,6 +234,15 @@ alternative_else_nop_endif
>>  
>>  	msr	elr_el1, x21			// set up the return data
>>  	msr	spsr_el1, x22
>> +
>> +#ifdef CONFIG_ARM64_ESB
>> +	.if \el == 0
>> +	esb					// Error Synchronization Barrier
>> +	mrs	x21, disr_el1			// Check for deferred error
> 
> We'll need an <asm/sysreg.h> definition for this register. With that, we
> can use mrs_s here.
> 
>> +	tbnz	x21, #31, el1_sei
>> +	.endif
>> +#endif
>> +
>>  	ldp	x0, x1, [sp, #16 * 0]
>>  	ldp	x2, x3, [sp, #16 * 1]
>>  	ldp	x4, x5, [sp, #16 * 2]
>> @@ -318,7 +335,7 @@ ENTRY(vectors)
>>  	ventry	el1_sync_invalid		// Synchronous EL1t
>>  	ventry	el1_irq_invalid			// IRQ EL1t
>>  	ventry	el1_fiq_invalid			// FIQ EL1t
>> -	ventry	el1_error_invalid		// Error EL1t
>> +	ventry	el1_error			// Error EL1t
>>  
>>  	ventry	el1_sync			// Synchronous EL1h
>>  	ventry	el1_irq				// IRQ EL1h
>> @@ -328,7 +345,7 @@ ENTRY(vectors)
>>  	ventry	el0_sync			// Synchronous 64-bit EL0
>>  	ventry	el0_irq				// IRQ 64-bit EL0
>>  	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
>> -	ventry	el0_error_invalid		// Error 64-bit EL0
>> +	ventry	el0_error			// Error 64-bit EL0
>>  
>>  #ifdef CONFIG_COMPAT
>>  	ventry	el0_sync_compat			// Synchronous 32-bit EL0
>> @@ -508,12 +525,31 @@ el1_preempt:
>>  	ret	x24
>>  #endif
>>  
>> +	.align	6
>> +el1_error:
>> +	kernel_entry 1
>> +el1_sei:
>> +	/*
>> +	 * asynchronous SError interrupt from kernel
>> +	 */
>> +	mov	x0, sp
>> +	mrs	x1, esr_el1
> 
> I don't think this is correct if we branched here from kernel_exit.
> Surely we want the DISR_EL1 value, and ESR_EL1 is unrelated?
> 
>> +	mov	x2, #1				// exception level of SEI generated
>> +	b	do_sei
> 
> You don't need to figure out the EL here. In do_sei() we can determine
> the exception level from the regs (e.g. using user_mode(regs)).
> 
>> +ENDPROC(el1_error)
>> +
>> +
>>  /*
>>   * EL0 mode handlers.
>>   */
>>  	.align	6
>>  el0_sync:
>>  	kernel_entry 0
>> +#ifdef CONFIG_ARM64_ESB
>> +	mrs     x26, disr_el1
>> +	tbnz    x26, #31, el0_sei		// check DISR.A
>> +	msr	daifclr, #0x4			// unmask SEI
>> +#endif
> 
> Why do we duplicate this across the EL0 handlers, rather than making it
> common in the el0 kernel_entry code?
> 
>>  	mrs	x25, esr_el1			// read the syndrome register
>>  	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
>>  	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state
>> @@ -688,8 +724,38 @@ el0_inv:
>>  ENDPROC(el0_sync)
>>  
>>  	.align	6
>> +el0_error:
>> +	kernel_entry 0
>> +el0_sei:
>> +	/*
>> +	 * asynchronous SError interrupt from userspace
>> +	 */
>> +	ct_user_exit
>> +	mov	x0, sp
>> +	mrs	x1, esr_el1
> 
> As with el1_sei, I don't think this is correct if we branched to
> el0_sei. As far as I am aware, ESR_EL1 will contain whatever exception
> we took, and the value we want is in DISR_EL1.
> 
>> +	mov	x2, #0
> 
> This EL parameter can go.
> 
>> +	bl	do_sei
>> +	b	ret_to_user
>> +ENDPROC(el0_error)
>> +
>> +	.align	6
>>  el0_irq:
>>  	kernel_entry 0
>> +#ifdef CONFIG_ARM64_ESB
>> +	mrs     x26, disr_el1
>> +	tbz     x26, #31, el0_irq_naked          // check DISR.A
>> +
>> +	mov	x0, sp
>> +	mrs	x1, esr_el1
>> +	mov	x2, 0
>> +
>> +	/*
>> +	 * The SEI generated at EL0 is not affect this irq context,
>> +	 * so after sei handler, we continue process this irq.
>> +	 */
>> +	bl	do_sei
>> +	msr     daifclr, #0x4                   // unmask SEI
> 
> This rough pattern is duplicated several times across the EL0 entry
> paths. I think it should be made common.
> 
>> +#endif
>>  el0_irq_naked:
>>  	enable_dbg
>>  #ifdef CONFIG_TRACE_IRQFLAGS
>> diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
>> index b6d6727..99be6d8 100644
>> --- a/arch/arm64/kernel/traps.c
>> +++ b/arch/arm64/kernel/traps.c
>> @@ -643,6 +643,34 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
>>  		handler[reason], smp_processor_id(), esr,
>>  		esr_get_class_string(esr));
>>  
>> +	die("Oops - bad mode", regs, 0);
>> +	local_irq_disable();
>> +	panic("bad mode");
>> +}
>> +
>> +static const char *sei_context[] = {
>> +	"userspace",			/* EL0 */
>> +	"kernel",			/* EL1 */
>> +};
> 
> This should go. It's only used in one place, and would be clearer with
> the strings inline. More on that below.
> 
>> +
>> +static const char *sei_severity[] = {
> 
> Please name this for what it actually represents:
> 
> static const char *esr_aet_str[] = {
> 
>> +	[0 ... ESR_ELx_AET_MAX] =	"Unknown",
> 
> For consistency with esr_class_str, please make this:
> 
> 	[0 ... ESR_ELx_AET_MAX] =	"UNRECOGNIZED AET",
> 
> ... which makes it clear that this isn't some AET value which reports an
> "Unknown" status.
> 
>> +	[ESR_ELx_AET_UC]	=	"Uncontainable",
>> +	[ESR_ELx_AET_UEU]	=	"Unrecoverable",
>> +	[ESR_ELx_AET_UEO]	=	"Restartable",
>> +	[ESR_ELx_AET_UER]	=	"Recoverable",
>> +	[ESR_ELx_AET_CE]	=	"Corrected",
>> +};
>> +
>> +DEFINE_PER_CPU(int, sei_in_process);
> 
> A previous patch added definition of this.
> 
>> +asmlinkage void do_sei(struct pt_regs *regs, unsigned int esr, int el)
>> +{
>> +	int aet = ESR_ELx_AET(esr);
> 
> The AET field is only valid when the DFSC is 0b010001, so we need to
> check that before we interpret AET.
> 
>> +	console_verbose();
>> +
>> +	pr_crit("Asynchronous SError interrupt detected on CPU%d, %s, %s\n",
>> +		smp_processor_id(), sei_context[el], sei_severity[aet]);
> 
> We should dump the full ESR_ELx value, regardless of what automated
> decoding we do, so that we have a chance of debugging issues in the
> field.
> 
> It would also be nice to align with how bad_mode reports this today.
> Please make this:
> 
> 	pr_crit("SError detected on CPU%d while in %s mode: code: 0x%08x -- %s\n",
> 		smp_processor_id(), user_mode(regs) ? "user" : "kernel",
> 		esr, esr_aet_str[aet]);
> 
> ... though it might be best to dump the raw SPSR rather than trying to
> say user/kernel, so that we can distinguish EL1/EL2 with VHE, etc.
> 
>> +
>>  	/*
>>  	 * In firmware first mode, we could assume firmware will only generate one
>>  	 * of cper records at a time. There is no risk for one cpu to parse ghes table.
>> @@ -653,9 +681,31 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
>>  		this_cpu_dec(sei_in_process);
>>  	}
>>  
>> -	die("Oops - bad mode", regs, 0);
>> +	if (el == 0 && IS_ENABLED(CONFIG_ARM64_ESB) &&
> 
> Please use user_mode(regs), and get rid of the el parameter to this
> function entirely.
> 
>> +	    cpus_have_cap(ARM64_HAS_RAS_EXTN)) {
>> +		siginfo_t info;
>> +		void __user *pc = (void __user *)instruction_pointer(regs);
>> +
>> +		if (aet >= ESR_ELx_AET_UEO)
>> +			return;
> 
> We need to check the DFSC first, and 0b111 is a reserved value (which
> the ARM ARM doesn't define the recoverability of), so I don't think this
> is correct.
> 
> We should probably test the DSFC, then switch on the AET value, so as to
> handle only the cases we are aware of.
> 
>> +
>> +		if (aet == ESR_ELx_AET_UEU) {
>> +			info.si_signo = SIGILL;
>> +			info.si_errno = 0;
>> +			info.si_code  = ILL_ILLOPC;
>> +			info.si_addr  = pc;
> 
> An unrecoverable error is not necessarily a particular bad instruction,
> so I'm not sure this makes sense.
> 
> Thanks,
> Mark.
> 
> .
>
James Morse April 18, 2017, 10:51 a.m. UTC | #5
Hi Wang Xiongfeng,

On 18/04/17 02:09, Xiongfeng Wang wrote:
> I have some confusion about the RAS feature when VHE is enabled. Does RAS spec support
> the situation when VHE is enabled. When VHE is disabled, the hyperviosr delegates the error
> exception to EL1 by setting HCR_EL2.VSE to 1, and this will inject a virtual SEI into OS.

(The ARM-ARM also requires the HCR_EL2.AMO to be set so that physical SError
 Interrupts are taken to EL2, meaning EL1 can never receive a physical SError)


> My understanding is that HCR_EL2.VSE is only used to inject a virtual SEI into EL1.

... mine too ...

> But when VHE is enabled, the host OS will run at EL2. We can't inject a virtual SEI into
> host OS. I don't know if RAS spec can handle this situation.

The host expects to receive physical SError Interrupts. The ARM-ARM doesn't
describe a way to inject these as they are generated by the CPU.

Am I right in thinking you want this to use SError Interrupts as an APEI
notification? (This isn't a CPU thing so the RAS spec doesn't cover this use)

This is straightforward for the hyper-visor to implement using Virtual SError.
I don't think its not always feasible for the host as Physical SError is routed
to EL3 by SCR_EL3.EA, meaning there is no hardware generated SError that can
reach EL2. Another APEI notification mechanism may be more appropriate.

EL3 may be able to 'fake' an SError by returning into the appropriate EL2 vector
if the exception came from EL{0,1}, or from EL2 and PSTATE.A is clear.
If the SError came from EL2 and the ESR_EL3.IESB bit is set, we can write an
appropriate ESR into DISR.
You cant use SError to cover all the possible RAS exceptions. We already have
this problem using SEI if PSTATE.A was set and the exception was an imprecise
abort from EL2. We can't return to the interrupted context and we can't deliver
an SError to EL2 either.

Setting SCR_EL3.EA allows firmware to handle these ugly corner cases. Notifying
the OS is a separate problem where APEI's SEI may not always be the best choice.


Thanks,

James
Xiongfeng Wang April 19, 2017, 2:37 a.m. UTC | #6
Hi James,

Thanks for your reply.

On 2017/4/18 18:51, James Morse wrote:
> Hi Wang Xiongfeng,
> 
> On 18/04/17 02:09, Xiongfeng Wang wrote:
>> I have some confusion about the RAS feature when VHE is enabled. Does RAS spec support
>> the situation when VHE is enabled. When VHE is disabled, the hyperviosr delegates the error
>> exception to EL1 by setting HCR_EL2.VSE to 1, and this will inject a virtual SEI into OS.
> 
> (The ARM-ARM also requires the HCR_EL2.AMO to be set so that physical SError
>  Interrupts are taken to EL2, meaning EL1 can never receive a physical SError)
> 
> 
>> My understanding is that HCR_EL2.VSE is only used to inject a virtual SEI into EL1.
> 
> ... mine too ...
> 
>> But when VHE is enabled, the host OS will run at EL2. We can't inject a virtual SEI into
>> host OS. I don't know if RAS spec can handle this situation.
> 
> The host expects to receive physical SError Interrupts. The ARM-ARM doesn't
> describe a way to inject these as they are generated by the CPU.
> 
> Am I right in thinking you want this to use SError Interrupts as an APEI
> notification? (This isn't a CPU thing so the RAS spec doesn't cover this use)

Yes, using sei as an APEI notification is one part of my consideration. Another use is for ESB.
RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
describes the SEI recovery process when ESB is implemented.

In this situation, SEI is routed to EL3 (SCR_EL3.EA = 1). When an SEI occurs in EL0 and not been taken immediately,
and then an ESB instruction at SVC entry is executed, SEI is taken to EL3. The ESB at SVC entry is
used for preventing the error propagating from user space to kernel space. The EL3 SEI handler collects
the errors and fills in the APEI table, and then jump to EL2 SEI handler. EL2 SEI handler inject
an vSEI into EL1 by setting HCR_EL2.VSE = 1, so that when returned to OS, an SEI is pending.
Then ESB is executed again, and DISR_EL1.A is set by hardware (2.4.4 ESB and virtual errors), so that
the following process can be executed.

So we want to inject a vSEI into OS, when control is returned from EL3/2 to OS, no matter whether
it is on host OS or guest OS. I don't know if my understanding is right here.
> 
> This is straightforward for the hyper-visor to implement using Virtual SError.
> I don't think its not always feasible for the host as Physical SError is routed
> to EL3 by SCR_EL3.EA, meaning there is no hardware generated SError that can
> reach EL2. Another APEI notification mechanism may be more appropriate.

> 
> EL3 may be able to 'fake' an SError by returning into the appropriate EL2 vector
> if the exception came from EL{0,1}, or from EL2 and PSTATE.A is clear.
> If the SError came from EL2 and the ESR_EL3.IESB bit is set, we can write an
> appropriate ESR into DISR.

Yes, this can work. When VHE is enabled, we can set DISR.A by software, and 'fake'
an SError by returning into the EL2 SEI vector.

> You cant use SError to cover all the possible RAS exceptions. We already have
> this problem using SEI if PSTATE.A was set and the exception was an imprecise
> abort from EL2. We can't return to the interrupted context and we can't deliver
> an SError to EL2 either.

SEI came from EL2 and PSTATE.A is set. Is it the situation where VHE is enabled and CPU is running
in kernel space. If SEI occurs in kernel space, can we just panic or shutdown.
> 
> Setting SCR_EL3.EA allows firmware to handle these ugly corner cases. Notifying
> the OS is a separate problem where APEI's SEI may not always be the best choice.
> 
> 
> Thanks,
> 
> James
> 
> .
> 
Thanks,
Wang Xiongfeng
James Morse April 20, 2017, 8:52 a.m. UTC | #7
Hi Wang Xiongfeng,

On 19/04/17 03:37, Xiongfeng Wang wrote:
> On 2017/4/18 18:51, James Morse wrote:
>> The host expects to receive physical SError Interrupts. The ARM-ARM doesn't
>> describe a way to inject these as they are generated by the CPU.
>>
>> Am I right in thinking you want this to use SError Interrupts as an APEI
>> notification? (This isn't a CPU thing so the RAS spec doesn't cover this use)
> 
> Yes, using sei as an APEI notification is one part of my consideration. Another use is for ESB.
> RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
> describes the SEI recovery process when ESB is implemented.
> 
> In this situation, SEI is routed to EL3 (SCR_EL3.EA = 1). When an SEI occurs in EL0 and not been taken immediately,
> and then an ESB instruction at SVC entry is executed, SEI is taken to EL3. The ESB at SVC entry is
> used for preventing the error propagating from user space to kernel space. The EL3 SEI handler collects

> the errors and fills in the APEI table, and then jump to EL2 SEI handler. EL2 SEI handler inject
> an vSEI into EL1 by setting HCR_EL2.VSE = 1, so that when returned to OS, an SEI is pending.

This step has confused me. How would this work with VHE where the host runs at
EL2 and there is nothing at Host:EL1?
From your description I assume you have some firmware resident at EL2.


> Then ESB is executed again, and DISR_EL1.A is set by hardware (2.4.4 ESB and virtual errors), so that
> the following process can be executed.


> So we want to inject a vSEI into OS, when control is returned from EL3/2 to OS, no matter whether
> it is on host OS or guest OS.

I disagree. With Linux/KVM you can't use Virtual SError to notify the host OS.
The host OS expects to receive Physical SError, these shouldn't be taken to EL2
unless a guest is running. Notifications from firmware that use SEA or SEI
should follow the routing rules in the ARM-ARM, which means they should never
reach a guest OS.

For VHE the host runs at EL2 and sets HCR_EL2.AMO. Any firmware notification
should come from EL3 to the host at EL2. The host may choose to notify the
guest, but this should always go via Qemu.

For non-VHE systems the host runs at EL1 and KVM lives at EL2 to do the world
switch. When KVM is running a guest it sets HCR_EL2.AMO, when it has switched
back to the host it clears it.

Notifications from EL3 that pretend to be SError should route SError to EL2 or
EL1 depending on HCR_EL2.AMO.
When KVM gets a Synchronous External Abort or an SError while a guest is running
it will switch back to the host and tell the handle_exit() code.  Again the host
may choose to notify the guest, but this should always go via Qemu.

The ARM-ARM pseudo code for the routing rules is in
AArch64.TakePhysicalSErrorException(). Firmware injecting fake SError should
behave exactly like this.


Newly appeared in the ARM-ARM is HCR_EL2.TEA, which takes Synchronous External
Aborts to EL2 (if SCR_EL3 hasn't claimed them). We should set/clear this bit
like we do HCR_EL2.AMO if the CPU has the RAS extensions.


>> You cant use SError to cover all the possible RAS exceptions. We already have
>> this problem using SEI if PSTATE.A was set and the exception was an imprecise
>> abort from EL2. We can't return to the interrupted context and we can't deliver
>> an SError to EL2 either.
> 
> SEI came from EL2 and PSTATE.A is set. Is it the situation where VHE is enabled and CPU is running
> in kernel space. If SEI occurs in kernel space, can we just panic or shutdown.

firmware-panic? We can do a little better than that:
If the IESB bit is set in the ESR we can behave as if this were an ESB and have
firmware write an appropriate ESR to DISR_EL1 if PSTATE.A is set and the
exception came from EL2.

Linux should have an ESB in its __switch_to(), I've re-worked the daif masking
in entry.S so that PSTATE.A will always be unmasked here.

Other than these two cases, yes, this CPU really is wrecked. Firmware can power
it off via PSCI, and could notify another CPU about what happened. UEFI's table
250 'Processor Generic Error Section' has a 'Processor ID' field, with a note
that on ARM this is the MPIDR_EL1. Table 260 'ARM Processor Error Section' has
something similar.


Thanks,

James
Xiongfeng Wang April 21, 2017, 10:46 a.m. UTC | #8
Hi XiuQi,

On 2017/3/30 18:31, Xie XiuQi wrote:
> Error Synchronization Barrier (ESB; part of the ARMv8.2 Extensions)
> is used to synchronize Unrecoverable errors. That is, containable errors
> architecturally consumed by the PE and not silently propagated.
> 
> With ESB it is generally possible to isolate an unrecoverable error
> between two ESB instructions. So, it's possible to recovery from
> unrecoverable errors reported by asynchronous SError interrupt.
> 
> If ARMv8.2 RAS Extension is not support, ESB is treated as a NOP.
> 
> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
> Signed-off-by: Wang Xiongfeng <wangxiongfengi2@huawei.com>
> ---
>  arch/arm64/Kconfig           | 16 ++++++++++
>  arch/arm64/include/asm/esr.h | 14 +++++++++
>  arch/arm64/kernel/entry.S    | 70 ++++++++++++++++++++++++++++++++++++++++++--
>  arch/arm64/kernel/traps.c    | 54 ++++++++++++++++++++++++++++++++--
>  4 files changed, 150 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 859a90e..7402175 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -911,6 +911,22 @@ endmenu
>  
>  menu "ARMv8.2 architectural features"
>  
> +config ARM64_ESB
> +	bool "Enable support for Error Synchronization Barrier (ESB)"
> +	default n
> +	help
> +	  Error Synchronization Barrier (ESB; part of the ARMv8.2 Extensions)
> +	  is used to synchronize Unrecoverable errors. That is, containable errors
> +	  architecturally consumed by the PE and not silently propagated.
> +
> +	  Without ESB it is not generally possible to isolate an Unrecoverable
> +	  error because it is not known which instruction generated the error.
> +
> +	  Selecting this option allows inject esb instruction before the exception
> +	  change. If ARMv8.2 RAS Extension is not support, ESB is treated as a NOP.
> +
> +	  Note that ESB instruction can introduce slight overhead, so say N if unsure.
> +
>  config ARM64_UAO
>  	bool "Enable support for User Access Override (UAO)"
>  	default y
> diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
> index f20c64a..22f9c90 100644
> --- a/arch/arm64/include/asm/esr.h
> +++ b/arch/arm64/include/asm/esr.h
> @@ -106,6 +106,20 @@
>  #define ESR_ELx_AR 		(UL(1) << 14)
>  #define ESR_ELx_CM 		(UL(1) << 8)
>  
> +#define ESR_Elx_DFSC_SEI	(0x11)
> +
> +#define ESR_ELx_AET_SHIFT	(10)
> +#define ESR_ELx_AET_MAX		(7)
> +#define ESR_ELx_AET_MASK	(UL(7) << ESR_ELx_AET_SHIFT)
> +#define ESR_ELx_AET(esr)	(((esr) & ESR_ELx_AET_MASK) >> ESR_ELx_AET_SHIFT)
> +
> +#define ESR_ELx_AET_UC		(0)
> +#define ESR_ELx_AET_UEU		(1)
> +#define ESR_ELx_AET_UEO		(2)
> +#define ESR_ELx_AET_UER		(3)
> +#define ESR_ELx_AET_CE		(6)
> +
> +
>  /* ISS field definitions for exceptions taken in to Hyp */
>  #define ESR_ELx_CV		(UL(1) << 24)
>  #define ESR_ELx_COND_SHIFT	(20)
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 43512d4..d8a7306 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -69,7 +69,14 @@
>  #define BAD_FIQ		2
>  #define BAD_ERROR	3
>  
> +	.arch_extension ras
> +
>  	.macro	kernel_entry, el, regsize = 64
because we also want to take SEI exception in kernel space, we may need to unmask SEI in kernel_entry.
> +#ifdef CONFIG_ARM64_ESB
> +	.if	\el == 0
> +	esb
> +	.endif
> +#endif
>  	sub	sp, sp, #S_FRAME_SIZE
>  	.if	\regsize == 32
>  	mov	w0, w0				// zero upper 32 bits of x0
> @@ -208,6 +215,7 @@ alternative_else_nop_endif
>  #endif
>  
>  	.if	\el == 0
> +	msr	daifset, #0xF			// Set flags
>  	ldr	x23, [sp, #S_SP]		// load return stack pointer
>  	msr	sp_el0, x23
>  #ifdef CONFIG_ARM64_ERRATUM_845719
> @@ -226,6 +234,15 @@ alternative_else_nop_endif
>  
>  	msr	elr_el1, x21			// set up the return data
>  	msr	spsr_el1, x22
> +
> +#ifdef CONFIG_ARM64_ESB
> +	.if \el == 0
> +	esb					// Error Synchronization Barrier
> +	mrs	x21, disr_el1			// Check for deferred error
> +	tbnz	x21, #31, el1_sei
> +	.endif
> +#endif
> +
>  	ldp	x0, x1, [sp, #16 * 0]
>  	ldp	x2, x3, [sp, #16 * 1]
>  	ldp	x4, x5, [sp, #16 * 2]
> @@ -318,7 +335,7 @@ ENTRY(vectors)
>  	ventry	el1_sync_invalid		// Synchronous EL1t
>  	ventry	el1_irq_invalid			// IRQ EL1t
>  	ventry	el1_fiq_invalid			// FIQ EL1t
> -	ventry	el1_error_invalid		// Error EL1t
> +	ventry	el1_error			// Error EL1t
>  
>  	ventry	el1_sync			// Synchronous EL1h
>  	ventry	el1_irq				// IRQ EL1h
> @@ -328,7 +345,7 @@ ENTRY(vectors)
>  	ventry	el0_sync			// Synchronous 64-bit EL0
>  	ventry	el0_irq				// IRQ 64-bit EL0
>  	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
> -	ventry	el0_error_invalid		// Error 64-bit EL0
> +	ventry	el0_error			// Error 64-bit EL0

for the situation 'Current EL with SPx', we also need to change the sError vector to el1_error.
>  
>  #ifdef CONFIG_COMPAT
>  	ventry	el0_sync_compat			// Synchronous 32-bit EL0
> @@ -508,12 +525,31 @@ el1_preempt:
>  	ret	x24
>  #endif
>  
> +	.align	6
> +el1_error:
> +	kernel_entry 1
> +el1_sei:
> +	/*
> +	 * asynchronous SError interrupt from kernel
> +	 */
> +	mov	x0, sp
> +	mrs	x1, esr_el1
> +	mov	x2, #1				// exception level of SEI generated
> +	b	do_sei
> +ENDPROC(el1_error)
> +
> +


Thanks,

Wang Xiongfeng
Xiongfeng Wang April 21, 2017, 11:33 a.m. UTC | #9
Hi James,

Thanks for your reply.

On 2017/4/20 16:52, James Morse wrote:
> Hi Wang Xiongfeng,
> 
> On 19/04/17 03:37, Xiongfeng Wang wrote:
>> On 2017/4/18 18:51, James Morse wrote:
>>> The host expects to receive physical SError Interrupts. The ARM-ARM doesn't
>>> describe a way to inject these as they are generated by the CPU.
>>>
>>> Am I right in thinking you want this to use SError Interrupts as an APEI
>>> notification? (This isn't a CPU thing so the RAS spec doesn't cover this use)
>>
>> Yes, using sei as an APEI notification is one part of my consideration. Another use is for ESB.
>> RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
>> describes the SEI recovery process when ESB is implemented.
>>
>> In this situation, SEI is routed to EL3 (SCR_EL3.EA = 1). When an SEI occurs in EL0 and not been taken immediately,
>> and then an ESB instruction at SVC entry is executed, SEI is taken to EL3. The ESB at SVC entry is
>> used for preventing the error propagating from user space to kernel space. The EL3 SEI handler collects
> 
>> the errors and fills in the APEI table, and then jump to EL2 SEI handler. EL2 SEI handler inject
>> an vSEI into EL1 by setting HCR_EL2.VSE = 1, so that when returned to OS, an SEI is pending.
> 
> This step has confused me. How would this work with VHE where the host runs at
> EL2 and there is nothing at Host:EL1?

RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
I don't know whether the step described in the section is designed for guest os or host os or both.
Yes, it doesn't work with VHE where the host runs at EL2.

>>From your description I assume you have some firmware resident at EL2.

Our actual SEI step is planned as follows:
Host OS:  EL0/EL1 -> EL3 -> EL0/EL1
Guest OS:  EL0/EL1 -> EL3 -> EL2 -> EL0/EL1

Our problem is that, when returning to EL0/EL1, whether we should jump to EL1 SEI vector or just return to where the SEI is taken from?
In guest os situation, we can inject an vSEI and return to where the SEI is taken from.
But in host os situation, we can't inject an vSEI (if we don't set HCR_EL2.AMO), so we have to jump to EL1 SEI vector.
Then the process following ESB won't be executed becuase SEI is taken to EL3 from the ESB instruction in EL1, and when control
is returned to OS, we are in EL1 SEI vector rather than the ESB instruction.

It is ok to just ignore the process following the ESB instruction in el0_sync, because the process will be sent SIGBUS signal.
But for el0_irq, the irq process following the ESB may should be executed because the irq is not related to the user process.

If we set HCR_EL2.AMO when SCR_EL3.EA is set. We can still inject an vSEI into host OS in EL3.
Physical SError won't be taken to EL2 because SCR_EL3.EA is set. But this may be too racy is
not consistent with ARM rules.
> 
> 
>> Then ESB is executed again, and DISR_EL1.A is set by hardware (2.4.4 ESB and virtual errors), so that
>> the following process can be executed.
> 
> 
>> So we want to inject a vSEI into OS, when control is returned from EL3/2 to OS, no matter whether
>> it is on host OS or guest OS.
> 
> I disagree. With Linux/KVM you can't use Virtual SError to notify the host OS.
> The host OS expects to receive Physical SError, these shouldn't be taken to EL2
> unless a guest is running. Notifications from firmware that use SEA or SEI
> should follow the routing rules in the ARM-ARM, which means they should never
> reach a guest OS.

Yes, I agree. When running the host os, exception should not be taken to EL2, because
EL2 SEI vector always suppose that exception is taken from guest os and save
guest context in the first place.
> 
> For VHE the host runs at EL2 and sets HCR_EL2.AMO. Any firmware notification
> should come from EL3 to the host at EL2. The host may choose to notify the
> guest, but this should always go via Qemu.
> 
> For non-VHE systems the host runs at EL1 and KVM lives at EL2 to do the world
> switch. When KVM is running a guest it sets HCR_EL2.AMO, when it has switched
> back to the host it clears it.
> 
> Notifications from EL3 that pretend to be SError should route SError to EL2 or
> EL1 depending on HCR_EL2.AMO.
> When KVM gets a Synchronous External Abort or an SError while a guest is running
> it will switch back to the host and tell the handle_exit() code.  Again the host
> may choose to notify the guest, but this should always go via Qemu.
> 
> The ARM-ARM pseudo code for the routing rules is in
> AArch64.TakePhysicalSErrorException(). Firmware injecting fake SError should
> behave exactly like this.
> 
> 
> Newly appeared in the ARM-ARM is HCR_EL2.TEA, which takes Synchronous External
> Aborts to EL2 (if SCR_EL3 hasn't claimed them). We should set/clear this bit
> like we do HCR_EL2.AMO if the CPU has the RAS extensions.
> 
> 
>>> You cant use SError to cover all the possible RAS exceptions. We already have
>>> this problem using SEI if PSTATE.A was set and the exception was an imprecise
>>> abort from EL2. We can't return to the interrupted context and we can't deliver
>>> an SError to EL2 either.
>>
>> SEI came from EL2 and PSTATE.A is set. Is it the situation where VHE is enabled and CPU is running
>> in kernel space. If SEI occurs in kernel space, can we just panic or shutdown.
> 
> firmware-panic? We can do a little better than that:
> If the IESB bit is set in the ESR we can behave as if this were an ESB and have
> firmware write an appropriate ESR to DISR_EL1 if PSTATE.A is set and the
> exception came from EL2.

This method may solve the problem I said above.
Is IESB a new bit added int ESR in the newest RAS spec?
> 
> Linux should have an ESB in its __switch_to(), I've re-worked the daif masking
> in entry.S so that PSTATE.A will always be unmasked here.
> 
> Other than these two cases, yes, this CPU really is wrecked. Firmware can power
> it off via PSCI, and could notify another CPU about what happened. UEFI's table
> 250 'Processor Generic Error Section' has a 'Processor ID' field, with a note
> that on ARM this is the MPIDR_EL1. Table 260 'ARM Processor Error Section' has
> something similar.
> 
> 
> Thanks,
> 
> James
> 
> .
> 
Thanks,

Wang Xiongfeng
James Morse April 24, 2017, 5:14 p.m. UTC | #10
Hi Wang Xiongfeng,

On 21/04/17 12:33, Xiongfeng Wang wrote:
> On 2017/4/20 16:52, James Morse wrote:
>> On 19/04/17 03:37, Xiongfeng Wang wrote:
>>> On 2017/4/18 18:51, James Morse wrote:
>>>> The host expects to receive physical SError Interrupts. The ARM-ARM doesn't
>>>> describe a way to inject these as they are generated by the CPU.
>>>>
>>>> Am I right in thinking you want this to use SError Interrupts as an APEI
>>>> notification? (This isn't a CPU thing so the RAS spec doesn't cover this use)
>>>
>>> Yes, using sei as an APEI notification is one part of my consideration. Another use is for ESB.
>>> RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
>>> describes the SEI recovery process when ESB is implemented.
>>>
>>> In this situation, SEI is routed to EL3 (SCR_EL3.EA = 1). When an SEI occurs in EL0 and not been taken immediately,
>>> and then an ESB instruction at SVC entry is executed, SEI is taken to EL3. The ESB at SVC entry is
>>> used for preventing the error propagating from user space to kernel space. The EL3 SEI handler collects
>>
>>> the errors and fills in the APEI table, and then jump to EL2 SEI handler. EL2 SEI handler inject
>>> an vSEI into EL1 by setting HCR_EL2.VSE = 1, so that when returned to OS, an SEI is pending.
>>
>> This step has confused me. How would this work with VHE where the host runs at
>> EL2 and there is nothing at Host:EL1?
> 
> RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
> I don't know whether the step described in the section is designed for guest os or host os or both.
> Yes, it doesn't work with VHE where the host runs at EL2.

If it uses Virtual SError, it must be for an OS running at EL1, as the code
running at EL2 (VHE:Linux, firmware or Xen) must set HCR_EL2.AMO to make this work.

I can't find the section you describe in this RAS spec:
https://developer.arm.com/docs/ddi0587/latest/armreliabilityavailabilityandserviceabilityrasspecificationarmv8forthearmv8aarchitectureprofile


>> >From your description I assume you have some firmware resident at EL2.

> Our actual SEI step is planned as follows:

Ah, okay. You can't use Virtual SError at all then, as firmware doesn't own EL2:
KVM does. KVM will save/restore the HCR_EL2 register as part of its world
switch. Firmware must not modify the hyp registers behind the hyper-visors back.


> Host OS:  EL0/EL1 -> EL3 -> EL0/EL1

i.e. While the host was running, so EL3 sees HCR_EL2.AMO clear, and directs the
SEI to EL1.

> Guest OS:  EL0/EL1 -> EL3 -> EL2 -> EL0/EL1

i.e. While the guest was running, so EL3 sees HCR_EL2.AMO set, directs the SEI
to EL2 where KVM performs the world-switch and returns to host EL1.

Looks sensible.


> In guest os situation, we can inject an vSEI and return to where the SEI is taken from.

An SEI from firmware should always end up in the host OS. If a guest was running
KVM is responsible for doing the world switch to restore host EL1 and return
there. This should all work today.


> But in host os situation, we can't inject an vSEI (if we don't set HCR_EL2.AMO), so we have to jump to EL1 SEI vector.

Because your firmware is at EL3 you have to check PSTATE.A and jump into the
SError vector in both cases, you just choose if its VBAR_EL2 or VBAR_EL1 based
on HCR_EL2.AMO.


> Then the process following ESB won't be executed becuase SEI is taken to EL3 from the ESB instruction in EL1, and when control
> is returned to OS, we are in EL1 SEI vector rather than the ESB instruction.

Firmware should set ELR_EL1 so that an eret from the EL1 SError vector takes you
back to the ESB instruction that took us to EL3 in the first place.


There is a problem here mixing SError as the CPU->Software notification of RAS
errors and as APEI's SEI Software->Software notification that a firmware-first
error has been handled by EL3.

To elaborate:
The routine in this patch was something like:
1 ESB
2 Read DISR_EL1
3 If set, branch to the SError handler.

If we have firmware-first ESB will generate an SError as PSTATE.A at EL{1,2}
doesn't mask SError for EL3. Firmware can then handle the RAS error. With this
the CPU->Software story is done. Firmware notifying the OS via APEI is a
different problem.

If the error doesn't need to be notified to the OS via SEI, firmware can return
to step 1 above with DISR_EL1 clear. The OS may be told via another mechanism
such as polling or an irq.

If PSTATE.A was clear, firmware can return to the SError vector. If PSTATE.A was
set and firmware knows this was due to an ESB, it can set DISR_EL1. The problem
is firmware can't know if the SError was due to an ESB.

The only time we are likely to have PSTATE.A masked is when changing exception
level. For this we can rely on IESB, so step 1 isn't necessary in the routine
above. Firmware knows if an SError taken to EL3 is due to an IESB, as the
ESR_EL3.IESB bit will be set, in this case it can write to DISR_EL1 and return.

In Linux we can use IESB for changes in exception level, and ESB with PSTATE.A
clear for any other need. (e.g. __switch_to()). Firmware can notify us using SEI
if we use either of these two patterns. This may not work for other OS (Xen?),
but this is a problem with using SEI as an APEI notification method.

(Some cleanup of our SError unmasking is needed, I have some patches I will post
on a branch shortly)


> It is ok to just ignore the process following the ESB instruction in el0_sync, because the process will be sent SIGBUS signal.

I don't understand. How will Linux know the process caused an error if we
neither take an SError nor read DISR_EL1 after an ESB?


> But for el0_irq, the irq process following the ESB may should be executed because the irq is not related to the user process.

If we exit EL0 for an IRQ, and take an SError due to IESB we know the fault was
due to EL0, not the exception level we entered to process the IRQ.


> If we set HCR_EL2.AMO when SCR_EL3.EA is set. We can still inject an vSEI into host OS in EL3.

(from EL3?) Only if the host OS is running at EL1 and isn't using EL2. Firmware
can't know if we're using EL2 so this can't work.


> Physical SError won't be taken to EL2 because SCR_EL3.EA is set. But this may be too racy is
> not consistent with ARM rules.

EL3 flipping bits in EL2 system registers behind EL2 software's back is going to
have strange side effects. e.g. some Hypervisor may not change HCR_EL2 when
switching VM-A to VM-B as the configuration is the same; now the virtual SError
will be delivered to the wrong VM. (KVM doesn't do this, but Xen might.)


>>>> You cant use SError to cover all the possible RAS exceptions. We already have
>>>> this problem using SEI if PSTATE.A was set and the exception was an imprecise
>>>> abort from EL2. We can't return to the interrupted context and we can't deliver
>>>> an SError to EL2 either.
>>>
>>> SEI came from EL2 and PSTATE.A is set. Is it the situation where VHE is enabled and CPU is running
>>> in kernel space. If SEI occurs in kernel space, can we just panic or shutdown.
>>
>> firmware-panic? We can do a little better than that:
>> If the IESB bit is set in the ESR we can behave as if this were an ESB and have
>> firmware write an appropriate ESR to DISR_EL1 if PSTATE.A is set and the
>> exception came from EL2.
> 
> This method may solve the problem I said above.
> Is IESB a new bit added int ESR in the newest RAS spec?

The latest ARM-ARM (DDI0487B.a) describes it as part of the "RAS Extension in
ARMv8.2" in section 'A1.7.5 The Reliability, Availability, and Serviceability
(RAS) Extension'.

Jumping to 'D7.2.28 ESR_ELx, Exception Syndrome Register (ELx)', then page
D7-2284 for the 'ISS encoding for an SError interrupt', it indicates "the SError
interrupt was synchronized by the implicit [ESB] and taken immediately"


As above, with IESB for changes in exception level, and unmasking PSTATE.A for
any other use of ESB, Linux can make sure that firmware can always notify us of
an APEI event using SEI.

Another OS (or even UEFI) may not do the same, so firmware should be prepared
for ESB to be executed with PSTATE.A masked (i.e, not using IESB). If there is
no wider firmware-policy for this (reboot, notify a remote CPU), then returning
to the faulting location and trying to inject SError later is all we can do.


Thanks,

James
Xiongfeng Wang April 28, 2017, 2:55 a.m. UTC | #11
Hi James,

Thanks for your explanation and suggests.

On 2017/4/25 1:14, James Morse wrote:
> Hi Wang Xiongfeng,
> 
> On 21/04/17 12:33, Xiongfeng Wang wrote:
>> On 2017/4/20 16:52, James Morse wrote:
>>> On 19/04/17 03:37, Xiongfeng Wang wrote:
>>>> On 2017/4/18 18:51, James Morse wrote:
>>>>> The host expects to receive physical SError Interrupts. The ARM-ARM doesn't
>>>>> describe a way to inject these as they are generated by the CPU.
>>>>>
>>>>> Am I right in thinking you want this to use SError Interrupts as an APEI
>>>>> notification? (This isn't a CPU thing so the RAS spec doesn't cover this use)
>>>>
>>>> Yes, using sei as an APEI notification is one part of my consideration. Another use is for ESB.
>>>> RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
>>>> describes the SEI recovery process when ESB is implemented.
>>>>
>>>> In this situation, SEI is routed to EL3 (SCR_EL3.EA = 1). When an SEI occurs in EL0 and not been taken immediately,
>>>> and then an ESB instruction at SVC entry is executed, SEI is taken to EL3. The ESB at SVC entry is
>>>> used for preventing the error propagating from user space to kernel space. The EL3 SEI handler collects
>>>
>>>> the errors and fills in the APEI table, and then jump to EL2 SEI handler. EL2 SEI handler inject
>>>> an vSEI into EL1 by setting HCR_EL2.VSE = 1, so that when returned to OS, an SEI is pending.
>>>
>>> This step has confused me. How would this work with VHE where the host runs at
>>> EL2 and there is nothing at Host:EL1?
>>
>> RAS spec 6.5.3 'Example software sequences: Variant: asynchronous External Abort with ESB'
>> I don't know whether the step described in the section is designed for guest os or host os or both.
>> Yes, it doesn't work with VHE where the host runs at EL2.
> 
> If it uses Virtual SError, it must be for an OS running at EL1, as the code
> running at EL2 (VHE:Linux, firmware or Xen) must set HCR_EL2.AMO to make this work.
> 
> I can't find the section you describe in this RAS spec:
> https://developer.arm.com/docs/ddi0587/latest/armreliabilityavailabilityandserviceabilityrasspecificationarmv8forthearmv8aarchitectureprofile
> 
I got the RAS spec from my company's ARM account. The document number is PRD03-PRDC-010953 .

> 
>>> >From your description I assume you have some firmware resident at EL2.
> 
>> Our actual SEI step is planned as follows:
> 
> Ah, okay. You can't use Virtual SError at all then, as firmware doesn't own EL2:
> KVM does. KVM will save/restore the HCR_EL2 register as part of its world
> switch. Firmware must not modify the hyp registers behind the hyper-visors back.
> 
> 
>> Host OS:  EL0/EL1 -> EL3 -> EL0/EL1
> 
> i.e. While the host was running, so EL3 sees HCR_EL2.AMO clear, and directs the
> SEI to EL1.
> 
>> Guest OS:  EL0/EL1 -> EL3 -> EL2 -> EL0/EL1
> 
> i.e. While the guest was running, so EL3 sees HCR_EL2.AMO set, directs the SEI
> to EL2 where KVM performs the world-switch and returns to host EL1.
> 
> Looks sensible.
> 
> 
>> In guest os situation, we can inject an vSEI and return to where the SEI is taken from.
> 
> An SEI from firmware should always end up in the host OS. If a guest was running
> KVM is responsible for doing the world switch to restore host EL1 and return
> there. This should all work today.
> 
> 
>> But in host os situation, we can't inject an vSEI (if we don't set HCR_EL2.AMO), so we have to jump to EL1 SEI vector.
> 
> Because your firmware is at EL3 you have to check PSTATE.A and jump into the
> SError vector in both cases, you just choose if its VBAR_EL2 or VBAR_EL1 based
> on HCR_EL2.AMO.
> 
> 
>> Then the process following ESB won't be executed becuase SEI is taken to EL3 from the ESB instruction in EL1, and when control
>> is returned to OS, we are in EL1 SEI vector rather than the ESB instruction.
> 
> Firmware should set ELR_EL1 so that an eret from the EL1 SError vector takes you
> back to the ESB instruction that took us to EL3 in the first place.
> 
> 
> There is a problem here mixing SError as the CPU->Software notification of RAS
> errors and as APEI's SEI Software->Software notification that a firmware-first
> error has been handled by EL3.
> 
> To elaborate:
> The routine in this patch was something like:
> 1 ESB
> 2 Read DISR_EL1
> 3 If set, branch to the SError handler.
> 
> If we have firmware-first ESB will generate an SError as PSTATE.A at EL{1,2}
> doesn't mask SError for EL3. Firmware can then handle the RAS error. With this
> the CPU->Software story is done. Firmware notifying the OS via APEI is a
> different problem.
> 
> If the error doesn't need to be notified to the OS via SEI, firmware can return
> to step 1 above with DISR_EL1 clear. The OS may be told via another mechanism
> such as polling or an irq.
> 
> If PSTATE.A was clear, firmware can return to the SError vector. If PSTATE.A was
> set and firmware knows this was due to an ESB, it can set DISR_EL1. The problem
> is firmware can't know if the SError was due to an ESB.

Yes, if implicit ESB is not implemented, we can't know if the sError was due to
an ESB unless we only set PSTATE.A when ESB is executed.
> 
> The only time we are likely to have PSTATE.A masked is when changing exception
> level. For this we can rely on IESB, so step 1 isn't necessary in the routine
> above. Firmware knows if an SError taken to EL3 is due to an IESB, as the
> ESR_EL3.IESB bit will be set, in this case it can write to DISR_EL1 and return.
> 
> In Linux we can use IESB for changes in exception level, and ESB with PSTATE.A
> clear for any other need. (e.g. __switch_to()). Firmware can notify us using SEI
> if we use either of these two patterns. This may not work for other OS (Xen?),
> but this is a problem with using SEI as an APEI notification method.
> 
> (Some cleanup of our SError unmasking is needed, I have some patches I will post
> on a branch shortly)
> 
> 
>> It is ok to just ignore the process following the ESB instruction in el0_sync, because the process will be sent SIGBUS signal.
> 
> I don't understand. How will Linux know the process caused an error if we
> neither take an SError nor read DISR_EL1 after an ESB?
> 
I think there may be some misunderstanding here. The ESB instruction is placed in kernel_entry
of el0_sync and el0_irq. For the el0_sync, such as an syscall from userspace, after ESB is executed,
we check whether DISR.A is set. If it is not set, we go on to process the syscall. If it is set, we
jump to sError vector and then just eret.
But for el0_irq, after ESB is executed, we check whether DISR.A is set. If it is set, we jump to
(using BL) sError vector, process SEI, return back to irq handler and process irq, and then eret.

> 
>> But for el0_irq, the irq process following the ESB may should be executed because the irq is not related to the user process.
> 
> If we exit EL0 for an IRQ, and take an SError due to IESB we know the fault was
> due to EL0, not the exception level we entered to process the IRQ.
> 
> 
>> If we set HCR_EL2.AMO when SCR_EL3.EA is set. We can still inject an vSEI into host OS in EL3.
> 
> (from EL3?) Only if the host OS is running at EL1 and isn't using EL2. Firmware
> can't know if we're using EL2 so this can't work.
> 
> 
>> Physical SError won't be taken to EL2 because SCR_EL3.EA is set. But this may be too racy is
>> not consistent with ARM rules.
> 
> EL3 flipping bits in EL2 system registers behind EL2 software's back is going to
> have strange side effects. e.g. some Hypervisor may not change HCR_EL2 when
> switching VM-A to VM-B as the configuration is the same; now the virtual SError
> will be delivered to the wrong VM. (KVM doesn't do this, but Xen might.)
> 
> 
>>>>> You cant use SError to cover all the possible RAS exceptions. We already have
>>>>> this problem using SEI if PSTATE.A was set and the exception was an imprecise
>>>>> abort from EL2. We can't return to the interrupted context and we can't deliver
>>>>> an SError to EL2 either.
>>>>
>>>> SEI came from EL2 and PSTATE.A is set. Is it the situation where VHE is enabled and CPU is running
>>>> in kernel space. If SEI occurs in kernel space, can we just panic or shutdown.
>>>
>>> firmware-panic? We can do a little better than that:
>>> If the IESB bit is set in the ESR we can behave as if this were an ESB and have
>>> firmware write an appropriate ESR to DISR_EL1 if PSTATE.A is set and the
>>> exception came from EL2.
>>
>> This method may solve the problem I said above.
>> Is IESB a new bit added int ESR in the newest RAS spec?
> 
> The latest ARM-ARM (DDI0487B.a) describes it as part of the "RAS Extension in
> ARMv8.2" in section 'A1.7.5 The Reliability, Availability, and Serviceability
> (RAS) Extension'.
> 
> Jumping to 'D7.2.28 ESR_ELx, Exception Syndrome Register (ELx)', then page
> D7-2284 for the 'ISS encoding for an SError interrupt', it indicates "the SError
> interrupt was synchronized by the implicit [ESB] and taken immediately"
> 
> 
> As above, with IESB for changes in exception level, and unmasking PSTATE.A for
> any other use of ESB, Linux can make sure that firmware can always notify us of
> an APEI event using SEI.
> 
> Another OS (or even UEFI) may not do the same, so firmware should be prepared
> for ESB to be executed with PSTATE.A masked (i.e, not using IESB). If there is
> no wider firmware-policy for this (reboot, notify a remote CPU), then returning
> to the faulting location and trying to inject SError later is all we can do.
> 
> 
> Thanks,
> 
> James
> 
> .
> 
Thanks,
Wang Xiongfeng
James Morse May 8, 2017, 5:27 p.m. UTC | #12
Hi Xiongfeng Wang,

On 28/04/17 03:55, Xiongfeng Wang wrote:
>>> >> It is ok to just ignore the process following the ESB instruction in el0_sync, because the process will be sent SIGBUS signal.
>> > 
>> > I don't understand. How will Linux know the process caused an error if we
>> > neither take an SError nor read DISR_EL1 after an ESB?

> I think there may be some misunderstanding here. The ESB instruction is placed in kernel_entry
> of el0_sync and el0_irq. For the el0_sync, such as an syscall from userspace, after ESB is executed,
> we check whether DISR.A is set. If it is not set, we go on to process the syscall. If it is set, we
> jump to sError vector and then just eret.

Ah, this looks like an early optimisation!

We can't assume that the SError will result in the processing being killed, the
AET bits of the SError ISS Encoding (page D7-2284 of ARM-ARM DDI0487B.a), has a
'corrected' error encoding.
For these I would expect the SError-vector C code to do nothing and return to
where it came from. In this case the syscall should still be run.


Thanks,

James
Xiongfeng Wang May 9, 2017, 2:16 a.m. UTC | #13
Hi James,

On 2017/5/9 1:27, James Morse wrote:
> Hi Xiongfeng Wang,
> 
> On 28/04/17 03:55, Xiongfeng Wang wrote:
>>>>>> It is ok to just ignore the process following the ESB instruction in el0_sync, because the process will be sent SIGBUS signal.
>>>>
>>>> I don't understand. How will Linux know the process caused an error if we
>>>> neither take an SError nor read DISR_EL1 after an ESB?
> 
>> I think there may be some misunderstanding here. The ESB instruction is placed in kernel_entry
>> of el0_sync and el0_irq. For the el0_sync, such as an syscall from userspace, after ESB is executed,
>> we check whether DISR.A is set. If it is not set, we go on to process the syscall. If it is set, we
>> jump to sError vector and then just eret.
> 
> Ah, this looks like an early optimisation!
> 
> We can't assume that the SError will result in the processing being killed, the
> AET bits of the SError ISS Encoding (page D7-2284 of ARM-ARM DDI0487B.a), has a
> 'corrected' error encoding.
> For these I would expect the SError-vector C code to do nothing and return to
> where it came from. In this case the syscall should still be run.
> 
Yes, it makes sense, so we should add a return value for the do_sei handler.

Thanks,
Wang Xiongfeng
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 859a90e..7402175 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -911,6 +911,22 @@  endmenu
 
 menu "ARMv8.2 architectural features"
 
+config ARM64_ESB
+	bool "Enable support for Error Synchronization Barrier (ESB)"
+	default n
+	help
+	  Error Synchronization Barrier (ESB; part of the ARMv8.2 Extensions)
+	  is used to synchronize Unrecoverable errors. That is, containable errors
+	  architecturally consumed by the PE and not silently propagated.
+
+	  Without ESB it is not generally possible to isolate an Unrecoverable
+	  error because it is not known which instruction generated the error.
+
+	  Selecting this option allows inject esb instruction before the exception
+	  change. If ARMv8.2 RAS Extension is not support, ESB is treated as a NOP.
+
+	  Note that ESB instruction can introduce slight overhead, so say N if unsure.
+
 config ARM64_UAO
 	bool "Enable support for User Access Override (UAO)"
 	default y
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index f20c64a..22f9c90 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -106,6 +106,20 @@ 
 #define ESR_ELx_AR 		(UL(1) << 14)
 #define ESR_ELx_CM 		(UL(1) << 8)
 
+#define ESR_Elx_DFSC_SEI	(0x11)
+
+#define ESR_ELx_AET_SHIFT	(10)
+#define ESR_ELx_AET_MAX		(7)
+#define ESR_ELx_AET_MASK	(UL(7) << ESR_ELx_AET_SHIFT)
+#define ESR_ELx_AET(esr)	(((esr) & ESR_ELx_AET_MASK) >> ESR_ELx_AET_SHIFT)
+
+#define ESR_ELx_AET_UC		(0)
+#define ESR_ELx_AET_UEU		(1)
+#define ESR_ELx_AET_UEO		(2)
+#define ESR_ELx_AET_UER		(3)
+#define ESR_ELx_AET_CE		(6)
+
+
 /* ISS field definitions for exceptions taken in to Hyp */
 #define ESR_ELx_CV		(UL(1) << 24)
 #define ESR_ELx_COND_SHIFT	(20)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 43512d4..d8a7306 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -69,7 +69,14 @@ 
 #define BAD_FIQ		2
 #define BAD_ERROR	3
 
+	.arch_extension ras
+
 	.macro	kernel_entry, el, regsize = 64
+#ifdef CONFIG_ARM64_ESB
+	.if	\el == 0
+	esb
+	.endif
+#endif
 	sub	sp, sp, #S_FRAME_SIZE
 	.if	\regsize == 32
 	mov	w0, w0				// zero upper 32 bits of x0
@@ -208,6 +215,7 @@  alternative_else_nop_endif
 #endif
 
 	.if	\el == 0
+	msr	daifset, #0xF			// Set flags
 	ldr	x23, [sp, #S_SP]		// load return stack pointer
 	msr	sp_el0, x23
 #ifdef CONFIG_ARM64_ERRATUM_845719
@@ -226,6 +234,15 @@  alternative_else_nop_endif
 
 	msr	elr_el1, x21			// set up the return data
 	msr	spsr_el1, x22
+
+#ifdef CONFIG_ARM64_ESB
+	.if \el == 0
+	esb					// Error Synchronization Barrier
+	mrs	x21, disr_el1			// Check for deferred error
+	tbnz	x21, #31, el1_sei
+	.endif
+#endif
+
 	ldp	x0, x1, [sp, #16 * 0]
 	ldp	x2, x3, [sp, #16 * 1]
 	ldp	x4, x5, [sp, #16 * 2]
@@ -318,7 +335,7 @@  ENTRY(vectors)
 	ventry	el1_sync_invalid		// Synchronous EL1t
 	ventry	el1_irq_invalid			// IRQ EL1t
 	ventry	el1_fiq_invalid			// FIQ EL1t
-	ventry	el1_error_invalid		// Error EL1t
+	ventry	el1_error			// Error EL1t
 
 	ventry	el1_sync			// Synchronous EL1h
 	ventry	el1_irq				// IRQ EL1h
@@ -328,7 +345,7 @@  ENTRY(vectors)
 	ventry	el0_sync			// Synchronous 64-bit EL0
 	ventry	el0_irq				// IRQ 64-bit EL0
 	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
-	ventry	el0_error_invalid		// Error 64-bit EL0
+	ventry	el0_error			// Error 64-bit EL0
 
 #ifdef CONFIG_COMPAT
 	ventry	el0_sync_compat			// Synchronous 32-bit EL0
@@ -508,12 +525,31 @@  el1_preempt:
 	ret	x24
 #endif
 
+	.align	6
+el1_error:
+	kernel_entry 1
+el1_sei:
+	/*
+	 * asynchronous SError interrupt from kernel
+	 */
+	mov	x0, sp
+	mrs	x1, esr_el1
+	mov	x2, #1				// exception level of SEI generated
+	b	do_sei
+ENDPROC(el1_error)
+
+
 /*
  * EL0 mode handlers.
  */
 	.align	6
 el0_sync:
 	kernel_entry 0
+#ifdef CONFIG_ARM64_ESB
+	mrs     x26, disr_el1
+	tbnz    x26, #31, el0_sei		// check DISR.A
+	msr	daifclr, #0x4			// unmask SEI
+#endif
 	mrs	x25, esr_el1			// read the syndrome register
 	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
 	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state
@@ -688,8 +724,38 @@  el0_inv:
 ENDPROC(el0_sync)
 
 	.align	6
+el0_error:
+	kernel_entry 0
+el0_sei:
+	/*
+	 * asynchronous SError interrupt from userspace
+	 */
+	ct_user_exit
+	mov	x0, sp
+	mrs	x1, esr_el1
+	mov	x2, #0
+	bl	do_sei
+	b	ret_to_user
+ENDPROC(el0_error)
+
+	.align	6
 el0_irq:
 	kernel_entry 0
+#ifdef CONFIG_ARM64_ESB
+	mrs     x26, disr_el1
+	tbz     x26, #31, el0_irq_naked          // check DISR.A
+
+	mov	x0, sp
+	mrs	x1, esr_el1
+	mov	x2, 0
+
+	/*
+	 * The SEI generated at EL0 is not affect this irq context,
+	 * so after sei handler, we continue process this irq.
+	 */
+	bl	do_sei
+	msr     daifclr, #0x4                   // unmask SEI
+#endif
 el0_irq_naked:
 	enable_dbg
 #ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index b6d6727..99be6d8 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -643,6 +643,34 @@  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
 		handler[reason], smp_processor_id(), esr,
 		esr_get_class_string(esr));
 
+	die("Oops - bad mode", regs, 0);
+	local_irq_disable();
+	panic("bad mode");
+}
+
+static const char *sei_context[] = {
+	"userspace",			/* EL0 */
+	"kernel",			/* EL1 */
+};
+
+static const char *sei_severity[] = {
+	[0 ... ESR_ELx_AET_MAX] =	"Unknown",
+	[ESR_ELx_AET_UC]	=	"Uncontainable",
+	[ESR_ELx_AET_UEU]	=	"Unrecoverable",
+	[ESR_ELx_AET_UEO]	=	"Restartable",
+	[ESR_ELx_AET_UER]	=	"Recoverable",
+	[ESR_ELx_AET_CE]	=	"Corrected",
+};
+
+DEFINE_PER_CPU(int, sei_in_process);
+asmlinkage void do_sei(struct pt_regs *regs, unsigned int esr, int el)
+{
+	int aet = ESR_ELx_AET(esr);
+	console_verbose();
+
+	pr_crit("Asynchronous SError interrupt detected on CPU%d, %s, %s\n",
+		smp_processor_id(), sei_context[el], sei_severity[aet]);
+
 	/*
 	 * In firmware first mode, we could assume firmware will only generate one
 	 * of cper records at a time. There is no risk for one cpu to parse ghes table.
@@ -653,9 +681,31 @@  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
 		this_cpu_dec(sei_in_process);
 	}
 
-	die("Oops - bad mode", regs, 0);
+	if (el == 0 && IS_ENABLED(CONFIG_ARM64_ESB) &&
+	    cpus_have_cap(ARM64_HAS_RAS_EXTN)) {
+		siginfo_t info;
+		void __user *pc = (void __user *)instruction_pointer(regs);
+
+		if (aet >= ESR_ELx_AET_UEO)
+			return;
+
+		if (aet == ESR_ELx_AET_UEU) {
+			info.si_signo = SIGILL;
+			info.si_errno = 0;
+			info.si_code  = ILL_ILLOPC;
+			info.si_addr  = pc;
+
+			current->thread.fault_address = 0;
+			current->thread.fault_code = 0;
+
+			force_sig_info(info.si_signo, &info, current);
+
+			return;
+		}
+	}
+
 	local_irq_disable();
-	panic("bad mode");
+	panic("Asynchronous SError interrupt");
 }
 
 /*