diff mbox series

[v6] arm64: Expose FAR_EL1 tag bits in sigcontext

Message ID 20200513180914.50892-1-pcc@google.com (mailing list archive)
State New, archived
Headers show
Series [v6] arm64: Expose FAR_EL1 tag bits in sigcontext | expand

Commit Message

Peter Collingbourne May 13, 2020, 6:09 p.m. UTC
The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools, such as HWASan [1] or future tools
based on the Memory Tagging Extension (MTE), in order to accurately
diagnose memory errors.

We should not stop clearing these bits in the existing fault address fields,
because there may be existing userspace applications that are expecting the tag
bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
(similar to the existing esr_context), and store the tag bits of FAR_EL1 there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v6:
- bring back comment about __reserved[]

v5:
- add padding to fault_addr_top_byte_context in order to ensure the correct
  size and preserve sp alignment

v4:
- expose only the tag bits in the context instead of the entire FAR_EL1
- remove mention of the new context from the sigcontext.__reserved[] note

v3:
- add documentation to tagged-pointers.rst
- update comments in sigcontext.h

v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr

 Documentation/arm64/tagged-pointers.rst  | 17 +++++----
 arch/arm64/include/asm/exception.h       |  2 +-
 arch/arm64/include/asm/processor.h       |  2 +-
 arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
 arch/arm64/kernel/entry-common.c         |  2 --
 arch/arm64/kernel/signal.c               | 22 +++++++++++-
 arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
 7 files changed, 77 insertions(+), 35 deletions(-)

Comments

Dave Martin May 13, 2020, 8:28 p.m. UTC | #1
On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).
> 
> We should not stop clearing these bits in the existing fault address fields,
> because there may be existing userspace applications that are expecting the tag
> bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> 
> [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> 
> Signed-off-by: Peter Collingbourne <pcc@google.com>
> ---
> v6:
> - bring back comment about __reserved[]
> 
> v5:
> - add padding to fault_addr_top_byte_context in order to ensure the correct
>   size and preserve sp alignment
> 
> v4:
> - expose only the tag bits in the context instead of the entire FAR_EL1
> - remove mention of the new context from the sigcontext.__reserved[] note
> 
> v3:
> - add documentation to tagged-pointers.rst
> - update comments in sigcontext.h
> 
> v2:
> - revert changes to hw_breakpoint.c
> - rename set_thread_esr to set_thread_far_esr
> 
>  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
>  arch/arm64/include/asm/exception.h       |  2 +-
>  arch/arm64/include/asm/processor.h       |  2 +-
>  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
>  arch/arm64/kernel/entry-common.c         |  2 --
>  arch/arm64/kernel/signal.c               | 22 +++++++++++-
>  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
>  7 files changed, 77 insertions(+), 35 deletions(-)
> 
> diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> index eab4323609b9..c6e9592a9dea 100644
> --- a/Documentation/arm64/tagged-pointers.rst
> +++ b/Documentation/arm64/tagged-pointers.rst
> @@ -53,12 +53,17 @@ visibility.
>  Preserving tags
>  ---------------
>  
> -Non-zero tags are not preserved when delivering signals. This means that
> -signal handlers in applications making use of tags cannot rely on the
> -tag information for user virtual addresses being maintained for fields
> -inside siginfo_t. One exception to this rule is for signals raised in
> -response to watchpoint debug exceptions, where the tag information will
> -be preserved.
> +Non-zero tags are not preserved in the fault address fields
> +siginfo.si_addr or sigcontext.fault_address when delivering
> +signals. This means that signal handlers in applications making use
> +of tags cannot rely on the tag information for user virtual addresses
> +being maintained in these fields. One exception to this rule is for
> +signals raised in response to watchpoint debug exceptions, where the
> +tag information will be preserved.
> +
> +The fault address tag is preserved in the fault_addr_top_byte field of
> +the signal frame record fault_addr_top_byte_context, which is present
> +for signals raised in response to data aborts and instruction aborts.
>  
>  The architecture prevents the use of a tagged PC, so the upper byte will
>  be set to a sign-extension of bit 55 on exception return.
> diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> index 7a6e81ca23a8..90e772d9b2cd 100644
> --- a/arch/arm64/include/asm/exception.h
> +++ b/arch/arm64/include/asm/exception.h
> @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
>  }
>  
>  asmlinkage void enter_from_user_mode(void);
> -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
>  void do_undefinstr(struct pt_regs *regs);
>  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
>  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> index 240fe5e5b720..63185be29ff9 100644
> --- a/arch/arm64/include/asm/processor.h
> +++ b/arch/arm64/include/asm/processor.h
> @@ -144,7 +144,7 @@ struct thread_struct {
>  	void			*sve_state;	/* SVE registers, if any */
>  	unsigned int		sve_vl;		/* SVE vector length */
>  	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
> -	unsigned long		fault_address;	/* fault info */
> +	unsigned long		fault_address;	/* FAR_EL1 value */
>  	unsigned long		fault_code;	/* ESR_EL1 value */
>  	struct debug_info	debug;		/* debugging */
>  #ifdef CONFIG_ARM64_PTR_AUTH
> diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> index 8b0ebce92427..2a3fe3de899d 100644
> --- a/arch/arm64/include/uapi/asm/sigcontext.h
> +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> @@ -44,11 +44,12 @@ struct sigcontext {
>   *
>   *	0x210		fpsimd_context
>   *	 0x10		esr_context
> + *	 0x10		fault_addr_top_byte_context
>   *	0x8a0		sve_context (vl <= 64) (optional)
>   *	 0x20		extra_context (optional)
>   *	 0x10		terminator (null _aarch64_ctx)
>   *
> - *	0x510		(reserved for future allocation)
> + *	0x500		(reserved for future allocation)
>   *
>   * New records that can exceed this space need to be opt-in for userspace, so
>   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> @@ -94,17 +95,26 @@ struct esr_context {
>  	__u64 esr;
>  };
>  
> +/* Top byte of fault address (normally not exposed via si_addr) */
> +#define FAULT_ADDR_TOP_BYTE_MAGIC	0x46544201
> +
> +struct fault_addr_top_byte_context {
> +	struct _aarch64_ctx head;
> +	__u8 fault_addr_top_byte;
> +	__u8 __reserved[7];
> +};
> +

Nit: the name here is a bit cumbersome (obviously bikeshedding...)


For the rest, some of my comments may be bogus -- I haven't dug into
this stuff for a little while!


Anyway:

Do we really get the whole top byte of the address in the FAR?  If so,
fine, but I'm having trouble finding a clear statement in the
architecture one way or the other.  (I didn't attempt to dive into the
pseudocode.)


Also, since we're burning 16 bytes here, I'd prefer if we make this
extensible.  At present the __reserved[7] is unusable because
userspace has no way to know whether it's valid or not.

Options include an additional flag byte (0 for now), or just making
the whole thing a __u64.  In that case we can leave the top byte bits
in their original positions if we want, but it would be a good idea to
include a flag to say that field is valid at all.  (See comments below
on Synchronous external abort.)

So, say, foo_context->fault_info = (far & (~0ULL << 56)) | TOP_BYTE_VALID.
(with #defines for the bits/fields as appropriate).


>  /*
>   * extra_context: describes extra space in the signal frame for
>   * additional structures that don't fit in sigcontext.__reserved[].
>   *
>   * Note:
>   *
> - * 1) fpsimd_context, esr_context and extra_context must be placed in
> - * sigcontext.__reserved[] if present.  They cannot be placed in the
> - * extra space.  Any other record can be placed either in the extra
> - * space or in sigcontext.__reserved[], unless otherwise specified in
> - * this file.
> + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> + * extra_context must be placed in sigcontext.__reserved[] if present.
> + * They cannot be placed in the extra space.  Any other record can be
> + * placed either in the extra space or in sigcontext.__reserved[],
> + * unless otherwise specified in this file.
>   *
>   * 2) There must not be more than one extra_context.
>   *
> diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> index c839b5bf1904..045b4f518836 100644
> --- a/arch/arm64/kernel/entry-common.c
> +++ b/arch/arm64/kernel/entry-common.c
> @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
>  	unsigned long far = read_sysreg(far_el1);
>  
>  	local_daif_inherit(regs);
> -	far = untagged_addr(far);
>  	do_mem_abort(far, esr, regs);
>  }
>  NOKPROBE_SYMBOL(el1_abort);
> @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
>  
>  	user_exit_irqoff();
>  	local_daif_restore(DAIF_PROCCTX);
> -	far = untagged_addr(far);
>  	do_mem_abort(far, esr, regs);
>  }
>  NOKPROBE_SYMBOL(el0_da);
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index 339882db5a91..baa88dc02e5c 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
>  
>  	unsigned long fpsimd_offset;
>  	unsigned long esr_offset;
> +	unsigned long ftb_offset;
>  	unsigned long sve_offset;
>  	unsigned long extra_offset;
>  	unsigned long end_offset;
> @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
>  			break;
>  
>  		case ESR_MAGIC:
> +		case FAULT_ADDR_TOP_BYTE_MAGIC:
>  			/* ignore */
>  			break;
>  
> @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
>  				     sizeof(struct esr_context));
>  		if (err)
>  			return err;
> +
> +		err = sigframe_alloc(
> +			user, &user->ftb_offset,
> +			sizeof(struct fault_addr_top_byte_context));

Nit: inconsistent indentation?

(Mostly just because it makes the change look odd against the hunk
context, but not a big deal.)

> +		if (err)
> +			return err;
>  	}
>  
>  	if (system_supports_sve()) {
> @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
>  	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
>  	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
>  
> -	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> +	__put_user_error(untagged_addr(current->thread.fault_address),
> +			 &sf->uc.uc_mcontext.fault_address, err);
>  
>  	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
>  
> @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
>  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
>  	}
>  
> +	if (err == 0 && user->ftb_offset) {
> +		struct fault_addr_top_byte_context __user *ftb_ctx =
> +			apply_user_offset(user, user->ftb_offset);
> +
> +		__put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> +				 &ftb_ctx->head.magic, err);
> +		__put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> +		__put_user_error(current->thread.fault_address >> 56,
> +				 &ftb_ctx->fault_addr_top_byte, err);
> +	}
> +

How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?

For Synchronous external aborts in particular, those bits are documented
as UNKNOWN, but I don't see any special handling.  There may be other
cases I haven't spotted.

For preference we can omit this record entirely if we don't have any
information we can report, but certainly we shouldn't expose UNKNOWN
bits.


[ Aside:

Also, what if we're not reporting a memory abort at all?  Does
thread.fault_address just contain junk from the last fault?  I see
nothing anywhere that cleans this up.  (This is historical and not
your fault, but it would be good to close this down while we're about
it.)


Hmmm, looking at the code I think we probably leak fault_address etc.
across execve() too, so it may even be stale junk from an old process
:/

Maybe I just confused myself. 

End aside. ]


Apart from these issues, the actual code looks reasonable.

Cheers
---Dave


>  	/* Scalable Vector Extension state, if present */
>  	if (system_supports_sve() && err == 0 && user->sve_offset) {
>  		struct sve_context __user *sve_ctx =
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index c9cedc0432d2..39bbaa05f162 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -41,7 +41,7 @@
>  #include <asm/traps.h>
>  
>  struct fault_info {
> -	int	(*fn)(unsigned long addr, unsigned int esr,
> +	int	(*fn)(unsigned long far, unsigned int esr,
>  		      struct pt_regs *regs);
>  	int	sig;
>  	int	code;
> @@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
>  	die_kernel_fault(msg, addr, esr, regs);
>  }
>  
> -static void set_thread_esr(unsigned long address, unsigned int esr)
> +static void set_thread_far_esr(unsigned long far, unsigned int esr)
>  {
> -	current->thread.fault_address = address;
> +	unsigned long addr = untagged_addr(far);
> +
> +	current->thread.fault_address = far;
>  
>  	/*
>  	 * If the faulting address is in the kernel, we must sanitize the ESR.
> @@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
>  	 * type", so we ignore this wrinkle and just return the translation
>  	 * fault.)
>  	 */
> -	if (!is_ttbr0_addr(current->thread.fault_address)) {
> +	if (!is_ttbr0_addr(addr)) {
>  		switch (ESR_ELx_EC(esr)) {
>  		case ESR_ELx_EC_DABT_LOW:
>  			/*
> @@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
>  	current->thread.fault_code = esr;
>  }
>  
> -static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +static void do_bad_area(unsigned long far, unsigned int esr,
> +			struct pt_regs *regs)
>  {
> +	unsigned long addr = untagged_addr(far);
> +
>  	/*
>  	 * If we are in kernel mode at this point, we have no context to
>  	 * handle this fault with.
> @@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
>  	if (user_mode(regs)) {
>  		const struct fault_info *inf = esr_to_fault_info(esr);
>  
> -		set_thread_esr(addr, esr);
> +		set_thread_far_esr(far, esr);
>  		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
>  				      inf->name);
>  	} else {
> @@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
>  	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
>  }
>  
> -static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
> +static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
>  				   struct pt_regs *regs)
>  {
>  	const struct fault_info *inf;
> @@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
>  	vm_fault_t fault, major = 0;
>  	unsigned long vm_flags = VM_ACCESS_FLAGS;
>  	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
> +	unsigned long addr = untagged_addr(far);
>  
>  	if (kprobe_page_fault(regs, esr))
>  		return 0;
> @@ -570,7 +576,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
>  	}
>  
>  	inf = esr_to_fault_info(esr);
> -	set_thread_esr(addr, esr);
> +	set_thread_far_esr(far, esr);
>  	if (fault & VM_FAULT_SIGBUS) {
>  		/*
>  		 * We had some memory, but were unable to successfully fix up
> @@ -605,30 +611,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
>  	return 0;
>  }
>  
> -static int __kprobes do_translation_fault(unsigned long addr,
> +static int __kprobes do_translation_fault(unsigned long far,
>  					  unsigned int esr,
>  					  struct pt_regs *regs)
>  {
> +	unsigned long addr = untagged_addr(far);
> +
>  	if (is_ttbr0_addr(addr))
> -		return do_page_fault(addr, esr, regs);
> +		return do_page_fault(far, esr, regs);
>  
> -	do_bad_area(addr, esr, regs);
> +	do_bad_area(far, esr, regs);
>  	return 0;
>  }
>  
> -static int do_alignment_fault(unsigned long addr, unsigned int esr,
> +static int do_alignment_fault(unsigned long far, unsigned int esr,
>  			      struct pt_regs *regs)
>  {
> -	do_bad_area(addr, esr, regs);
> +	do_bad_area(far, esr, regs);
>  	return 0;
>  }
>  
> -static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
>  {
>  	return 1; /* "fault" */
>  }
>  
> -static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf;
>  	void __user *siaddr;
> @@ -644,7 +652,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
>  	if (esr & ESR_ELx_FnV)
>  		siaddr = NULL;
>  	else
> -		siaddr  = (void __user *)addr;
> +		siaddr  = (void __user *)untagged_addr(far);
>  	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
>  
>  	return 0;
> @@ -717,11 +725,12 @@ static const struct fault_info fault_info[] = {
>  	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
>  };
>  
> -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf = esr_to_fault_info(esr);
> +	unsigned long addr = untagged_addr(far);
>  
> -	if (!inf->fn(addr, esr, regs))
> +	if (!inf->fn(far, esr, regs))
>  		return;
>  
>  	if (!user_mode(regs)) {
> -- 
> 2.26.2.645.ge9eca65c58-goog
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Peter Collingbourne May 15, 2020, 12:58 a.m. UTC | #2
On Wed, May 13, 2020 at 1:28 PM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > the tag bits may be needed by tools in order to accurately diagnose
> > memory errors, such as HWASan [1] or future tools based on the Memory
> > Tagging Extension (MTE).
> >
> > We should not stop clearing these bits in the existing fault address fields,
> > because there may be existing userspace applications that are expecting the tag
> > bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> > (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> >
> > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> >
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > ---
> > v6:
> > - bring back comment about __reserved[]
> >
> > v5:
> > - add padding to fault_addr_top_byte_context in order to ensure the correct
> >   size and preserve sp alignment
> >
> > v4:
> > - expose only the tag bits in the context instead of the entire FAR_EL1
> > - remove mention of the new context from the sigcontext.__reserved[] note
> >
> > v3:
> > - add documentation to tagged-pointers.rst
> > - update comments in sigcontext.h
> >
> > v2:
> > - revert changes to hw_breakpoint.c
> > - rename set_thread_esr to set_thread_far_esr
> >
> >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> >  arch/arm64/include/asm/exception.h       |  2 +-
> >  arch/arm64/include/asm/processor.h       |  2 +-
> >  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
> >  arch/arm64/kernel/entry-common.c         |  2 --
> >  arch/arm64/kernel/signal.c               | 22 +++++++++++-
> >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> >  7 files changed, 77 insertions(+), 35 deletions(-)
> >
> > diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> > index eab4323609b9..c6e9592a9dea 100644
> > --- a/Documentation/arm64/tagged-pointers.rst
> > +++ b/Documentation/arm64/tagged-pointers.rst
> > @@ -53,12 +53,17 @@ visibility.
> >  Preserving tags
> >  ---------------
> >
> > -Non-zero tags are not preserved when delivering signals. This means that
> > -signal handlers in applications making use of tags cannot rely on the
> > -tag information for user virtual addresses being maintained for fields
> > -inside siginfo_t. One exception to this rule is for signals raised in
> > -response to watchpoint debug exceptions, where the tag information will
> > -be preserved.
> > +Non-zero tags are not preserved in the fault address fields
> > +siginfo.si_addr or sigcontext.fault_address when delivering
> > +signals. This means that signal handlers in applications making use
> > +of tags cannot rely on the tag information for user virtual addresses
> > +being maintained in these fields. One exception to this rule is for
> > +signals raised in response to watchpoint debug exceptions, where the
> > +tag information will be preserved.
> > +
> > +The fault address tag is preserved in the fault_addr_top_byte field of
> > +the signal frame record fault_addr_top_byte_context, which is present
> > +for signals raised in response to data aborts and instruction aborts.
> >
> >  The architecture prevents the use of a tagged PC, so the upper byte will
> >  be set to a sign-extension of bit 55 on exception return.
> > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> > index 7a6e81ca23a8..90e772d9b2cd 100644
> > --- a/arch/arm64/include/asm/exception.h
> > +++ b/arch/arm64/include/asm/exception.h
> > @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
> >  }
> >
> >  asmlinkage void enter_from_user_mode(void);
> > -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> > +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
> >  void do_undefinstr(struct pt_regs *regs);
> >  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
> >  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> > diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> > index 240fe5e5b720..63185be29ff9 100644
> > --- a/arch/arm64/include/asm/processor.h
> > +++ b/arch/arm64/include/asm/processor.h
> > @@ -144,7 +144,7 @@ struct thread_struct {
> >       void                    *sve_state;     /* SVE registers, if any */
> >       unsigned int            sve_vl;         /* SVE vector length */
> >       unsigned int            sve_vl_onexec;  /* SVE vl after next exec */
> > -     unsigned long           fault_address;  /* fault info */
> > +     unsigned long           fault_address;  /* FAR_EL1 value */
> >       unsigned long           fault_code;     /* ESR_EL1 value */
> >       struct debug_info       debug;          /* debugging */
> >  #ifdef CONFIG_ARM64_PTR_AUTH
> > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > index 8b0ebce92427..2a3fe3de899d 100644
> > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > @@ -44,11 +44,12 @@ struct sigcontext {
> >   *
> >   *   0x210           fpsimd_context
> >   *    0x10           esr_context
> > + *    0x10           fault_addr_top_byte_context
> >   *   0x8a0           sve_context (vl <= 64) (optional)
> >   *    0x20           extra_context (optional)
> >   *    0x10           terminator (null _aarch64_ctx)
> >   *
> > - *   0x510           (reserved for future allocation)
> > + *   0x500           (reserved for future allocation)
> >   *
> >   * New records that can exceed this space need to be opt-in for userspace, so
> >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > @@ -94,17 +95,26 @@ struct esr_context {
> >       __u64 esr;
> >  };
> >
> > +/* Top byte of fault address (normally not exposed via si_addr) */
> > +#define FAULT_ADDR_TOP_BYTE_MAGIC    0x46544201
> > +
> > +struct fault_addr_top_byte_context {
> > +     struct _aarch64_ctx head;
> > +     __u8 fault_addr_top_byte;
> > +     __u8 __reserved[7];
> > +};
> > +
>
> Nit: the name here is a bit cumbersome (obviously bikeshedding...)
>
>
> For the rest, some of my comments may be bogus -- I haven't dug into
> this stuff for a little while!
>
>
> Anyway:
>
> Do we really get the whole top byte of the address in the FAR?  If so,
> fine, but I'm having trouble finding a clear statement in the
> architecture one way or the other.  (I didn't attempt to dive into the
> pseudocode.)

I rely on this statement in the ARM:

https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/far_el1
"For a Data Abort or Watchpoint exception, if address tagging is
enabled for the address accessed by the data access that caused the
exception, then this field includes the tag."

And note that address tagging here essentially means TBI (which is
always enabled on Linux), and not memory tagging.

> Also, since we're burning 16 bytes here, I'd prefer if we make this
> extensible.  At present the __reserved[7] is unusable because
> userspace has no way to know whether it's valid or not.
>
> Options include an additional flag byte (0 for now), or just making
> the whole thing a __u64.  In that case we can leave the top byte bits
> in their original positions if we want, but it would be a good idea to
> include a flag to say that field is valid at all.  (See comments below
> on Synchronous external abort.)
>
> > So, say, foo_context->fault_info = (far & (~0ULL << 56)) | TOP_BYTE_VALID.
> (with #defines for the bits/fields as appropriate).

The flag bits seem like a good idea. Thinking ahead to the MTE sync
tag fault (which might not provide us with bits 60-63), we may
consider having separate bits to indicate "bits 56-59 valid" and "bits
60-63 valid", set both bits for regular data aborts and only the
former for sync tag faults, which would avoid the need to define a
separate context for these faults. And if a future architecture
revision provides us with bits 60-63 for tag faults, we could start
setting both flag bits even for tag faults.

> >  /*
> >   * extra_context: describes extra space in the signal frame for
> >   * additional structures that don't fit in sigcontext.__reserved[].
> >   *
> >   * Note:
> >   *
> > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > - * extra space.  Any other record can be placed either in the extra
> > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > - * this file.
> > + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> > + * extra_context must be placed in sigcontext.__reserved[] if present.
> > + * They cannot be placed in the extra space.  Any other record can be
> > + * placed either in the extra space or in sigcontext.__reserved[],
> > + * unless otherwise specified in this file.
> >   *
> >   * 2) There must not be more than one extra_context.
> >   *
> > diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> > index c839b5bf1904..045b4f518836 100644
> > --- a/arch/arm64/kernel/entry-common.c
> > +++ b/arch/arm64/kernel/entry-common.c
> > @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
> >       unsigned long far = read_sysreg(far_el1);
> >
> >       local_daif_inherit(regs);
> > -     far = untagged_addr(far);
> >       do_mem_abort(far, esr, regs);
> >  }
> >  NOKPROBE_SYMBOL(el1_abort);
> > @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
> >
> >       user_exit_irqoff();
> >       local_daif_restore(DAIF_PROCCTX);
> > -     far = untagged_addr(far);
> >       do_mem_abort(far, esr, regs);
> >  }
> >  NOKPROBE_SYMBOL(el0_da);
> > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > index 339882db5a91..baa88dc02e5c 100644
> > --- a/arch/arm64/kernel/signal.c
> > +++ b/arch/arm64/kernel/signal.c
> > @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
> >
> >       unsigned long fpsimd_offset;
> >       unsigned long esr_offset;
> > +     unsigned long ftb_offset;
> >       unsigned long sve_offset;
> >       unsigned long extra_offset;
> >       unsigned long end_offset;
> > @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
> >                       break;
> >
> >               case ESR_MAGIC:
> > +             case FAULT_ADDR_TOP_BYTE_MAGIC:
> >                       /* ignore */
> >                       break;
> >
> > @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> >                                    sizeof(struct esr_context));
> >               if (err)
> >                       return err;
> > +
> > +             err = sigframe_alloc(
> > +                     user, &user->ftb_offset,
> > +                     sizeof(struct fault_addr_top_byte_context));
>
> Nit: inconsistent indentation?
>
> (Mostly just because it makes the change look odd against the hunk
> context, but not a big deal.)

With consistent indentation we violate 80 cols due to the extra long
struct name. The indentation is what clang-format is giving me.

> > +             if (err)
> > +                     return err;
> >       }
> >
> >       if (system_supports_sve()) {
> > @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> >       __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
> >       __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
> >
> > -     __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> > +     __put_user_error(untagged_addr(current->thread.fault_address),
> > +                      &sf->uc.uc_mcontext.fault_address, err);
> >
> >       err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
> >
> > @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> >               __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> >       }
> >
> > +     if (err == 0 && user->ftb_offset) {
> > +             struct fault_addr_top_byte_context __user *ftb_ctx =
> > +                     apply_user_offset(user, user->ftb_offset);
> > +
> > +             __put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> > +                              &ftb_ctx->head.magic, err);
> > +             __put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> > +             __put_user_error(current->thread.fault_address >> 56,
> > +                              &ftb_ctx->fault_addr_top_byte, err);
> > +     }
> > +
>
> How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?
>
> For Synchronous external aborts in particular, those bits are documented
> as UNKNOWN, but I don't see any special handling.  There may be other
> cases I haven't spotted.
>
> For preference we can omit this record entirely if we don't have any
> information we can report, but certainly we shouldn't expose UNKNOWN
> bits.

In this case we mask out the top byte in do_sea before passing the
address to arm64_notify_die (which clears fault_address and passes the
address argument on to arm64_force_sig_fault to be exposed via
si_addr). So the record would always contain a 0 byte. It seems
reasonable to omit the record in this case instead.

> [ Aside:
>
> Also, what if we're not reporting a memory abort at all?  Does
> thread.fault_address just contain junk from the last fault?  I see
> nothing anywhere that cleans this up.  (This is historical and not
> your fault, but it would be good to close this down while we're about
> it.)
>
>
> Hmmm, looking at the code I think we probably leak fault_address etc.
> across execve() too, so it may even be stale junk from an old process
> :/
>
> Maybe I just confused myself.
>
> End aside. ]

Yes, it's unclear whether we always manage to not expose a fault
address if we're not reporting a data or instruction abort. The code
would need to arrange for fault_code to be set to 0 in order to avoid
exposing previous fault_address values via future signals. I don't see
anywhere where we're resetting these fields after delivering a signal,
so it seems possible to expose a stale value by calling
arm64_force_sig_fault without first setting fault_code (most callers
do set it, but the calls in
arch/arm64/kernel/debug-monitors.c and arch/arm64/kernel/ptrace.c seem
not to), or simply by calling force_sig_fault (which happens in many
places throughout the kernel).

Maybe something like this would do the trick? (Untested, and forgive
spaces instead of tabs, grumble grumble gmail):

diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index baa88dc02e5c..5867f2fdbe64 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -648,6 +648,7 @@ static int setup_sigframe(struct
rt_sigframe_user_layout *user,
                __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
                __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
                __put_user_error(current->thread.fault_code,
&esr_ctx->esr, err);
+               current->thread.fault_code = 0;
        }

        if (err == 0 && user->ftb_offset) {

> Apart from these issues, the actual code looks reasonable.

Thanks for the review.

Peter
Dave Martin May 18, 2020, 9:53 a.m. UTC | #3
On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> On Wed, May 13, 2020 at 1:28 PM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > the tag bits may be needed by tools in order to accurately diagnose
> > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > Tagging Extension (MTE).
> > >
> > > We should not stop clearing these bits in the existing fault address fields,
> > > because there may be existing userspace applications that are expecting the tag
> > > bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> > > (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> > >
> > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > >
> > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > ---
> > > v6:
> > > - bring back comment about __reserved[]
> > >
> > > v5:
> > > - add padding to fault_addr_top_byte_context in order to ensure the correct
> > >   size and preserve sp alignment
> > >
> > > v4:
> > > - expose only the tag bits in the context instead of the entire FAR_EL1
> > > - remove mention of the new context from the sigcontext.__reserved[] note
> > >
> > > v3:
> > > - add documentation to tagged-pointers.rst
> > > - update comments in sigcontext.h
> > >
> > > v2:
> > > - revert changes to hw_breakpoint.c
> > > - rename set_thread_esr to set_thread_far_esr
> > >
> > >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> > >  arch/arm64/include/asm/exception.h       |  2 +-
> > >  arch/arm64/include/asm/processor.h       |  2 +-
> > >  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
> > >  arch/arm64/kernel/entry-common.c         |  2 --
> > >  arch/arm64/kernel/signal.c               | 22 +++++++++++-
> > >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> > >  7 files changed, 77 insertions(+), 35 deletions(-)
> > >
> > > diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> > > index eab4323609b9..c6e9592a9dea 100644
> > > --- a/Documentation/arm64/tagged-pointers.rst
> > > +++ b/Documentation/arm64/tagged-pointers.rst
> > > @@ -53,12 +53,17 @@ visibility.
> > >  Preserving tags
> > >  ---------------
> > >
> > > -Non-zero tags are not preserved when delivering signals. This means that
> > > -signal handlers in applications making use of tags cannot rely on the
> > > -tag information for user virtual addresses being maintained for fields
> > > -inside siginfo_t. One exception to this rule is for signals raised in
> > > -response to watchpoint debug exceptions, where the tag information will
> > > -be preserved.
> > > +Non-zero tags are not preserved in the fault address fields
> > > +siginfo.si_addr or sigcontext.fault_address when delivering
> > > +signals. This means that signal handlers in applications making use
> > > +of tags cannot rely on the tag information for user virtual addresses
> > > +being maintained in these fields. One exception to this rule is for
> > > +signals raised in response to watchpoint debug exceptions, where the
> > > +tag information will be preserved.
> > > +
> > > +The fault address tag is preserved in the fault_addr_top_byte field of
> > > +the signal frame record fault_addr_top_byte_context, which is present
> > > +for signals raised in response to data aborts and instruction aborts.
> > >
> > >  The architecture prevents the use of a tagged PC, so the upper byte will
> > >  be set to a sign-extension of bit 55 on exception return.
> > > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> > > index 7a6e81ca23a8..90e772d9b2cd 100644
> > > --- a/arch/arm64/include/asm/exception.h
> > > +++ b/arch/arm64/include/asm/exception.h
> > > @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
> > >  }
> > >
> > >  asmlinkage void enter_from_user_mode(void);
> > > -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> > > +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
> > >  void do_undefinstr(struct pt_regs *regs);
> > >  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
> > >  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> > > diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> > > index 240fe5e5b720..63185be29ff9 100644
> > > --- a/arch/arm64/include/asm/processor.h
> > > +++ b/arch/arm64/include/asm/processor.h
> > > @@ -144,7 +144,7 @@ struct thread_struct {
> > >       void                    *sve_state;     /* SVE registers, if any */
> > >       unsigned int            sve_vl;         /* SVE vector length */
> > >       unsigned int            sve_vl_onexec;  /* SVE vl after next exec */
> > > -     unsigned long           fault_address;  /* fault info */
> > > +     unsigned long           fault_address;  /* FAR_EL1 value */
> > >       unsigned long           fault_code;     /* ESR_EL1 value */
> > >       struct debug_info       debug;          /* debugging */
> > >  #ifdef CONFIG_ARM64_PTR_AUTH
> > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > index 8b0ebce92427..2a3fe3de899d 100644
> > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > @@ -44,11 +44,12 @@ struct sigcontext {
> > >   *
> > >   *   0x210           fpsimd_context
> > >   *    0x10           esr_context
> > > + *    0x10           fault_addr_top_byte_context
> > >   *   0x8a0           sve_context (vl <= 64) (optional)
> > >   *    0x20           extra_context (optional)
> > >   *    0x10           terminator (null _aarch64_ctx)
> > >   *
> > > - *   0x510           (reserved for future allocation)
> > > + *   0x500           (reserved for future allocation)
> > >   *
> > >   * New records that can exceed this space need to be opt-in for userspace, so
> > >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > > @@ -94,17 +95,26 @@ struct esr_context {
> > >       __u64 esr;
> > >  };
> > >
> > > +/* Top byte of fault address (normally not exposed via si_addr) */
> > > +#define FAULT_ADDR_TOP_BYTE_MAGIC    0x46544201
> > > +
> > > +struct fault_addr_top_byte_context {
> > > +     struct _aarch64_ctx head;
> > > +     __u8 fault_addr_top_byte;
> > > +     __u8 __reserved[7];
> > > +};
> > > +
> >
> > Nit: the name here is a bit cumbersome (obviously bikeshedding...)
> >
> >
> > For the rest, some of my comments may be bogus -- I haven't dug into
> > this stuff for a little while!
> >
> >
> > Anyway:
> >
> > Do we really get the whole top byte of the address in the FAR?  If so,
> > fine, but I'm having trouble finding a clear statement in the
> > architecture one way or the other.  (I didn't attempt to dive into the
> > pseudocode.)
> 
> I rely on this statement in the ARM:
> 
> https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/far_el1
> "For a Data Abort or Watchpoint exception, if address tagging is
> enabled for the address accessed by the data access that caused the
> exception, then this field includes the tag."

Yes, I think that covers it.  I hadn't found a clear definition of
"tag", but I think the TBI mechanism makes it "reasonably obvious" that
the non-address (i.e., tag) bits are [63:56].

> And note that address tagging here essentially means TBI (which is
> always enabled on Linux), and not memory tagging.
> 
> > Also, since we're burning 16 bytes here, I'd prefer if we make this
> > extensible.  At present the __reserved[7] is unusable because
> > userspace has no way to know whether it's valid or not.
> >
> > Options include an additional flag byte (0 for now), or just making
> > the whole thing a __u64.  In that case we can leave the top byte bits
> > in their original positions if we want, but it would be a good idea to
> > include a flag to say that field is valid at all.  (See comments below
> > on Synchronous external abort.)
> >
> > So, say, foo_context->fault_info = (esr & (~0ULL << 56)) | TOP_BYTE_VALID.
> > (with #defines for the bits/fields as appropriate).
> 
> The flag bits seem like a good idea. Thinking ahead to the MTE sync
> tag fault (which might not provide us with bits 60-63), we may
> consider having separate bits to indicate "bits 56-59 valid" and "bits
> 60-63 valid", set both bits for regular data aborts and only the
> former for sync tag faults, which would avoid the need to define a
> separate context for these faults. And if a future architecture
> revision provides us with bits 60-63 for tag faults, we could start
> setting both flag bits even for tag faults.

Seems reasonable, but a "tag mask" field of some sort might be
preferable to hard-wiring, just in case a future update to MTE supports
more than 4 bits.

> > >  /*
> > >   * extra_context: describes extra space in the signal frame for
> > >   * additional structures that don't fit in sigcontext.__reserved[].
> > >   *
> > >   * Note:
> > >   *
> > > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > > - * extra space.  Any other record can be placed either in the extra
> > > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > > - * this file.
> > > + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> > > + * extra_context must be placed in sigcontext.__reserved[] if present.
> > > + * They cannot be placed in the extra space.  Any other record can be
> > > + * placed either in the extra space or in sigcontext.__reserved[],
> > > + * unless otherwise specified in this file.
> > >   *
> > >   * 2) There must not be more than one extra_context.
> > >   *
> > > diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> > > index c839b5bf1904..045b4f518836 100644
> > > --- a/arch/arm64/kernel/entry-common.c
> > > +++ b/arch/arm64/kernel/entry-common.c
> > > @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
> > >       unsigned long far = read_sysreg(far_el1);
> > >
> > >       local_daif_inherit(regs);
> > > -     far = untagged_addr(far);
> > >       do_mem_abort(far, esr, regs);
> > >  }
> > >  NOKPROBE_SYMBOL(el1_abort);
> > > @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
> > >
> > >       user_exit_irqoff();
> > >       local_daif_restore(DAIF_PROCCTX);
> > > -     far = untagged_addr(far);
> > >       do_mem_abort(far, esr, regs);
> > >  }
> > >  NOKPROBE_SYMBOL(el0_da);
> > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > index 339882db5a91..baa88dc02e5c 100644
> > > --- a/arch/arm64/kernel/signal.c
> > > +++ b/arch/arm64/kernel/signal.c
> > > @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
> > >
> > >       unsigned long fpsimd_offset;
> > >       unsigned long esr_offset;
> > > +     unsigned long ftb_offset;
> > >       unsigned long sve_offset;
> > >       unsigned long extra_offset;
> > >       unsigned long end_offset;
> > > @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
> > >                       break;
> > >
> > >               case ESR_MAGIC:
> > > +             case FAULT_ADDR_TOP_BYTE_MAGIC:
> > >                       /* ignore */
> > >                       break;
> > >
> > > @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > >                                    sizeof(struct esr_context));
> > >               if (err)
> > >                       return err;
> > > +
> > > +             err = sigframe_alloc(
> > > +                     user, &user->ftb_offset,
> > > +                     sizeof(struct fault_addr_top_byte_context));
> >
> > Nit: inconsistent indentation?
> >
> > (Mostly just because it makes the change look odd against the hunk
> > context, but not a big deal.)
> 
> With consistent indentation we violate 80 cols due to the extra long
> struct name. The indentation is what clang-format is giving me.

I suspected that might be why.  Fair enough (though a shorter name would
be no bad thing, it's not worth changing that just for nicer indentation).

> 
> > > +             if (err)
> > > +                     return err;
> > >       }
> > >
> > >       if (system_supports_sve()) {
> > > @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > >       __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
> > >       __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
> > >
> > > -     __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> > > +     __put_user_error(untagged_addr(current->thread.fault_address),
> > > +                      &sf->uc.uc_mcontext.fault_address, err);
> > >
> > >       err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
> > >
> > > @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > >               __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > >       }
> > >
> > > +     if (err == 0 && user->ftb_offset) {
> > > +             struct fault_addr_top_byte_context __user *ftb_ctx =
> > > +                     apply_user_offset(user, user->ftb_offset);
> > > +
> > > +             __put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> > > +                              &ftb_ctx->head.magic, err);
> > > +             __put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> > > +             __put_user_error(current->thread.fault_address >> 56,
> > > +                              &ftb_ctx->fault_addr_top_byte, err);
> > > +     }
> > > +
> >
> > How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?
> >
> > For Synchronous external aborts in particular, those bits are documented
> > > as UNKNOWN, but I don't see any special handling.  There may be other
> > cases I haven't spotted.
> >
> > For preference we can omit this record entirely if we don't have any
> > information we can report, but certainly we shouldn't expose UNKNOWN
> > bits.
> 
> In this case we mask out the top byte in do_sea before passing the
> address to arm64_notify_die (which clears fault_address and passes the
> address argument on to arm64_force_sig_fault to be exposed via
> si_addr). So the record would always contain a 0 byte. It seems
> reasonable to omit the record in this case instead.

Ah, right.  Missed that.

The record is already omitted when fault_code == 0 IIUC, so perhaps
we're already doing the right thing for synchronous external aborts.

> > [ Aside:
> >
> > Also, what if we're not reporting a memory abort at all?  Does
> > thread.fault_address just contain junk from the last fault?  I see
> > nothing anywhere that cleans this up.  (This is historical and not
> > your fault, but it would be good to close this down while we're about
> > it.)
> >
> >
> > Hmmm, looking at the code I think we probably leak fault_address etc.
> > across execve() too, so it may even be stale junk from an old process
> > :/
> >
> > Maybe I just confused myself.
> >
> > End aside. ]
> 
> Yes, it's unclear whether we always manage to not expose a fault
> address if we're not reporting a data or instruction abort. The code
> would need to arrange for fault_code to be set to 0 in order to avoid
> exposing previous fault_address values via future signals. I don't see
> anywhere where we're resetting these fields after delivering a signal,
> so it seems possible by calling arm64_force_sig_fault without first
> setting fault_code (most callers do this, but the calls in
> arch/arm64/kernel/debug-monitors.c and arch/arm64/kernel/ptrace.c seem
> not to), or simply by calling force_sig_fault (which happens in many
> places throughout the kernel).
> 
> Maybe something like this would do the trick? (Untested, and forgive
> spaces instead of tabs, grumble grumble gmail):
> 
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index baa88dc02e5c..5867f2fdbe64 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> rt_sigframe_user_layout *user,
>                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
>                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
>                 __put_user_error(current->thread.fault_code,
> &esr_ctx->esr, err);
> +               current->thread.fault_code = 0;

Perhaps, but we'd need to be careful.  For example, can we run out of
user stack before this and deliver a SIGSEGV, but with the old
fault_code still set?  Then we'd emit the old fault code with the
new "can't deliver signal" signal, which doesn't make sense.

Stuff may also go wrong with signal prioritisation.

If a higher-priority signal (say SIGINT) comes in after a data abort
enters the kernel but before the resulting SIGSEGV is dequeued for
delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
With your change we'd then have cleared the fault code by the time we
deliver the SIGSEGV it actually relates to, if I've understood right.

Today, I think we just attach that fault code to every signal that's
delivered until something overwrites or resets it, which means that
a signal that needs fault_code gets it, at the expense of attaching
it to a bunch of other random signals too.


Checking the signal number and si_code might help us to know what we
should be doing with fault_code.  We need to be sure userspace can't
trick us with a non kernel generated signal here.  It would also be
necessary to check how PTRACE_SETSIGINFO interacts with this.


Cheers
---Dave
Peter Collingbourne May 19, 2020, 10 p.m. UTC | #4
On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > On Wed, May 13, 2020 at 1:28 PM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> > > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > > the tag bits may be needed by tools in order to accurately diagnose
> > > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > > Tagging Extension (MTE).
> > > >
> > > > We should not stop clearing these bits in the existing fault address fields,
> > > > because there may be existing userspace applications that are expecting the tag
> > > > bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> > > > (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> > > >
> > > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > > >
> > > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > > ---
> > > > v6:
> > > > - bring back comment about __reserved[]
> > > >
> > > > v5:
> > > > - add padding to fault_addr_top_byte_context in order to ensure the correct
> > > >   size and preserve sp alignment
> > > >
> > > > v4:
> > > > - expose only the tag bits in the context instead of the entire FAR_EL1
> > > > - remove mention of the new context from the sigcontext.__reserved[] note
> > > >
> > > > v3:
> > > > - add documentation to tagged-pointers.rst
> > > > - update comments in sigcontext.h
> > > >
> > > > v2:
> > > > - revert changes to hw_breakpoint.c
> > > > - rename set_thread_esr to set_thread_far_esr
> > > >
> > > >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> > > >  arch/arm64/include/asm/exception.h       |  2 +-
> > > >  arch/arm64/include/asm/processor.h       |  2 +-
> > > >  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
> > > >  arch/arm64/kernel/entry-common.c         |  2 --
> > > >  arch/arm64/kernel/signal.c               | 22 +++++++++++-
> > > >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> > > >  7 files changed, 77 insertions(+), 35 deletions(-)
> > > >
> > > > diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> > > > index eab4323609b9..c6e9592a9dea 100644
> > > > --- a/Documentation/arm64/tagged-pointers.rst
> > > > +++ b/Documentation/arm64/tagged-pointers.rst
> > > > @@ -53,12 +53,17 @@ visibility.
> > > >  Preserving tags
> > > >  ---------------
> > > >
> > > > -Non-zero tags are not preserved when delivering signals. This means that
> > > > -signal handlers in applications making use of tags cannot rely on the
> > > > -tag information for user virtual addresses being maintained for fields
> > > > -inside siginfo_t. One exception to this rule is for signals raised in
> > > > -response to watchpoint debug exceptions, where the tag information will
> > > > -be preserved.
> > > > +Non-zero tags are not preserved in the fault address fields
> > > > +siginfo.si_addr or sigcontext.fault_address when delivering
> > > > +signals. This means that signal handlers in applications making use
> > > > +of tags cannot rely on the tag information for user virtual addresses
> > > > +being maintained in these fields. One exception to this rule is for
> > > > +signals raised in response to watchpoint debug exceptions, where the
> > > > +tag information will be preserved.
> > > > +
> > > > +The fault address tag is preserved in the fault_addr_top_byte field of
> > > > +the signal frame record fault_addr_top_byte_context, which is present
> > > > +for signals raised in response to data aborts and instruction aborts.
> > > >
> > > >  The architecture prevents the use of a tagged PC, so the upper byte will
> > > >  be set to a sign-extension of bit 55 on exception return.
> > > > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> > > > index 7a6e81ca23a8..90e772d9b2cd 100644
> > > > --- a/arch/arm64/include/asm/exception.h
> > > > +++ b/arch/arm64/include/asm/exception.h
> > > > @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
> > > >  }
> > > >
> > > >  asmlinkage void enter_from_user_mode(void);
> > > > -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> > > > +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
> > > >  void do_undefinstr(struct pt_regs *regs);
> > > >  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
> > > >  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> > > > diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> > > > index 240fe5e5b720..63185be29ff9 100644
> > > > --- a/arch/arm64/include/asm/processor.h
> > > > +++ b/arch/arm64/include/asm/processor.h
> > > > @@ -144,7 +144,7 @@ struct thread_struct {
> > > >       void                    *sve_state;     /* SVE registers, if any */
> > > >       unsigned int            sve_vl;         /* SVE vector length */
> > > >       unsigned int            sve_vl_onexec;  /* SVE vl after next exec */
> > > > -     unsigned long           fault_address;  /* fault info */
> > > > +     unsigned long           fault_address;  /* FAR_EL1 value */
> > > >       unsigned long           fault_code;     /* ESR_EL1 value */
> > > >       struct debug_info       debug;          /* debugging */
> > > >  #ifdef CONFIG_ARM64_PTR_AUTH
> > > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > index 8b0ebce92427..2a3fe3de899d 100644
> > > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > @@ -44,11 +44,12 @@ struct sigcontext {
> > > >   *
> > > >   *   0x210           fpsimd_context
> > > >   *    0x10           esr_context
> > > > + *    0x10           fault_addr_top_byte_context
> > > >   *   0x8a0           sve_context (vl <= 64) (optional)
> > > >   *    0x20           extra_context (optional)
> > > >   *    0x10           terminator (null _aarch64_ctx)
> > > >   *
> > > > - *   0x510           (reserved for future allocation)
> > > > + *   0x500           (reserved for future allocation)
> > > >   *
> > > >   * New records that can exceed this space need to be opt-in for userspace, so
> > > >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > > > @@ -94,17 +95,26 @@ struct esr_context {
> > > >       __u64 esr;
> > > >  };
> > > >
> > > > +/* Top byte of fault address (normally not exposed via si_addr) */
> > > > +#define FAULT_ADDR_TOP_BYTE_MAGIC    0x46544201
> > > > +
> > > > +struct fault_addr_top_byte_context {
> > > > +     struct _aarch64_ctx head;
> > > > +     __u8 fault_addr_top_byte;
> > > > +     __u8 __reserved[7];
> > > > +};
> > > > +
> > >
> > > Nit: the name here is a bit cumbersome (obviously bikeshedding...)
> > >
> > >
> > > For the rest, some of my comments may be bogus -- I haven't dug into
> > > this stuff for a little while!
> > >
> > >
> > > Anyway:
> > >
> > > Do we really get the whole top byte of the address in the FAR?  If so,
> > > fine, but I'm having trouble finding a clear statement in the
> > > architecture one way or the other.  (I didn't attempt to dive into the
> > > pseudocode.)
> >
> > I rely on this statement in the ARM:
> >
> > https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/far_el1
> > "For a Data Abort or Watchpoint exception, if address tagging is
> > enabled for the address accessed by the data access that caused the
> > exception, then this field includes the tag."
>
> Yes, I think that covers it.  I hadn't found a clear definition of
> "tag", but I think the TBI mechanism makes it "reasonably obvious" the
> non-address (i.e., tag) bits are [63:56].
>
> > And note that address tagging here essentially means TBI (which is
> > always enabled on Linux), and not memory tagging.
> >
> > > Also, since we're burning 16 bytes here, I'd prefer if we make this
> > > extensible.  At present the __reserved[7] is unusable because
> > > userspace has no way to know whether it's valid or not.
> > >
> > > Options include an additional flag byte (0 for now), or just making
> > > the whole thing a __u64.  In that case we can leave the top byte bits
> > > in their original positions if we want, but it would be a good idea to
> > > include a flag to say that field is valid at all.  (See comments below
> > > on Synchronous external abort.)
> > >
> > > So, say, foo_context->fault_info = (esr & (~0ULL << 56)) | TOP_BYTE_VALID.
> > > (with #defines for the bits/fields as appropriate).
> >
> > The flag bits seem like a good idea. Thinking ahead to the MTE sync
> > tag fault (which might not provide us with bits 60-63), we may
> > consider having separate bits to indicate "bits 56-59 valid" and "bits
> > 60-63 valid", set both bits for regular data aborts and only the
> > former for sync tag faults, which would avoid the need to define a
> > separate context for these faults. And if a future architecture
> > revision provides us with bits 60-63 for tag faults, we could start
> > setting both flag bits even for tag faults.
>
> Seems reasonable, but a "tag mask" field of some sort might be
> preferable to hard-wiring, just in case a future update to MTE supports
> more than 4 bits.

That's fine with me.

> > > >  /*
> > > >   * extra_context: describes extra space in the signal frame for
> > > >   * additional structures that don't fit in sigcontext.__reserved[].
> > > >   *
> > > >   * Note:
> > > >   *
> > > > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > > > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > > > - * extra space.  Any other record can be placed either in the extra
> > > > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > > > - * this file.
> > > > + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> > > > + * extra_context must be placed in sigcontext.__reserved[] if present.
> > > > + * They cannot be placed in the extra space.  Any other record can be
> > > > + * placed either in the extra space or in sigcontext.__reserved[],
> > > > + * unless otherwise specified in this file.
> > > >   *
> > > >   * 2) There must not be more than one extra_context.
> > > >   *
> > > > diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> > > > index c839b5bf1904..045b4f518836 100644
> > > > --- a/arch/arm64/kernel/entry-common.c
> > > > +++ b/arch/arm64/kernel/entry-common.c
> > > > @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
> > > >       unsigned long far = read_sysreg(far_el1);
> > > >
> > > >       local_daif_inherit(regs);
> > > > -     far = untagged_addr(far);
> > > >       do_mem_abort(far, esr, regs);
> > > >  }
> > > >  NOKPROBE_SYMBOL(el1_abort);
> > > > @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
> > > >
> > > >       user_exit_irqoff();
> > > >       local_daif_restore(DAIF_PROCCTX);
> > > > -     far = untagged_addr(far);
> > > >       do_mem_abort(far, esr, regs);
> > > >  }
> > > >  NOKPROBE_SYMBOL(el0_da);
> > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > > index 339882db5a91..baa88dc02e5c 100644
> > > > --- a/arch/arm64/kernel/signal.c
> > > > +++ b/arch/arm64/kernel/signal.c
> > > > @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
> > > >
> > > >       unsigned long fpsimd_offset;
> > > >       unsigned long esr_offset;
> > > > +     unsigned long ftb_offset;
> > > >       unsigned long sve_offset;
> > > >       unsigned long extra_offset;
> > > >       unsigned long end_offset;
> > > > @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
> > > >                       break;
> > > >
> > > >               case ESR_MAGIC:
> > > > +             case FAULT_ADDR_TOP_BYTE_MAGIC:
> > > >                       /* ignore */
> > > >                       break;
> > > >
> > > > @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > > >                                    sizeof(struct esr_context));
> > > >               if (err)
> > > >                       return err;
> > > > +
> > > > +             err = sigframe_alloc(
> > > > +                     user, &user->ftb_offset,
> > > > +                     sizeof(struct fault_addr_top_byte_context));
> > >
> > > Nit: inconsistent indentation?
> > >
> > > (Mostly just because it makes the change look odd against the hunk
> > > context, but not a big deal.)
> >
> > With consistent indentation we violate 80 cols due to the extra long
> > struct name. The indentation is what clang-format is giving me.
>
> I suspected that might be why.  Fair enough (though a shorter name would
> be no bad thing, it's not worth changing that just for nicer indentation).
>
> >
> > > > +             if (err)
> > > > +                     return err;
> > > >       }
> > > >
> > > >       if (system_supports_sve()) {
> > > > @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > > >       __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
> > > >       __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
> > > >
> > > > -     __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> > > > +     __put_user_error(untagged_addr(current->thread.fault_address),
> > > > +                      &sf->uc.uc_mcontext.fault_address, err);
> > > >
> > > >       err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
> > > >
> > > > @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > > >               __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > > >       }
> > > >
> > > > +     if (err == 0 && user->ftb_offset) {
> > > > +             struct fault_addr_top_byte_context __user *ftb_ctx =
> > > > +                     apply_user_offset(user, user->ftb_offset);
> > > > +
> > > > +             __put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> > > > +                              &ftb_ctx->head.magic, err);
> > > > +             __put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> > > > +             __put_user_error(current->thread.fault_address >> 56,
> > > > +                              &ftb_ctx->fault_addr_top_byte, err);
> > > > +     }
> > > > +
> > >
> > > How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?
> > >
> > > For Synchronous external aborts in particular, those bits are documented
> > > > as UNKNOWN, but I don't see any special handling.  There may be other
> > > cases I haven't spotted.
> > >
> > > For preference we can omit this record entirely if we don't have any
> > > information we can report, but certainly we shouldn't expose UNKNOWN
> > > bits.
> >
> > In this case we mask out the top byte in do_sea before passing the
> > address to arm64_notify_die (which clears fault_address and passes the
> > address argument on to arm64_force_sig_fault to be exposed via
> > si_addr). So the record would always contain a 0 byte. It seems
> > reasonable to omit the record in this case instead.
>
> Ah, right.  Missed that.
>
> The record is already omitted when fault_code == 0 IIUC, so perhaps
> we're already doing the right thing for synchronous external aborts.
>
> > > [ Aside:
> > >
> > > Also, what if we're not reporting a memory abort at all?  Does
> > > thread.fault_address just contain junk from the last fault?  I see
> > > nothing anywhere that cleans this up.  (This is historical and not
> > > your fault, but it would be good to close this down while we're about
> > > it.)
> > >
> > >
> > > Hmmm, looking at the code I think we probably leak fault_address etc.
> > > across execve() too, so it may even be stale junk from an old process
> > > :/
> > >
> > > Maybe I just confused myself.
> > >
> > > End aside. ]
> >
> > Yes, it's unclear whether we always manage to not expose a fault
> > address if we're not reporting a data or instruction abort. The code
> > would need to arrange for fault_code to be set to 0 in order to avoid
> > exposing previous fault_address values via future signals. I don't see
> > anywhere where we're resetting these fields after delivering a signal,
> > so it seems possible by calling arm64_force_sig_fault without first
> > setting fault_code (most callers do this, but the calls in
> > arch/arm64/kernel/debug-monitors.c and arch/arm64/kernel/ptrace.c seem
> > not to), or simply by calling force_sig_fault (which happens in many
> > places throughout the kernel).
> >
> > Maybe something like this would do the trick? (Untested, and forgive
> > spaces instead of tabs, grumble grumble gmail):
> >
> > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > index baa88dc02e5c..5867f2fdbe64 100644
> > --- a/arch/arm64/kernel/signal.c
> > +++ b/arch/arm64/kernel/signal.c
> > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > rt_sigframe_user_layout *user,
> >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >                 __put_user_error(current->thread.fault_code,
> > &esr_ctx->esr, err);
> > +               current->thread.fault_code = 0;
>
> Perhaps, but we'd need to be careful.  For example, can we run out of
> user stack before this and deliver a SIGSEGV, but with the old
> fault_code still set?  Then we'd emit the old fault code with the
> new "can't deliver signal" signal, which doesn't make sense.
>
> Stuff may also go wrong with signal prioritisation.
>
> If a higher-priority signal (say SIGINT) comes in after a data abort
> enters the kernel but before the resulting SIGSEGV is dequeued for
> delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> With your change we'd then have cleared the fault code by the time we
> deliver the SIGSEGV it actually relates to, if I've understood right.
>
> Today, I think we just attach that fault code to every signal that's
> delivered until something overwrites or resets it, which means that
> a signal that needs fault_code gets it, at the expense of attaching
> it to a bunch of other random signals too.
>
>
> Checking the signal number and si_code might help us to know what we
> should be doing with fault_code.  We need to make sure userspace can't
> trick us with a non kernel generated signal here.  It would also be
> necessary to check how PTRACE_SETSIGINFO interacts with this.

With these possible interactions in mind I think we should store the
fault code and fault address in kernel_siginfo instead of
thread_struct (and clear these fields when we receive a siginfo from
userspace, i.e. in copy_siginfo_from_user which is used by
ptrace(PTRACE_SETSIGINFO) among other places). That way, the
information is clearly associated with the signal itself and not the
thread, so we don't need to worry about our signal being delivered out
of order.

Peter
Will Deacon May 20, 2020, 8:55 a.m. UTC | #5
On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > index baa88dc02e5c..5867f2fdbe64 100644
> > > --- a/arch/arm64/kernel/signal.c
> > > +++ b/arch/arm64/kernel/signal.c
> > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > > rt_sigframe_user_layout *user,
> > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > >                 __put_user_error(current->thread.fault_code,
> > > &esr_ctx->esr, err);
> > > +               current->thread.fault_code = 0;
> >
> > Perhaps, but we'd need to be careful.  For example, can we run out of
> > user stack before this and deliver a SIGSEGV, but with the old
> > fault_code still set?  Then we'd emit the old fault code with the
> > new "can't deliver signal" signal, which doesn't make sense.
> >
> > Stuff may also go wrong with signal prioritisation.
> >
> > If a higher-priority signal (say SIGINT) comes in after a data abort
> > enters the kernel but before the resulting SIGSEGV is dequeued for
> > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > With your change we'd then have cleared the fault code by the time we
> > deliver the SIGSEGV it actually relates to, if I've understood right.
> >
> > Today, I think we just attach that fault code to every signal that's
> > delivered until something overwrites or resets it, which means that
> > a signal that needs fault_code gets it, at the expense of attaching
> > it to a bunch of other random signals too.
> >
> >
> > Checking the signal number and si_code might help us to know what we
> > > should be doing with fault_code.  We need to make sure userspace can't
> > trick us with a non kernel generated signal here.  It would also be
> > necessary to check how PTRACE_SETSIGINFO interacts with this.
> 
> With these possible interactions in mind I think we should store the
> fault code and fault address in kernel_siginfo instead of
> thread_struct (and clear these fields when we receive a siginfo from
> userspace, i.e. in copy_siginfo_from_user which is used by
> ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> information is clearly associated with the signal itself and not the
> thread, so we don't need to worry about our signal being delivered out
> of order.

Hmm, I can't see a way to do that that isn't horribly invasive in the core
signal code. Can you?

But generally, I agree: the per-thread handling of fault_address and
fault_code appears to be quite broken in the face of signal prioritisation
and signals that don't correspond directly to a hardware trap. It would be
nice to have some tests for this...

If we want to pile on more bodges, perhaps we could stash the signal number
to which the fault_{address,code} relate, and then check that at delivery
and clear on a match. I hate it.

Will
Dave Martin May 20, 2020, 9:26 a.m. UTC | #6
On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > > index baa88dc02e5c..5867f2fdbe64 100644
> > > > --- a/arch/arm64/kernel/signal.c
> > > > +++ b/arch/arm64/kernel/signal.c
> > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > > > rt_sigframe_user_layout *user,
> > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > > >                 __put_user_error(current->thread.fault_code,
> > > > &esr_ctx->esr, err);
> > > > +               current->thread.fault_code = 0;
> > >
> > > Perhaps, but we'd need to be careful.  For example, can we run out of
> > > user stack before this and deliver a SIGSEGV, but with the old
> > > fault_code still set?  Then we'd emit the old fault code with the
> > > new "can't deliver signal" signal, which doesn't make sense.
> > >
> > > Stuff may also go wrong with signal prioritisation.
> > >
> > > If a higher-priority signal (say SIGINT) comes in after a data abort
> > > enters the kernel but before the resulting SIGSEGV is dequeued for
> > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > > With your change we'd then have cleared the fault code by the time we
> > > deliver the SIGSEGV it actually relates to, if I've understood right.
> > >
> > > Today, I think we just attach that fault code to every signal that's
> > > delivered until something overwrites or resets it, which means that
> > > a signal that needs fault_code gets it, at the expense of attaching
> > > it to a bunch of other random signals too.
> > >
> > >
> > > Checking the signal number and si_code might help us to know what we
> > > > should be doing with fault_code.  We need to make sure userspace can't
> > > trick us with a non kernel generated signal here.  It would also be
> > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> > 
> > With these possible interactions in mind I think we should store the
> > fault code and fault address in kernel_siginfo instead of
> > thread_struct (and clear these fields when we receive a siginfo from
> > userspace, i.e. in copy_siginfo_from_user which is used by
> > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> > information is clearly associated with the signal itself and not the
> > thread, so we don't need to worry about our signal being delivered out
> > of order.
> 
> Hmm, I can't see a way to do that that isn't horribly invasive in the core
> signal code. Can you?
> 
> But generally, I agree: the per-thread handling of fault_address and
> fault_code appears to be quite broken in the face of signal prioritisation
> and signals that don't correspond directly to a hardware trap. It would be
> nice to have some tests for this...
> 
> If we want to pile on more bodges, perhaps we could stash the signal number
> to which the fault_{address,code} relate, and then check that at delivery
> and clear on a match. I hate it.

I agree with Peter's suggestion in principle, but I was also concerned
about whether it would be too invasive elsewhere.

Question though: does the core code take special care to make sure that
a force_sig cannot be outprioritised by a regular signal?  If so,
perhaps we get away with it.  I ask this, because the same issue
may be hitting other arches otherwise.

Cheers
---Dave
Peter Collingbourne May 21, 2020, 2:28 a.m. UTC | #7
On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> > > > > --- a/arch/arm64/kernel/signal.c
> > > > > +++ b/arch/arm64/kernel/signal.c
> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > > > > rt_sigframe_user_layout *user,
> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > > > >                 __put_user_error(current->thread.fault_code,
> > > > > &esr_ctx->esr, err);
> > > > > +               current->thread.fault_code = 0;
> > > >
> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> > > > user stack before this and deliver a SIGSEGV, but with the old
> > > > fault_code still set?  Then we'd emit the old fault code with the
> > > > new "can't deliver signal" signal, which doesn't make sense.
> > > >
> > > > Stuff may also go wrong with signal prioritisation.
> > > >
> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > > > With your change we'd then have cleared the fault code by the time we
> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> > > >
> > > > Today, I think we just attach that fault code to every signal that's
> > > > delivered until something overwrites or resets it, which means that
> > > > a signal that needs fault_code gets it, at the expense of attaching
> > > > it to a bunch of other random signals too.
> > > >
> > > >
> > > > Checking the signal number and si_code might help us to know what we
> > > > > should be doing with fault_code.  We need to make sure userspace can't
> > > > trick us with a non kernel generated signal here.  It would also be
> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> > >
> > > With these possible interactions in mind I think we should store the
> > > fault code and fault address in kernel_siginfo instead of
> > > thread_struct (and clear these fields when we receive a siginfo from
> > > userspace, i.e. in copy_siginfo_from_user which is used by
> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> > > information is clearly associated with the signal itself and not the
> > > thread, so we don't need to worry about our signal being delivered out
> > > of order.
> >
> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> > signal code. Can you?

I think I've come up with a way that doesn't seem to be too invasive.
See patch #1 of the series that I'm about to send out.

> > But generally, I agree: the per-thread handling of fault_address and
> > fault_code appears to be quite broken in the face of signal prioritisation
> > and signals that don't correspond directly to a hardware trap. It would be
> > nice to have some tests for this...
> >
> > If we want to pile on more bodges, perhaps we could stash the signal number
> > to which the fault_{address,code} relate, and then check that at delivery
> > and clear on a match. I hate it.
>
> I agree with Peter's suggestion in principle, but I was also concerned
> about whether it would be too invasive elsewhere.
>
> Question though: does the core code take special care to make sure that
> a force_sig cannot be outprioritised by a regular signal?  If so,
> perhaps we get away with it.  I ask this, because the same issue
> may be hitting other arches otherwise.

Not as far as I can tell. There does appear to be prioritisation for
synchronous signals [1] but as far as I can tell nothing to
distinguish one of these signals from one with the same signal number
sent from userspace (e.g. via kill(2)).

Peter

[1] https://github.com/torvalds/linux/blob/b85051e755b0e9d6dd8f17ef1da083851b83287d/kernel/signal.c#L222
Eric W. Biederman May 21, 2020, 12:35 p.m. UTC | #8
Peter Collingbourne <pcc@google.com> writes:

> On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
>>
>> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
>> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
>> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
>> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
>> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
>> > > > > index baa88dc02e5c..5867f2fdbe64 100644
>> > > > > --- a/arch/arm64/kernel/signal.c
>> > > > > +++ b/arch/arm64/kernel/signal.c
>> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
>> > > > > rt_sigframe_user_layout *user,
>> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
>> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
>> > > > >                 __put_user_error(current->thread.fault_code,
>> > > > > &esr_ctx->esr, err);
>> > > > > +               current->thread.fault_code = 0;
>> > > >
>> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
>> > > > user stack before this and deliver a SIGSEGV, but with the old
>> > > > fault_code still set?  Then we'd emit the old fault code with the
>> > > > new "can't deliver signal" signal, which doesn't make sense.
>> > > >
>> > > > Stuff may also go wrong with signal prioritisation.
>> > > >
>> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
>> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
>> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
>> > > > With your change we'd then have cleared the fault code by the time we
>> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
>> > > >
>> > > > Today, I think we just attach that fault code to every signal that's
>> > > > delivered until something overwrites or resets it, which means that
>> > > > a signal that needs fault_code gets it, at the expense of attaching
>> > > > it to a bunch of other random signals too.
>> > > >
>> > > >
>> > > > Checking the signal number and si_code might help us to know what we
>> > > > should be doing with fault_code.  We need to make sure userspace can't
>> > > > trick us with a non kernel generated signal here.  It would also be
>> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
>> > >
>> > > With these possible interactions in mind I think we should store the
>> > > fault code and fault address in kernel_siginfo instead of
>> > > thread_struct (and clear these fields when we receive a siginfo from
>> > > userspace, i.e. in copy_siginfo_from_user which is used by
>> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
>> > > information is clearly associated with the signal itself and not the
>> > > thread, so we don't need to worry about our signal being delivered out
>> > > of order.
>> >
>> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
>> > signal code. Can you?
>
> I think I've come up with a way that doesn't seem to be too invasive.
> See patch #1 of the series that I'm about to send out.
>
>> > But generally, I agree: the per-thread handling of fault_address and
>> > fault_code appears to be quite broken in the face of signal prioritisation
>> > and signals that don't correspond directly to a hardware trap. It would be
>> > nice to have some tests for this...
>> >
>> > If we want to pile on more bodges, perhaps we could stash the signal number
>> > to which the fault_{address,code} relate, and then check that at delivery
>> > and clear on a match. I hate it.
>>
>> I agree with Peter's suggestion in principle, but I was also concerned
>> about whether it would be too invasive elsewhere.
>>
>> Question though: does the core code take special care to make sure that
>> a force_sig cannot be outprioritised by a regular signal?  If so,
>> perhaps we get away with it.  I ask this, because the same issue
>> may be hitting other arches otherwise.
>
> Not as far as I can tell. There does appear to be prioritisation for
> synchronous signals [1] but as far as I can tell nothing to
> distinguish one of these signals from one with the same signal number
> sent from userspace (e.g. via kill(2)).

The si_code will differ between signals generated by userspace
and signals generated by the kernel.

We do allow a little bit of ptrace and sending to yourself to spoof
kernel generated signals, for reasons of debugging and process migration
where an existing process needs to be reconstructed.  But the defenses
should be strong enough you can assume that we reliably distinguish
between a signal from userspace and a signal from the kernel.

I don't fully follow what you are doing but this feels like the
kind of case where a new si_code has been defined as well as additional
fields in siginfo.

In your patchset I really hate that you were going back to
force_sig_info, and filling out struct siginfo by hand.  That is an
error prone pattern, and I have fixed enough bugs in the kernel to prove
that.

I take exception to the idea that including the full address might break
userspace.  That typically means someone has been too lazy to look
and see what userspace is doing.  When that userspace that might break
is the same userspace you are changing the kernel to serve that makes me
nervous.  AKA the userspace that cares about this signal and how it is
represented in siginfo.

A fix of one instance of SIGILL should not be included with a patch that
does something else, and really should come before everything else if
possible.

If this information really belongs in struct siginfo (as it sounds like)
please actually put the information in siginfo, and let userspace look
in siginfo to find it.  struct siginfo is a union with plenty of space,
and plenty of si_codes.

If this applies to multiple cases then it might be trickier but please
dig into the details, don't toss things into sigcontext just because
you can't figure out a clean design for reporting this.

Eric


> Peter
>
> [1] https://github.com/torvalds/linux/blob/b85051e755b0e9d6dd8f17ef1da083851b83287d/kernel/signal.c#L222
Peter Collingbourne May 21, 2020, 6:03 p.m. UTC | #9
On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Peter Collingbourne <pcc@google.com> writes:
>
> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >>
> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> >> > > > > --- a/arch/arm64/kernel/signal.c
> >> > > > > +++ b/arch/arm64/kernel/signal.c
> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> >> > > > > rt_sigframe_user_layout *user,
> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >> > > > >                 __put_user_error(current->thread.fault_code,
> >> > > > > &esr_ctx->esr, err);
> >> > > > > +               current->thread.fault_code = 0;
> >> > > >
> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> >> > > >
> >> > > > Stuff may also go wrong with signal prioritisation.
> >> > > >
> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> >> > > > With your change we'd then have cleared the fault code by the time we
> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> >> > > >
> >> > > > Today, I think we just attach that fault code to every signal that's
> >> > > > delivered until something overwrites or resets it, which means that
> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> >> > > > it to a bunch of other random signals too.
> >> > > >
> >> > > >
> >> > > > Checking the signal number and si_code might help us to know what we
> >> > > > should be doing with fault_code.  We need to make sure userspace can't
> >> > > > trick us with a non kernel generated signal here.  It would also be
> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> >> > >
> >> > > With these possible interactions in mind I think we should store the
> >> > > fault code and fault address in kernel_siginfo instead of
> >> > > thread_struct (and clear these fields when we receive a siginfo from
> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> >> > > information is clearly associated with the signal itself and not the
> >> > > thread, so we don't need to worry about our signal being delivered out
> >> > > of order.
> >> >
> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> >> > signal code. Can you?
> >
> > I think I've come up with a way that doesn't seem to be too invasive.
> > See patch #1 of the series that I'm about to send out.
> >
> >> > But generally, I agree: the per-thread handling of fault_address and
> >> > fault_code appears to be quite broken in the face of signal prioritisation
> >> > and signals that don't correspond directly to a hardware trap. It would be
> >> > nice to have some tests for this...
> >> >
> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> >> > to which the fault_{address,code} relate, and then check that at delivery
> >> > and clear on a match. I hate it.
> >>
> >> I agree with Peter's suggestion in principle, but I was also concerned
> >> about whether it would be too invasive elsewhere.
> >>
> >> Question though: does the core code take special care to make sure that
> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> >> perhaps we get away with it.  I ask this, because the same issue
> >> may be hitting other arches otherwise.
> >
> > Not as far as I can tell. There does appear to be prioritisation for
> > synchronous signals [1] but as far as I can tell nothing to
> > distinguish one of these signals from one with the same signal number
> > sent from userspace (e.g. via kill(2)).
>
> The si_code will differ between signals generated by userspace
> and signals generated by the kernel.
>
> We do allow a little bit of ptrace and sending to yourself to spoof
> kernel generated signals, for reasons of debugging and process migration
> where an existing process needs to be reconstructed.  But the defenses
> should be strong enough you can assume that we reliably distinguish
> between a signal from userspace and a signal from the kernel.

So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
add the context in that case? Seems fragile to me, but I suppose I
could live with it.

> I don't fully follow what you are doing but this feels like the
> kind of case where a new si_code has been defined as well as additional
> fields in siginfo.

There is no new si_code for this, the information will be exposed for
several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
(particularly SEGV_MTESERR, which is part of the proposed MTE patch
set). Note that we already have a union field for BUS_MCEERR_AR, and
we may want to expose it for the other si_codes that already have
union fields as well.

That being said, taking a closer look at siginfo, I think we are in
luck and we might be able to make this work in a reasonable way by
reusing padding (see below).

> In your patchset I really hate that you were going back to
> force_sig_info, and filling out struct siginfo by hand.  That is an
> error prone pattern, and I have fixed enough bugs in the kernel to prove
> that.

To be fair, most of the callers are in helper functions that take
explicit parameters similar to force_sig_fault et al, and the SIGILL
one could easily be made that way as well.

> I take exception to the idea that including the full address might break
> userspace.  That typically means someone has been too lazy to look
> and see what userspace is doing.  When that userspace that might break
> is the same userspace you are changing the kernel to serve that makes me
> nervous.  AKA the userspace that cares about this signal and how it is
> represented in siginfo.

It's not a matter of being lazy. This behaviour isn't just an accident
but has been explicitly documented for years (see the
tagged-pointers.rst file that I changed: "Non-zero tags are not
preserved when delivering signals."), so users can reasonably rely on
it. Furthermore we simply don't have visibility into the majority of
userspace. For example, there are a lot of closed source Android apps
out there, and who knows what signal handlers they're installing and
how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
can't just change the documented semantics under their feet.

It's also not the same userspace either. The userspace that's
initially going to be consuming the new fields is in a part of the
Android system that handles and reports crashes, and that's something
that we control unlike all the apps.

Finally, the userspace may need to know whether the tag bits were
actually zero or whether they were just unavailable, otherwise
userspace could for example produce a misleading crash report. Simply
having the kernel set the top bits of si_addr wouldn't accomplish that
due to the kernel's previous behaviour, hence the mask to let
userspace know which bits are accurate.

> A fix of one instance of SIGILL should not be included with a patch that
> does something else, and really should come before everything else if
> possible.

Fair point. I can see if I can split that part out.

> If this information really belongs in struct siginfo (as it sounds like)
> please actually put the information in siginfo, and let userspace look
> in siginfo to find it.  struct siginfo is a union with plenty of space,
> and plenty of si_codes.
>
> If this applies to multiple cases then it might be trickier but please
> dig into the details, don't toss things into sigcontext just because
> you can't figure out a clean design for reporting this.

If we wanted this in siginfo, one idea that I had was to revert commit
b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
_addr_top_byte and _addr_top_byte_mask in the padding between
_addr_lsb and the union (with comments on all the fields of course to
say when they are filled in). I think that would work since we are
already clearing padding in siginfo. One nice property of the new
fields is that the zero values are correct in the case where the
information isn't being exposed (so old kernels would already have the
correct behaviour). That would only work on certain architectures
(i.e. at least alignof(void*) >= 4) so I suppose it could have an
#ifdef __aarch64__ around it.

Peter





Peter
>
> Eric
>
>
> > Peter
> >
> > [1] https://github.com/torvalds/linux/blob/b85051e755b0e9d6dd8f17ef1da083851b83287d/kernel/signal.c#L222
Eric W. Biederman May 21, 2020, 7:24 p.m. UTC | #10
Peter Collingbourne <pcc@google.com> writes:

> On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Peter Collingbourne <pcc@google.com> writes:
>>
>> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
>> >>
>> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
>> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
>> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
>> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
>> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
>> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
>> >> > > > > --- a/arch/arm64/kernel/signal.c
>> >> > > > > +++ b/arch/arm64/kernel/signal.c
>> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
>> >> > > > > rt_sigframe_user_layout *user,
>> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
>> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
>> >> > > > >                 __put_user_error(current->thread.fault_code,
>> >> > > > > &esr_ctx->esr, err);
>> >> > > > > +               current->thread.fault_code = 0;
>> >> > > >
>> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
>> >> > > > user stack before this and deliver a SIGSEGV, but with the old
>> >> > > > fault_code still set?  Then we'd emit the old fault code with the
>> >> > > > new "can't deliver signal" signal, which doesn't make sense.
>> >> > > >
>> >> > > > Stuff may also go wrong with signal prioritisation.
>> >> > > >
>> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
>> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
>> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
>> >> > > > With your change we'd then have cleared the fault code by the time we
>> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
>> >> > > >
>> >> > > > Today, I think we just attach that fault code to every signal that's
>> >> > > > delivered until something overwrites or resets it, which means that
>> >> > > > a signal that needs fault_code gets it, at the expense of attaching
>> >> > > > it to a bunch of other random signals too.
>> >> > > >
>> >> > > >
>> >> > > > Checking the signal number and si_code might help us to know what we
>> >> > > > should be doing with fault_code.  We need to make sure userspace can't
>> >> > > > trick us with a non kernel generated signal here.  It would also be
>> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
>> >> > >
>> >> > > With these possible interactions in mind I think we should store the
>> >> > > fault code and fault address in kernel_siginfo instead of
>> >> > > thread_struct (and clear these fields when we receive a siginfo from
>> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
>> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
>> >> > > information is clearly associated with the signal itself and not the
>> >> > > thread, so we don't need to worry about our signal being delivered out
>> >> > > of order.
>> >> >
>> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
>> >> > signal code. Can you?
>> >
>> > I think I've come up with a way that doesn't seem to be too invasive.
>> > See patch #1 of the series that I'm about to send out.
>> >
>> >> > But generally, I agree: the per-thread handling of fault_address and
>> >> > fault_code appears to be quite broken in the face of signal prioritisation
>> >> > and signals that don't correspond directly to hardware trap. It would be
>> >> > nice to have some tests for this...
>> >> >
>> >> > If we want to pile on more bodges, perhaps we could stash the signal number
>> >> > to which the fault_{address,code} relate, and then check that at delivery
>> >> > and clear on a match. I hate it.
>> >>
>> >> I agree with Daniel's suggestion in principle, but I was also concerned
>> >> about whether it would be too invasive elsewhere.
>> >>
>> >> Question though: does the core code take special care to make sure that
>> >> a force_sig cannot be outprioritised by a regular signal?  If so,
>> >> perhaps we get away with it.  I ask this, because the same issue
>> >> may be hitting other arches otherwise.
>> >
>> > Not as far as I can tell. There does appear to be prioritisation for
>> > synchronous signals [1] but as far as I can tell nothing to
>> > distinguish one of these signals from one with the same signal number
>> > sent from userspace (e.g. via kill(2)).
>>
>> The si_code will differ between signals generated by userspace
>> and signals generated by the kernel.
>>
>> We do allow a little bit of ptrace and sending to yourself to spoof
>> kernel generated signals, for reasons of debugging and process migration
>> where an existing process needs to be reconstructed.  But the defenses
>> should be strong enough you can assume that we reliably distinguish
>> between a signal from userspace and a signal from the kernel.
>
> So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> add the context in that case? Seems fragile to me, but I suppose I
> could live with it.
>
>> I don't fully follow what you are doing but this feels like the
>> kind of case where a new si_code has been defined as well as additional
>> fields in siginfo.
>
> There is no new si_code for this, the information will be exposed for
> several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> set). Note that we already have a union field for BUS_MCEERR_AR, and
> we may want to expose it for the other si_codes that already have
> union fields as well.
>
> That being said, taking a closer look at siginfo, I think we are in
> luck and we might be able to make this work in a reasonable way by
> reusing padding (see below).
>
>> In your patchset I really hate that you were going back to
>> force_sig_info, and filling out struct siginfo by hand.  That is an
>> error prone pattern, and I have fixed enough bugs in the kernel to prove
>> that.
>
> To be fair, most of the callers are in helper functions that take
> explicit parameters similar to force_sig_fault et al, and the SIGILL
> one could easily be made that way as well.
>
>> I take exception to the idea that including the full address might break
>> userspace.  That typically means someone has been too lazy to look
>> and see what userspace is doing.  When that userspace that might break
>> is the same userspace you are changing the kernel to serve that makes me
>> nervous.  AKA the userspace that cares about this signal and how it is
>> represented in siginfo.
>
> It's not a matter of being lazy. This behaviour isn't just an accident
> but has been explicitly documented for years (see the
> tagged-pointers.rst file that I changed: "Non-zero tags are not
> preserved when delivering signals."), so users can reasonably rely on
> it. Furthermore we simply don't have visibility into the majority of
> userspace. For example, there are a lot of closed source Android apps
> out there, and who knows what signal handlers they're installing and
> how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> can't just change the documented semantics under their feet.
>
> It's also not the same userspace either. The userspace that's
> initially going to be consuming the new fields is in a part of the
> Android system that handles and reports crashes, and that's something
> that we control unlike all the apps.
>
> Finally, the userspace may need to know whether the tag bits were
> actually zero or whether they were just unavailable, otherwise
> userspace could for example produce a misleading crash report. Simply
> having the kernel set the top bits of si_addr wouldn't accomplish that
> due to the kernel's previous behaviour, hence the mask to let
> userspace know which bits are accurate.
>
>> A fix of one instance of SIGILL should not be included with a patch that
>> does something else, and really should come before everything else if
>> possible.
>
> Fair point. I can see if I can split that part out.
>
>> If this information really belongs in struct siginfo (as it sounds like)
>> please actually put the information in siginfo, and let userspace look
>> in siginfo to find it.  struct siginfo is a union with plenty of space,
>> and plenty of si_codes.
>>
>> If this applies to multiple cases then it might be trickier but please
>> dig into the details, don't toss things into sigcontext just because
>> you can't figure out a clean design for reporting this.
>
> If we wanted this in siginfo, one idea that I had was to revert commit
> b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> _addr_top_byte and _addr_top_byte_mask in the padding between
> _addr_lsb and the union (with comments on all the fields of course to
> say when they are filled in). I think that would work since we are
> already clearing padding in siginfo. One nice property of the new
> fields is that the zero values are correct in the case where the
> information isn't being exposed (so old kernels would already have the
> correct behaviour). That would only work on certain architectures
> (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> #ifdef __aarch64__ around it.

Perhaps add a 4th padding member to the union inside of _sigfault, that
adds something like 4 unsigned long's worth of data, and then have your
fields after the union.

Is it quite a bit of work to gather that information from the
instructions that faulted?  I am just checking that this work really
makes sense.

What I really don't understand is how well this problem generalizes to
other architectures to tell if this is something other people need to
solve at some point as well.

Eric
Peter Collingbourne May 21, 2020, 8:48 p.m. UTC | #11
On Thu, May 21, 2020 at 12:28 PM Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> Peter Collingbourne <pcc@google.com> writes:
>
> > On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> Peter Collingbourne <pcc@google.com> writes:
> >>
> >> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >>
> >> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> >> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> >> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> >> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> >> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> >> >> > > > > --- a/arch/arm64/kernel/signal.c
> >> >> > > > > +++ b/arch/arm64/kernel/signal.c
> >> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> >> >> > > > > rt_sigframe_user_layout *user,
> >> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >> >> > > > >                 __put_user_error(current->thread.fault_code,
> >> >> > > > > &esr_ctx->esr, err);
> >> >> > > > > +               current->thread.fault_code = 0;
> >> >> > > >
> >> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> >> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> >> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> >> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> >> >> > > >
> >> >> > > > Stuff may also go wrong with signal prioritisation.
> >> >> > > >
> >> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> >> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> >> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> >> >> > > > With your change we'd then have cleared the fault code by the time we
> >> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> >> >> > > >
> >> >> > > > Today, I think we just attach that fault code to every signal that's
> >> >> > > > delivered until something overwrites or resets it, which means that
> >> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> >> >> > > > it to a bunch of other random signals too.
> >> >> > > >
> >> >> > > >
> >> >> > > > Checking the signal number and si_code might help us to know what we
> >> >> > > > should be doing with fault_code.  We need to make sure userspace can't
> >> >> > > > trick us with a non kernel generated signal here.  It would also be
> >> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> >> >> > >
> >> >> > > With these possible interactions in mind I think we should store the
> >> >> > > fault code and fault address in kernel_siginfo instead of
> >> >> > > thread_struct (and clear these fields when we receive a siginfo from
> >> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> >> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> >> >> > > information is clearly associated with the signal itself and not the
> >> >> > > thread, so we don't need to worry about our signal being delivered out
> >> >> > > of order.
> >> >> >
> >> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> >> >> > signal code. Can you?
> >> >
> >> > I think I've come up with a way that doesn't seem to be too invasive.
> >> > See patch #1 of the series that I'm about to send out.
> >> >
> >> >> > But generally, I agree: the per-thread handling of fault_address and
> >> >> > fault_code appears to be quite broken in the face of signal prioritisation
> >> >> > and signals that don't correspond directly to hardware trap. It would be
> >> >> > nice to have some tests for this...
> >> >> >
> >> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> >> >> > to which the fault_{address,code} relate, and then check that at delivery
> >> >> > and clear on a match. I hate it.
> >> >>
> >> >> I agree with Daniel's suggestion in principle, but I was also concerned
> >> >> about whether it would be too invasive elsewhere.
> >> >>
> >> >> Question though: does the core code take special care to make sure that
> >> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> >> >> perhaps we get away with it.  I ask this, because the same issue
> >> >> may be hitting other arches otherwise.
> >> >
> >> > Not as far as I can tell. There does appear to be prioritisation for
> >> > synchronous signals [1] but as far as I can tell nothing to
> >> > distinguish one of these signals from one with the same signal number
> >> > sent from userspace (e.g. via kill(2)).
> >>
> >> The si_code will differ between signals generated by userspace
> >> and signals generated by the kernel.
> >>
> >> We do allow a little bit of ptrace and sending to yourself to spoof
> >> kernel generated signals, for reasons of debugging and process migration
> >> where an existing process needs to be reconstructed.  But the defenses
> >> should be strong enough you can assume that we reliably distinguish
> >> between a signal from userspace and a signal from the kernel.
> >
> > So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> > add the context in that case? Seems fragile to me, but I suppose I
> > could live with it.
> >
> >> I don't fully follow what you are doing but this feels like the
> >> kind of case where a new si_code has been defined as well as additional
> >> fields in siginfo.
> >
> > There is no new si_code for this, the information will be exposed for
> > several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> > SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> > (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> > set). Note that we already have a union field for BUS_MCEERR_AR, and
> > we may want to expose it for the other si_codes that already have
> > union fields as well.
> >
> > That being said, taking a closer look at siginfo, I think we are in
> > luck and we might be able to make this work in a reasonable way by
> > reusing padding (see below).
> >
> >> In your patchset I really hate that you were going back to
> >> force_sig_info, and filling out struct siginfo by hand.  That is an
> >> error prone pattern, and I have fixed enough bugs in the kernel to prove
> >> that.
> >
> > To be fair, most of the callers are in helper functions that take
> > explicit parameters similar to force_sig_fault et al, and the SIGILL
> > one could easily be made that way as well.
> >
> >> I take exception to the idea that including the full address might break
> >> userspace.  That typically means someone has been too lazy to look
> >> and see what userspace is doing.  When that userspace that might break
> >> is the same userspace you are changing the kernel to serve that makes me
> >> nervous.  AKA the userspace that cares about this signal and how it is
> >> represented in siginfo.
> >
> > It's not a matter of being lazy. This behaviour isn't just an accident
> > but has been explicitly documented for years (see the
> > tagged-pointers.rst file that I changed: "Non-zero tags are not
> > preserved when delivering signals."), so users can reasonably rely on
> > it. Furthermore we simply don't have visibility into the majority of
> > userspace. For example, there are a lot of closed source Android apps
> > out there, and who knows what signal handlers they're installing and
> > how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> > can't just change the documented semantics under their feet.
> >
> > It's also not the same userspace either. The userspace that's
> > initially going to be consuming the new fields is in a part of the
> > Android system that handles and reports crashes, and that's something
> > that we control unlike all the apps.
> >
> > Finally, the userspace may need to know whether the tag bits were
> > actually zero or whether they were just unavailable, otherwise
> > userspace could for example produce a misleading crash report. Simply
> > having the kernel set the top bits of si_addr wouldn't accomplish that
> > due to the kernel's previous behaviour, hence the mask to let
> > userspace know which bits are accurate.
> >
> >> A fix of one instance of SIGILL should not be included with a patch that
> >> does something else, and really should come before everything else if
> >> possible.
> >
> > Fair point. I can see if I can split that part out.
> >
> >> If this information really belongs in struct siginfo (as it sounds like)
> >> please actually put the information in siginfo, and let userspace look
> >> in siginfo to find it.  struct siginfo is a union with plenty of space,
> >> and plenty of si_codes.
> >>
> >> If this applies to multiple cases then it might be trickier but please
> >> dig into the details, don't toss things into sigcontext just because
> >> you can't figure out a clean design for reporting this.
> >
> > If we wanted this in siginfo, one idea that I had was to revert commit
> > b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> > _addr_top_byte and _addr_top_byte_mask in the padding between
> > _addr_lsb and the union (with comments on all the fields of course to
> > say when they are filled in). I think that would work since we are
> > already clearing padding in siginfo. One nice property of the new
> > fields is that the zero values are correct in the case where the
> > information isn't being exposed (so old kernels would already have the
> > correct behaviour). That would only work on certain architectures
> > (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> > #ifdef __aarch64__ around it.
>
> Perhaps add a 4th padding member to the union inside of _sigfault, that
> adds something like 4 unsigned long's worth of data, and then have your
> fields after the union.

Maybe. I guess we could always add another union after my fields if we
end up needing another union member that is larger than the 4 unsigned
longs, which would be ugly but at least it would work. Reusing the
padding would avoid that but maybe it's not that likely that we'll
need that much.

> Is it quite a bit of work to gather that information from the
> instructions that faulted?  I am just checking that this work really
> makes sense.

I think so. At a glance there are hundreds of load and store
instructions on arm64 and we would need to know how to disassemble all
of them and recompute the si_addr from scratch (since the tag bits
could come from any of the registers used to compute the address). And
we really don't want to be doing this tricky stuff in a signal handler
where we've just crashed.

> What I really don't understand is how well this problem generalizes to
> other architectures to tell if this is something other people need to
> solve at some point as well.

An architecture with a feature similar to ARM's TBI or MTE may need
something like this as well, depending on whether they decide to
expose the tag bits in si_addr from the start (and if the feature is
similar to TBI it certainly seems like a reasonable choice to follow
arm64 for compatibility reasons). I would imagine that the main thing
that could vary between architectures is the number of bits involved,
which suggests making the fields arch-specific (or making them larger,
but that may be wasteful).

The only other architecture that I'm aware of with such a feature is
SPARC (whose ADI is similar to MTE). The documentation [1] seems to
suggest that the tag bits are available in si_addr but isn't very
specific.

Peter

[1] https://www.kernel.org/doc/Documentation/sparc/adi.rst
Dave Martin May 26, 2020, 1:03 p.m. UTC | #12
On Thu, May 21, 2020 at 02:24:45PM -0500, Eric W. Biederman wrote:
> Peter Collingbourne <pcc@google.com> writes:
> 
> > On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> Peter Collingbourne <pcc@google.com> writes:
> >>
> >> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >>
> >> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> >> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> >> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> >> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> >> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> >> >> > > > > --- a/arch/arm64/kernel/signal.c
> >> >> > > > > +++ b/arch/arm64/kernel/signal.c
> >> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> >> >> > > > > rt_sigframe_user_layout *user,
> >> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >> >> > > > >                 __put_user_error(current->thread.fault_code,
> >> >> > > > > &esr_ctx->esr, err);
> >> >> > > > > +               current->thread.fault_code = 0;
> >> >> > > >
> >> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> >> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> >> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> >> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> >> >> > > >
> >> >> > > > Stuff may also go wrong with signal prioritisation.
> >> >> > > >
> >> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> >> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> >> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> >> >> > > > With your change we'd then have cleared the fault code by the time we
> >> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> >> >> > > >
> >> >> > > > Today, I think we just attach that fault code to every signal that's
> >> >> > > > delivered until something overwrites or resets it, which means that
> >> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> >> >> > > > it to a bunch of other random signals too.
> >> >> > > >
> >> >> > > >
> >> >> > > > Checking the signal number and si_code might help us to know what we
> >> >> > > > should be doing with fault_code.  We need to make sure userspace can't
> >> >> > > > trick us with a non kernel generated signal here.  It would also be
> >> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> >> >> > >
> >> >> > > With these possible interactions in mind I think we should store the
> >> >> > > fault code and fault address in kernel_siginfo instead of
> >> >> > > thread_struct (and clear these fields when we receive a siginfo from
> >> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> >> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> >> >> > > information is clearly associated with the signal itself and not the
> >> >> > > thread, so we don't need to worry about our signal being delivered out
> >> >> > > of order.
> >> >> >
> >> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> >> >> > signal code. Can you?
> >> >
> >> > I think I've come up with a way that doesn't seem to be too invasive.
> >> > See patch #1 of the series that I'm about to send out.
> >> >
> >> >> > But generally, I agree: the per-thread handling of fault_address and
> >> >> > fault_code appears to be quite broken in the face of signal prioritisation
> >> >> > and signals that don't correspond directly to hardware trap. It would be
> >> >> > nice to have some tests for this...
> >> >> >
> >> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> >> >> > to which the fault_{address,code} relate, and then check that at delivery
> >> >> > and clear on a match. I hate it.
> >> >>
> >> >> I agree with Daniel's suggestion in principle, but I was also concerned
> >> >> about whether it would be too invasive elsewhere.
> >> >>
> >> >> Question though: does the core code take special care to make sure that
> >> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> >> >> perhaps we get away with it.  I ask this, because the same issue
> >> >> may be hitting other arches otherwise.
> >> >
> >> > Not as far as I can tell. There does appear to be prioritisation for
> >> > synchronous signals [1] but as far as I can tell nothing to
> >> > distinguish one of these signals from one with the same signal number
> >> > sent from userspace (e.g. via kill(2)).
> >>
> >> The si_code will differ between signals generated by userspace
> >> and signals generated by the kernel.
> >>
> >> We do allow a little bit of ptrace and sending to yourself to spoof
> >> kernel generated signals, for reasons of debugging and process migration
> >> where an existing process needs to be reconstructed.  But the defenses
> >> should be strong enough you can assume that we reliably distinguish
> >> between a signal from userspace and a signal from the kernel.
> >
> > So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> > add the context in that case? Seems fragile to me, but I suppose I
> > could live with it.
> >
> >> I don't fully follow what you are doing but this feels like the
> >> kind of case where a new si_code has been defined as well as additional
> >> fields in siginfo.
> >
> > There is no new si_code for this, the information will be exposed for
> > several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> > SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> > (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> > set). Note that we already have a union field for BUS_MCEERR_AR, and
> > we may want to expose it for the other si_codes that already have
> > union fields as well.
> >
> > That being said, taking a closer look at siginfo, I think we are in
> > luck and we might be able to make this work in a reasonable way by
> > reusing padding (see below).
> >
> >> In your patchset I really hate that you were going back to
> >> force_sig_info, and filling out struct siginfo by hand.  That is an
> >> error prone pattern, and I have fixed enough bugs in the kernel to prove
> >> that.
> >
> > To be fair, most of the callers are in helper functions that take
> > explicit parameters similar to force_sig_fault et al, and the SIGILL
> > one could easily be made that way as well.
> >
> >> I take exception to the idea that including the full address might break
> >> userspace.  That typically means someone has been too lazy to look
> >> and see what userspace is doing.  When that userspace that might break
> >> is the same userspace you are changing the kernel to serve that makes me
> >> nervous.  AKA the userspace that cares about this signal and how it is
> >> represented in siginfo.
> >
> > It's not a matter of being lazy. This behaviour isn't just an accident
> > but has been explicitly documented for years (see the
> > tagged-pointers.rst file that I changed: "Non-zero tags are not
> > preserved when delivering signals."), so users can reasonably rely on
> > it. Furthermore we simply don't have visibility into the majority of
> > userspace. For example, there are a lot of closed source Android apps
> > out there, and who knows what signal handlers they're installing and
> > how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> > can't just change the documented semantics under their feet.
> >
> > It's also not the same userspace either. The userspace that's
> > initially going to be consuming the new fields is in a part of the
> > Android system that handles and reports crashes, and that's something
> > that we control unlike all the apps.
> >
> > Finally, the userspace may need to know whether the tag bits were
> > actually zero or whether they were just unavailable, otherwise
> > userspace could for example produce a misleading crash report. Simply
> > having the kernel set the top bits of si_addr wouldn't accomplish that
> > due to the kernel's previous behaviour, hence the mask to let
> > userspace know which bits are accurate.
> >
> >> A fix of one instance of SIGILL should not be included with a patch that
> >> does something else, and really should come before everything else if
> >> possible.
> >
> > Fair point. I can see if I can split that part out.
> >
> >> If this information really belongs in struct siginfo (as it sounds like)
> >> please actually put the information in siginfo, and let userspace look
> >> in siginfo to find it.  struct siginfo is a union with plenty of space,
> >> and plenty of si_codes.
> >>
> >> If this applies to multiple cases then it might be trickier but please
> >> dig into the details, don't toss things into sigcontext just because
> >> you can't figure out a clean design for reporting this.
> >
> > If we wanted this in siginfo, one idea that I had was to revert commit
> > b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> > _addr_top_byte and _addr_top_byte_mask in the padding between
> > _addr_lsb and the union (with comments on all the fields of course to
> > say when they are filled in). I think that would work since we are
> > already clearing padding in siginfo, one nice property of the new
> > fields is that the zero values are correct in the case where the
> > information isn't being exposed (so old kernels would already have the
> > correct behaviour). That would only work on certain architectures
> > (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> > #ifdef __aarch64__ around it.
> 
> Perhaps add a 4th padding member to the union inside of _sigfault, that
> adds something like 4 unsigned long's worth of data, and then have your
> fields after the union.
> 
> Is it quite a bit of work to gather that information from the
> instructions that faulted?  I am just checking that this work really
> makes sense.
> 
> What I really don't understand is how well this problem generalizes to
> other architectures to tell if this is something other people need to
> solve at some point as well.

The broad issue here is how arch-specific fault diagnostics make it into
the signal frame, and whether this is needed at all.

The address tag bits are one case, but the same basic mechanism is also
used to report the type of failed access (read versus write) for
SIGSEGV on arm64.  (IIRC qemu relies on this for tracking page use /
dirtiness in userspace.)

Having a way to associate arch metadata of this sort with the
specific signal it relates to seems a good idea.  That way, we're not
relying on internal details of the signal common code such as the
precise order signals get delivered in.

This concept is certainly applicable to other arches, but I don't know
the extent to which they actually depend on it.


Ideally, there would be a si_flags field to add simple arch_specific
attributes in, but there seems no backwards compatible way to add such a
thing for existing signals.  (Or is there?)

Cheers
---Dave
Peter Collingbourne June 8, 2020, 6:12 p.m. UTC | #13
On Thu, May 21, 2020 at 1:48 PM Peter Collingbourne <pcc@google.com> wrote:
>
> On Thu, May 21, 2020 at 12:28 PM Eric W. Biederman
> <ebiederm@xmission.com> wrote:
> >
> > Peter Collingbourne <pcc@google.com> writes:
> >
> > > On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> > >>
> > >> Peter Collingbourne <pcc@google.com> writes:
> > >>
> > >> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >> >>
> > >> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> > >> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> > >> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > >> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > >> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> > >> >> > > > > --- a/arch/arm64/kernel/signal.c
> > >> >> > > > > +++ b/arch/arm64/kernel/signal.c
> > >> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > >> >> > > > > rt_sigframe_user_layout *user,
> > >> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > >> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > >> >> > > > >                 __put_user_error(current->thread.fault_code,
> > >> >> > > > > &esr_ctx->esr, err);
> > >> >> > > > > +               current->thread.fault_code = 0;
> > >> >> > > >
> > >> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> > >> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> > >> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> > >> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> > >> >> > > >
> > >> >> > > > Stuff may also go wrong with signal prioritisation.
> > >> >> > > >
> > >> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> > >> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> > >> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > >> >> > > > With your change we'd then have cleared the fault code by the time we
> > >> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> > >> >> > > >
> > >> >> > > > Today, I think we just attach that fault code to every signal that's
> > >> >> > > > delivered until something overwrites or resets it, which means that
> > >> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> > >> >> > > > it to a bunch of other random signals too.
> > >> >> > > >
> > >> >> > > >
> > >> >> > > > Checking the signal number and si_code might help us to know what we
> > >> >> > > > should be doing with fault_code.  We need to make sure userspace can't
> > >> >> > > > trick us with a non kernel generated signal here.  It would also be
> > >> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> > >> >> > >
> > >> >> > > With these possible interactions in mind I think we should store the
> > >> >> > > fault code and fault address in kernel_siginfo instead of
> > >> >> > > thread_struct (and clear these fields when we receive a siginfo from
> > >> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> > >> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> > >> >> > > information is clearly associated with the signal itself and not the
> > >> >> > > thread, so we don't need to worry about our signal being delivered out
> > >> >> > > of order.
> > >> >> >
> > >> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> > >> >> > signal code. Can you?
> > >> >
> > >> > I think I've come up with a way that doesn't seem to be too invasive.
> > >> > See patch #1 of the series that I'm about to send out.
> > >> >
> > >> >> > But generally, I agree: the per-thread handling of fault_address and
> > >> >> > fault_code appears to be quite broken in the face of signal prioritisation
> > >> >> > and signals that don't correspond directly to hardware trap. It would be
> > >> >> > nice to have some tests for this...
> > >> >> >
> > >> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> > >> >> > to which the fault_{address,code} relate, and then check that at delivery
> > >> >> > and clear on a match. I hate it.
> > >> >>
> > >> >> I agree with Daniel's suggestion in principle, but I was also concerned
> > >> >> about whether it would be too invasive elsewhere.
> > >> >>
> > >> >> Question though: does the core code take special care to make sure that
> > >> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> > >> >> perhaps we get away with it.  I ask this, because the same issue
> > >> >> may be hitting other arches otherwise.
> > >> >
> > >> > Not as far as I can tell. There does appear to be prioritisation for
> > >> > synchronous signals [1] but as far as I can tell nothing to
> > >> > distinguish one of these signals from one with the same signal number
> > >> > sent from userspace (e.g. via kill(2)).
> > >>
> > >> The si_code will differ between signals generated by userspace
> > >> and signals generated by the kernel.
> > >>
> > >> We do allow a little bit of ptrace and sending to yourself to spoof
> > >> kernel generated signals, for reasons of debugging and process migration
> > >> where an existing process needs to be reconstructed.  But the defenses
> > >> should be strong enough you can assume that we reliably distinguish
> > >> between a signal from userspace and a signal from the kernel.
> > >
> > > So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> > > add the context in that case? Seems fragile to me, but I suppose I
> > > could live with it.
> > >
> > >> I don't fully follow what you are doing but this feels like the
> > >> kind of case where a new si_code has been defined as well as additional
> > >> fields in siginfo.
> > >
> > > There is no new si_code for this, the information will be exposed for
> > > several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> > > SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> > > (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> > > set). Note that we already have a union field for BUS_MCEERR_AR, and
> > > we may want to expose it for the other si_codes that already have
> > > union fields as well.
> > >
> > > That being said, taking a closer look at siginfo, I think we are in
> > > luck and we might be able to make this work in a reasonable way by
> > > reusing padding (see below).
> > >
> > >> In your patchset I really hate that you were going back to
> > >> force_sig_info, and filling out struct siginfo by hand.  That is an
> > >> error prone pattern, and I have fixed enough bugs in the kernel to prove
> > >> that.
> > >
> > > To be fair, most of the callers are in helper functions that take
> > > explicit parameters similar to force_sig_fault et al, and the SIGILL
> > > one could easily be made that way as well.
> > >
> > >> I take exception to the idea that including the full address might break
> > >> userspace.  That typically means someone has been too lazy to look
> > >> and see what userspace is doing.  When that userspace that might break
> > >> is the same userspace you are changing the kernel to serve that makes me
> > >> nervous.  AKA the userspace that cares about this signal and how it is
> > >> represented in siginfo.
> > >
> > > It's not a matter of being lazy. This behaviour isn't just an accident
> > > but has been explicitly documented for years (see the
> > > tagged-pointers.rst file that I changed: "Non-zero tags are not
> > > preserved when delivering signals."), so users can reasonably rely on
> > > it. Furthermore we simply don't have visibility into the majority of
> > > userspace. For example, there are a lot of closed source Android apps
> > > out there, and who knows what signal handlers they're installing and
> > > how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> > > can't just change the documented semantics under their feet.
> > >
> > > It's also not the same userspace either. The userspace that's
> > > initially going to be consuming the new fields is in a part of the
> > > Android system that handles and reports crashes, and that's something
> > > that we control unlike all the apps.
> > >
> > > Finally, the userspace may need to know whether the tag bits were
> > > actually zero or whether they were just unavailable, otherwise
> > > userspace could for example produce a misleading crash report. Simply
> > > having the kernel set the top bits of si_addr wouldn't accomplish that
> > > due to the kernel's previous behaviour, hence the mask to let
> > > userspace know which bits are accurate.
> > >
> > >> A fix of one instance of SIGILL should not be included with a patch that
> > >> does something else, and really should come before everything else if
> > >> possible.
> > >
> > > Fair point. I can see if I can split that part out.
> > >
> > >> If this information really belongs in struct siginfo (as it sounds like)
> > >> please actually put the information in siginfo, and let userspace look
> > >> in siginfo to find it.  struct siginfo is a union with plenty of space,
> > >> and plenty of si_codes.
> > >>
> > >> If this applies to multiple cases then it might be trickier but please
> > >> dig into the details, don't toss things into sigcontext just because
> > >> you can't figure out a clean design for reporting this.
> > >
> > > If we wanted this in siginfo, one idea that I had was to revert commit
> > > b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> > > _addr_top_byte and _addr_top_byte_mask in the padding between
> > > _addr_lsb and the union (with comments on all the fields of course to
> > > say when they are filled in). I think that would work since we are
> > > already clearing padding in siginfo, one nice property of the new
> > > fields is that the zero values are correct in the case where the
> > > information isn't being exposed (so old kernels would already have the
> > > correct behaviour). That would only work on certain architectures
> > > (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> > > #ifdef __aarch64__ around it.
> >
> > Perhaps add a 4th padding member to the union inside of _sigfault, that
> > adds something like 4 unsigned long's worth of data, and then have your
> > fields after the union.
>
> Maybe. I guess we could always add another union after my fields if we
> end up needing another union member that is larger than the 4 unsigned
> longs, which would be ugly but at least it would work. Reusing the
> padding would avoid that but maybe it's not that likely that we'll
> need that much.

In the interests of getting the discussion on this started again I'm
sending a v7 which moves the fields into the padding bytes after
si_addr_lsb. It should be easy to switch to another location in
siginfo if you don't like this one.

> > Is it quite a bit of work to gather that information from the
> > instructions that faulted?  I am just checking that this work really
> > makes sense.
>
> I think so. At a glance there are hundreds of load and store
> instructions on arm64 and we would need to know how to disassemble all
> of them and recompute the si_addr from scratch (since the tag bits
> could come from any of the registers used to compute the address). And
> we really don't want to be doing this tricky stuff in a signal handler
> where we've just crashed.
>
> > What I really don't understand is how well this problem generalizes to
> > other architectures to tell if this is something other people need to
> > solve at some point as well.
>
> An architecture with a feature similar to ARM's TBI or MTE may need
> something like this as well, depending on whether they decide to
> expose the tag bits in si_addr from the start (and if the feature is
> similar to TBI it certainly seems like a reasonable choice to follow
> arm64 for compatibility reasons). I would imagine that the main thing
> that could vary between architectures is the number of bits involved,
> which suggests making the fields arch-specific (or making them larger,
> but that may be wasteful).

I made the new fields arch-specific given the points that I made above.

Peter
diff mbox series

Patch

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..c6e9592a9dea 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,17 @@  visibility.
 Preserving tags
 ---------------
 
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+Non-zero tags are not preserved in the fault address fields
+siginfo.si_addr or sigcontext.fault_address when delivering
+signals. This means that signal handlers in applications making use
+of tags cannot rely on the tag information for user virtual addresses
+being maintained in these fields. One exception to this rule is for
+signals raised in response to watchpoint debug exceptions, where the
+tag information will be preserved.
+
+The fault address tag is preserved in the fault_addr_top_byte field of
+the signal frame record fault_addr_top_byte_context, which is present
+for signals raised in response to data aborts and instruction aborts.
 
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@  static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..63185be29ff9 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -144,7 +144,7 @@  struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
+	unsigned long		fault_address;	/* FAR_EL1 value */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..2a3fe3de899d 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,11 +44,12 @@  struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		fault_addr_top_byte_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
  *
- *	0x510		(reserved for future allocation)
+ *	0x500		(reserved for future allocation)
  *
  * New records that can exceed this space need to be opt-in for userspace, so
  * that an expanded signal frame is not generated unexpectedly.  The mechanism
@@ -94,17 +95,26 @@  struct esr_context {
 	__u64 esr;
 };
 
+/* Top byte of fault address (normally not exposed via si_addr) */
+#define FAULT_ADDR_TOP_BYTE_MAGIC	0x46544201
+
+struct fault_addr_top_byte_context {
+	struct _aarch64_ctx head;
+	__u8 fault_addr_top_byte;
+	__u8 __reserved[7];
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
  *
  * Note:
  *
- * 1) fpsimd_context, esr_context and extra_context must be placed in
- * sigcontext.__reserved[] if present.  They cannot be placed in the
- * extra space.  Any other record can be placed either in the extra
- * space or in sigcontext.__reserved[], unless otherwise specified in
- * this file.
+ * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
+ * extra_context must be placed in sigcontext.__reserved[] if present.
+ * They cannot be placed in the extra space.  Any other record can be
+ * placed either in the extra space or in sigcontext.__reserved[],
+ * unless otherwise specified in this file.
  *
  * 2) There must not be more than one extra_context.
  *
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c839b5bf1904..045b4f518836 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@  static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@  static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..baa88dc02e5c 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@  struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long ftb_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@  static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAULT_ADDR_TOP_BYTE_MAGIC:
 			/* ignore */
 			break;
 
@@ -581,6 +583,12 @@  static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+
+		err = sigframe_alloc(
+			user, &user->ftb_offset,
+			sizeof(struct fault_addr_top_byte_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -621,7 +629,8 @@  static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(current->thread.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -641,6 +650,17 @@  static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->ftb_offset) {
+		struct fault_addr_top_byte_context __user *ftb_ctx =
+			apply_user_offset(user, user->ftb_offset);
+
+		__put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
+				 &ftb_ctx->head.magic, err);
+		__put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
+		__put_user_error(current->thread.fault_address >> 56,
+				 &ftb_ctx->fault_addr_top_byte, err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..39bbaa05f162 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@ 
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,9 +320,11 @@  static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_far_esr(unsigned long far, unsigned int esr)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
+
+	current->thread.fault_address = far;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -336,7 +338,7 @@  static void set_thread_esr(unsigned long address, unsigned int esr)
 	 * type", so we ignore this wrinkle and just return the translation
 	 * fault.)
 	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
+	if (!is_ttbr0_addr(addr)) {
 		switch (ESR_ELx_EC(esr)) {
 		case ESR_ELx_EC_DABT_LOW:
 			/*
@@ -377,8 +379,11 @@  static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -386,7 +391,7 @@  static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 
-		set_thread_esr(addr, esr);
+		set_thread_far_esr(far, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
 				      inf->name);
 	} else {
@@ -439,7 +444,7 @@  static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +452,7 @@  static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_ACCESS_FLAGS;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -570,7 +576,7 @@  static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
+	set_thread_far_esr(far, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
@@ -605,30 +611,32 @@  static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -644,7 +652,7 @@  static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -717,11 +725,12 @@  static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {