
[v3,12/25] arm64: handle PKEY/POE faults

Message ID: 20231124163510.1835740-13-joey.gouly@arm.com
Series: Permission Overlay Extension

Commit Message

Joey Gouly Nov. 24, 2023, 4:34 p.m. UTC
If a memory fault is due to an overlay/pkey fault, report it to userspace
with SEGV_PKUERR.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/traps.h |  1 +
 arch/arm64/kernel/traps.c      | 12 ++++++++--
 arch/arm64/mm/fault.c          | 44 +++++++++++++++++++++++++++++++---
 3 files changed, 52 insertions(+), 5 deletions(-)
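
For context, userspace sees these faults as a SIGSEGV with si_code ==
SEGV_PKUERR; a minimal handler sketch (assuming a libc recent enough to
expose si_pkey in siginfo_t, otherwise the field has to be read out of the
raw kernel siginfo):

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void segv_handler(int sig, siginfo_t *si, void *ucontext)
	{
		if (si->si_code == SEGV_PKUERR)
			/* si_pkey is the pkey value reported by the kernel */
			fprintf(stderr, "pkey fault at %p, pkey=%d\n",
				si->si_addr, (int)si->si_pkey);
		_exit(1);
	}

	int main(void)
	{
		struct sigaction sa = {
			.sa_sigaction	= segv_handler,
			.sa_flags	= SA_SIGINFO,
		};

		sigaction(SIGSEGV, &sa, NULL);
		/* ... set up pkeys and trigger an overlay fault ... */
		return 0;
	}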

Comments

Catalin Marinas Dec. 11, 2023, 6:18 p.m. UTC | #1
On Fri, Nov 24, 2023 at 04:34:57PM +0000, Joey Gouly wrote:
> @@ -497,6 +498,23 @@ static void do_bad_area(unsigned long far, unsigned long esr,
>  #define VM_FAULT_BADMAP		((__force vm_fault_t)0x010000)
>  #define VM_FAULT_BADACCESS	((__force vm_fault_t)0x020000)
>  
> +static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
> +			unsigned int mm_flags)
> +{
> +	unsigned long iss2 = ESR_ELx_ISS2(esr);
> +
> +	if (!arch_pkeys_enabled())
> +		return false;
> +
> +	if (iss2 & ESR_ELx_Overlay)
> +		return true;
> +
> +	return !arch_vma_access_permitted(vma,
> +			mm_flags & FAULT_FLAG_WRITE,
> +			mm_flags & FAULT_FLAG_INSTRUCTION,
> +			mm_flags & FAULT_FLAG_REMOTE);
> +}

Do we actually need this additional arch_vma_access_permitted() check?
The ESR should tell us if it was a POE fault. Permission overlay faults
have priority over the base permission faults, so we'd not need to fall
back to this additional check. Well, see below, we could do something
slightly smarter here.

I can see x86 and powerpc have similar checks (though at a different
point under the mmap lock) but I'm not familiar with their exception
model or exception priorities.
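
If the ESR on its own is enough, the helper could presumably shrink to
something like this (untested sketch):

	static bool fault_from_pkey(unsigned long esr)
	{
		if (!arch_pkeys_enabled())
			return false;

		/* overlay faults are reported directly in ISS2 */
		return ESR_ELx_ISS2(esr) & ESR_ELx_Overlay;
	}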

> +
>  static vm_fault_t __do_page_fault(struct mm_struct *mm,
>  				  struct vm_area_struct *vma, unsigned long addr,
>  				  unsigned int mm_flags, unsigned long vm_flags,
> @@ -688,9 +706,29 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
>  		 * Something tried to access memory that isn't in our memory
>  		 * map.
>  		 */
> -		arm64_force_sig_fault(SIGSEGV,
> -				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
> -				      far, inf->name);
> +		int fault_kind;
> +		/*
> +		 * The pkey value that we return to userspace can be different
> +		 * from the pkey that caused the fault.
> +		 *
> +		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
> +		 * 2. T1   : set POR_EL0 to deny access to pkey=4, touches page
> +		 * 3. T1   : faults...
> +		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
> +		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
> +		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
> +		 *	     faulted on a pte with its pkey=4.
> +		 */
> +		int pkey = vma_pkey(vma);

Other than the vma_pkey() race, I'm more worried about the vma
completely disappearing, e.g. munmap() in another thread. We end up
dereferencing a free pointer here. We need to do this under the mmap
lock.

Since we need to do this check under the mmap lock, we could add an
additional check to see if the pkey fault we got was racy and just
restart the instruction if it no longer faults according to
por_el0_allows_pkey(). However, the code below is too late in the fault
handling to be able to do much other than SIGSEGV.
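
Roughly what I have in mind, earlier in do_page_fault() while the vma is
still stable under the mmap lock (untested sketch, assuming the
por_el0_allows_pkey() helper from elsewhere in this series):

	if (fault_from_pkey(esr, vma, mm_flags)) {
		int pkey = vma_pkey(vma);	/* vma can't go away here */

		/*
		 * If POR_EL0 allows the access after all, the pkey fault
		 * was racy (e.g. a concurrent mprotect_key() in another
		 * thread); just return and restart the instruction.
		 */
		if (por_el0_allows_pkey(pkey, mm_flags & FAULT_FLAG_WRITE,
					mm_flags & FAULT_FLAG_INSTRUCTION)) {
			mmap_read_unlock(mm);
			return 0;
		}
	}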

> +
> +		if (fault_from_pkey(esr, vma, mm_flags))
> +			fault_kind = SEGV_PKUERR;
> +		else
> +			fault_kind = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR;
> +
> +		arm64_force_sig_fault_pkey(SIGSEGV,
> +				      fault_kind,
> +				      far, inf->name, pkey);
>  	}
>  
>  	return 0;
Joey Gouly Dec. 13, 2023, 3:02 p.m. UTC | #2
Hi,

On Mon, Dec 11, 2023 at 06:18:17PM +0000, Catalin Marinas wrote:
> On Fri, Nov 24, 2023 at 04:34:57PM +0000, Joey Gouly wrote:
> > @@ -497,6 +498,23 @@ static void do_bad_area(unsigned long far, unsigned long esr,
> >  #define VM_FAULT_BADMAP		((__force vm_fault_t)0x010000)
> >  #define VM_FAULT_BADACCESS	((__force vm_fault_t)0x020000)
> >  
> > +static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
> > +			unsigned int mm_flags)
> > +{
> > +	unsigned long iss2 = ESR_ELx_ISS2(esr);
> > +
> > +	if (!arch_pkeys_enabled())
> > +		return false;
> > +
> > +	if (iss2 & ESR_ELx_Overlay)
> > +		return true;
> > +
> > +	return !arch_vma_access_permitted(vma,
> > +			mm_flags & FAULT_FLAG_WRITE,
> > +			mm_flags & FAULT_FLAG_INSTRUCTION,
> > +			mm_flags & FAULT_FLAG_REMOTE);
> > +}
> 
> Do we actually need this additional arch_vma_access_permitted() check?
> The ESR should tell us if it was a POE fault. Permission overlay faults
> have priority over the base permission faults, so we'd not need to fall
> back to this additional check. Well, see below, we could do something
> slightly smarter here.

We want this here as it follows other arches, which will fail with a pkey fault
even if the page isn't actually mapped. If the page isn't mapped we'd get a
translation fault, but since we know the type of access and have the pkey in
the VMA, we check it here.
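
In other words, with the vma check in place, something like this (untested
sketch, using the generic pkey wrappers from glibc) is expected to get
SEGV_PKUERR rather than having the fault handled as a plain translation
fault:

	#define _GNU_SOURCE
	#include <sys/mman.h>

	int main(void)
	{
		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		int pkey = pkey_alloc(0, 0);

		pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
		pkey_set(pkey, PKEY_DISABLE_ACCESS);

		/*
		 * p has never been touched, so the hardware raises a
		 * translation fault here, not an overlay fault -- but the
		 * vma's pkey denies the access, so the intent (matching
		 * x86/powerpc) is to report SEGV_PKUERR.
		 */
		*p = 1;
		return 0;
	}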

> 
> I can see x86 and powerpc have similar checks (though at a different
> point under the mmap lock) but I'm not familiar with their exception
> model or exception priorities.
> 
> > +
> >  static vm_fault_t __do_page_fault(struct mm_struct *mm,
> >  				  struct vm_area_struct *vma, unsigned long addr,
> >  				  unsigned int mm_flags, unsigned long vm_flags,
> > @@ -688,9 +706,29 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
> >  		 * Something tried to access memory that isn't in our memory
> >  		 * map.
> >  		 */
> > -		arm64_force_sig_fault(SIGSEGV,
> > -				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
> > -				      far, inf->name);
> > +		int fault_kind;
> > +		/*
> > +		 * The pkey value that we return to userspace can be different
> > +		 * from the pkey that caused the fault.
> > +		 *
> > +		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
> > +		 * 2. T1   : set POR_EL0 to deny access to pkey=4, touches page
> > +		 * 3. T1   : faults...
> > +		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
> > +		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
> > +		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
> > +		 *	     faulted on a pte with its pkey=4.
> > +		 */
> > +		int pkey = vma_pkey(vma);
> 
> Other than the vma_pkey() race, I'm more worried about the vma
> completely disappearing, e.g. munmap() in another thread. We end up
> dereferencing a free pointer here. We need to do this under the mmap
> lock.
> 
> Since we need to do this check under the mmap lock, we could add an
> additional check to see if the pkey fault we got was racy and just
> restart the instruction if it no longer faults according to
> por_el0_allows_pkey(). However, the code below is too late in the fault
> handling to be able to do much other than SIGSEGV.

After discussing in person, I agree with the assessment that this is the wrong
place to do the check, and after looking at the x86 arch code, I see how
they're doing it earlier.
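
For reference, the x86 version boils down to something like the following
(paraphrased and trimmed from arch/x86/mm/fault.c), and it runs under the
mmap lock before handle_mm_fault():

	static int access_error(unsigned long error_code, struct vm_area_struct *vma)
	{
		/* only called for the current mm, so: */
		const bool foreign = false;

		/* hardware reported a pkey fault: always an error */
		if (unlikely(error_code & X86_PF_PK))
			return 1;

		/*
		 * Also check the vma's pkey, so that an access to a
		 * not-yet-mapped page in a pkey-protected vma is still
		 * reported as a pkey error.
		 */
		if (unlikely(!arch_vma_access_permitted(vma,
					error_code & X86_PF_WRITE,
					error_code & X86_PF_INSTR, foreign)))
			return 1;

		/* ... usual VM_WRITE/VM_READ/VM_EXEC checks follow ... */
		return 0;
	}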

Thanks,
Joey

Patch

diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index eefe766d6161..f6f6f2cb7f10 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -25,6 +25,7 @@  try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn)
 void force_signal_inject(int signal, int code, unsigned long address, unsigned long err);
 void arm64_notify_segfault(unsigned long addr);
 void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str);
+void arm64_force_sig_fault_pkey(int signo, int code, unsigned long far, const char *str, int pkey);
 void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str);
 void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str);
 
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 215e6d7f2df8..1bac6c84d3f5 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -263,16 +263,24 @@  static void arm64_show_signal(int signo, const char *str)
 	__show_regs(regs);
 }
 
-void arm64_force_sig_fault(int signo, int code, unsigned long far,
-			   const char *str)
+void arm64_force_sig_fault_pkey(int signo, int code, unsigned long far,
+			   const char *str, int pkey)
 {
 	arm64_show_signal(signo, str);
 	if (signo == SIGKILL)
 		force_sig(SIGKILL);
+	else if (code == SEGV_PKUERR)
+		force_sig_pkuerr((void __user *)far, pkey);
 	else
 		force_sig_fault(signo, code, (void __user *)far);
 }
 
+void arm64_force_sig_fault(int signo, int code, unsigned long far,
+			   const char *str)
+{
+	arm64_force_sig_fault_pkey(signo, code, far, str, 0);
+}
+
 void arm64_force_sig_mceerr(int code, unsigned long far, short lsb,
 			    const char *str)
 {
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 460d799e1296..efd263f56da7 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -23,6 +23,7 @@ 
 #include <linux/sched/debug.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
+#include <linux/pkeys.h>
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>
 
@@ -497,6 +498,23 @@  static void do_bad_area(unsigned long far, unsigned long esr,
 #define VM_FAULT_BADMAP		((__force vm_fault_t)0x010000)
 #define VM_FAULT_BADACCESS	((__force vm_fault_t)0x020000)
 
+static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
+			unsigned int mm_flags)
+{
+	unsigned long iss2 = ESR_ELx_ISS2(esr);
+
+	if (!arch_pkeys_enabled())
+		return false;
+
+	if (iss2 & ESR_ELx_Overlay)
+		return true;
+
+	return !arch_vma_access_permitted(vma,
+			mm_flags & FAULT_FLAG_WRITE,
+			mm_flags & FAULT_FLAG_INSTRUCTION,
+			mm_flags & FAULT_FLAG_REMOTE);
+}
+
 static vm_fault_t __do_page_fault(struct mm_struct *mm,
 				  struct vm_area_struct *vma, unsigned long addr,
 				  unsigned int mm_flags, unsigned long vm_flags,
@@ -688,9 +706,29 @@  static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 		 * Something tried to access memory that isn't in our memory
 		 * map.
 		 */
-		arm64_force_sig_fault(SIGSEGV,
-				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
-				      far, inf->name);
+		int fault_kind;
+		/*
+		 * The pkey value that we return to userspace can be different
+		 * from the pkey that caused the fault.
+		 *
+		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
+		 * 2. T1   : set POR_EL0 to deny access to pkey=4, touches page
+		 * 3. T1   : faults...
+		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
+		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
+		 *	     faulted on a pte with its pkey=4.
+		 */
+		int pkey = vma_pkey(vma);
+
+		if (fault_from_pkey(esr, vma, mm_flags))
+			fault_kind = SEGV_PKUERR;
+		else
+			fault_kind = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR;
+
+		arm64_force_sig_fault_pkey(SIGSEGV,
+				      fault_kind,
+				      far, inf->name, pkey);
 	}
 
 	return 0;