[RFC,v7,14/16] EXPERIMENTAL: xpfo, mm: optimize spin lock usage in xpfo_kmap
diff mbox series

Message ID 7e8e17f519ae87a91fc6cbb57b8b27094c96305c.1547153058.git.khalid.aziz@oracle.com
State New
Headers show
Series
  • Add support for eXclusive Page Frame Ownership
Related show

Commit Message

Khalid Aziz Jan. 10, 2019, 9:09 p.m. UTC
From: Julian Stecklina <jsteckli@amazon.de>

We can reduce spin lock usage in xpfo_kmap to the 0->1 transition of
the mapcount. This means that xpfo_kmap() can now race and that we
get spurious page faults.

The page fault handler helps the system make forward progress by
fixing the page table instead of allowing repeated page faults until
the right xpfo_kmap went through.

Model-checked with up to 4 concurrent callers with Spin.

Signed-off-by: Julian Stecklina <jsteckli@amazon.de>
Cc: x86@kernel.org
Cc: kernel-hardening@lists.openwall.com
Cc: Vasileios P. Kemerlis <vpk@cs.columbia.edu>
Cc: Juerg Haefliger <juerg.haefliger@canonical.com>
Cc: Tycho Andersen <tycho@docker.com>
Cc: Marco Benatto <marco.antonio.780@gmail.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
---
 arch/x86/mm/fault.c  |  4 ++++
 include/linux/xpfo.h |  4 ++++
 mm/xpfo.c            | 50 +++++++++++++++++++++++++++++++++++++-------
 3 files changed, 51 insertions(+), 7 deletions(-)

Comments

Laura Abbott Jan. 17, 2019, 12:18 a.m. UTC | #1
On 1/10/19 1:09 PM, Khalid Aziz wrote:
> From: Julian Stecklina <jsteckli@amazon.de>
> 
> We can reduce spin lock usage in xpfo_kmap to the 0->1 transition of
> the mapcount. This means that xpfo_kmap() can now race and that we
> get spurious page faults.
> 
> The page fault handler helps the system make forward progress by
> fixing the page table instead of allowing repeated page faults until
> the right xpfo_kmap went through.
> 
> Model-checked with up to 4 concurrent callers with Spin.
> 

This needs the spurious check for arm64 as well. This at
least gets me booting but could probably use more review:

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 7d9571f4ae3d..8f425848cbb9 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -32,6 +32,7 @@
  #include <linux/perf_event.h>
  #include <linux/preempt.h>
  #include <linux/hugetlb.h>
+#include <linux/xpfo.h>
  
  #include <asm/bug.h>
  #include <asm/cmpxchg.h>
@@ -289,6 +290,9 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
         if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
                 return;
  
+       if (xpfo_spurious_fault(addr))
+               return;
+
         if (is_el1_permission_fault(addr, esr, regs)) {
                 if (esr & ESR_ELx_WNR)
                         msg = "write to read-only memory";


> Signed-off-by: Julian Stecklina <jsteckli@amazon.de>
> Cc: x86@kernel.org
> Cc: kernel-hardening@lists.openwall.com
> Cc: Vasileios P. Kemerlis <vpk@cs.columbia.edu>
> Cc: Juerg Haefliger <juerg.haefliger@canonical.com>
> Cc: Tycho Andersen <tycho@docker.com>
> Cc: Marco Benatto <marco.antonio.780@gmail.com>
> Cc: David Woodhouse <dwmw2@infradead.org>
> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
> ---
>   arch/x86/mm/fault.c  |  4 ++++
>   include/linux/xpfo.h |  4 ++++
>   mm/xpfo.c            | 50 +++++++++++++++++++++++++++++++++++++-------
>   3 files changed, 51 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index ba51652fbd33..207081dcd572 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -18,6 +18,7 @@
>   #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
>   #include <linux/efi.h>			/* efi_recover_from_page_fault()*/
>   #include <linux/mm_types.h>
> +#include <linux/xpfo.h>
>   
>   #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
>   #include <asm/traps.h>			/* dotraplinkage, ...		*/
> @@ -1218,6 +1219,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
>   	if (kprobes_fault(regs))
>   		return;
>   
> +	if (xpfo_spurious_fault(address))
> +		return;
> +
>   	/*
>   	 * Note, despite being a "bad area", there are quite a few
>   	 * acceptable reasons to get here, such as erratum fixups
> diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
> index ea5188882f49..58dd243637d2 100644
> --- a/include/linux/xpfo.h
> +++ b/include/linux/xpfo.h
> @@ -54,6 +54,8 @@ bool xpfo_enabled(void);
>   
>   phys_addr_t user_virt_to_phys(unsigned long addr);
>   
> +bool xpfo_spurious_fault(unsigned long addr);
> +
>   #else /* !CONFIG_XPFO */
>   
>   static inline void xpfo_init_single_page(struct page *page) { }
> @@ -81,6 +83,8 @@ static inline bool xpfo_enabled(void) { return false; }
>   
>   static inline phys_addr_t user_virt_to_phys(unsigned long addr) { return 0; }
>   
> +static inline bool xpfo_spurious_fault(unsigned long addr) { return false; }
> +
>   #endif /* CONFIG_XPFO */
>   
>   #endif /* _LINUX_XPFO_H */
> diff --git a/mm/xpfo.c b/mm/xpfo.c
> index dbf20efb0499..85079377c91d 100644
> --- a/mm/xpfo.c
> +++ b/mm/xpfo.c
> @@ -119,6 +119,16 @@ void xpfo_free_pages(struct page *page, int order)
>   	}
>   }
>   
> +static void xpfo_do_map(void *kaddr, struct page *page)
> +{
> +	spin_lock(&page->xpfo_lock);
> +	if (PageXpfoUnmapped(page)) {
> +		set_kpte(kaddr, page, PAGE_KERNEL);
> +		ClearPageXpfoUnmapped(page);
> +	}
> +	spin_unlock(&page->xpfo_lock);
> +}
> +
>   void xpfo_kmap(void *kaddr, struct page *page)
>   {
>   	if (!static_branch_unlikely(&xpfo_inited))
> @@ -127,17 +137,12 @@ void xpfo_kmap(void *kaddr, struct page *page)
>   	if (!PageXpfoUser(page))
>   		return;
>   
> -	spin_lock(&page->xpfo_lock);
> -
>   	/*
>   	 * The page was previously allocated to user space, so map it back
>   	 * into the kernel. No TLB flush required.
>   	 */
> -	if ((atomic_inc_return(&page->xpfo_mapcount) == 1) &&
> -	    TestClearPageXpfoUnmapped(page))
> -		set_kpte(kaddr, page, PAGE_KERNEL);
> -
> -	spin_unlock(&page->xpfo_lock);
> +	if (atomic_inc_return(&page->xpfo_mapcount) == 1)
> +		xpfo_do_map(kaddr, page);
>   }
>   EXPORT_SYMBOL(xpfo_kmap);
>   
> @@ -204,3 +209,34 @@ void xpfo_temp_unmap(const void *addr, size_t size, void **mapping,
>   			kunmap_atomic(mapping[i]);
>   }
>   EXPORT_SYMBOL(xpfo_temp_unmap);
> +
> +bool xpfo_spurious_fault(unsigned long addr)
> +{
> +	struct page *page;
> +	bool spurious;
> +	int mapcount;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return false;
> +
> +	/* XXX Is this sufficient to guard against calling virt_to_page() on a
> +	 * virtual address that has no corresponding struct page? */
> +	if (!virt_addr_valid(addr))
> +		return false;
> +
> +	page = virt_to_page(addr);
> +	mapcount = atomic_read(&page->xpfo_mapcount);
> +	spurious = PageXpfoUser(page) && mapcount;
> +
> +	/* Guarantee forward progress in case xpfo_kmap() raced. */
> +	if (spurious && PageXpfoUnmapped(page)) {
> +		xpfo_do_map((void *)(addr & PAGE_MASK), page);
> +	}
> +
> +	if (unlikely(!spurious))
> +		printk("XPFO non-spurious fault %lx user=%d unmapped=%d mapcount=%d\n",
> +			addr, PageXpfoUser(page), PageXpfoUnmapped(page),
> +			mapcount);
> +
> +	return spurious;
> +}
>
Khalid Aziz Jan. 17, 2019, 3:14 p.m. UTC | #2
On 1/16/19 5:18 PM, Laura Abbott wrote:
> On 1/10/19 1:09 PM, Khalid Aziz wrote:
>> From: Julian Stecklina <jsteckli@amazon.de>
>>
>> We can reduce spin lock usage in xpfo_kmap to the 0->1 transition of
>> the mapcount. This means that xpfo_kmap() can now race and that we
>> get spurious page faults.
>>
>> The page fault handler helps the system make forward progress by
>> fixing the page table instead of allowing repeated page faults until
>> the right xpfo_kmap went through.
>>
>> Model-checked with up to 4 concurrent callers with Spin.
>>
> 
> This needs the spurious check for arm64 as well. This at
> least gets me booting but could probably use more review:
> 
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index 7d9571f4ae3d..8f425848cbb9 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -32,6 +32,7 @@
>  #include <linux/perf_event.h>
>  #include <linux/preempt.h>
>  #include <linux/hugetlb.h>
> +#include <linux/xpfo.h>
>  
>  #include <asm/bug.h>
>  #include <asm/cmpxchg.h>
> @@ -289,6 +290,9 @@ static void __do_kernel_fault(unsigned long addr,
> unsigned int esr,
>         if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
>                 return;
>  
> +       if (xpfo_spurious_fault(addr))
> +               return;
> +
>         if (is_el1_permission_fault(addr, esr, regs)) {
>                 if (esr & ESR_ELx_WNR)
>                         msg = "write to read-only memory";
> 
> 

That makes sense. Thanks for debugging this. I will add this to patch 14
("EXPERIMENTAL: xpfo, mm: optimize spin lock usage in xpfo_kmap").

Thanks,
Khalid

Patch
diff mbox series

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ba51652fbd33..207081dcd572 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -18,6 +18,7 @@ 
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 #include <linux/efi.h>			/* efi_recover_from_page_fault()*/
 #include <linux/mm_types.h>
+#include <linux/xpfo.h>
 
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
@@ -1218,6 +1219,9 @@  do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 	if (kprobes_fault(regs))
 		return;
 
+	if (xpfo_spurious_fault(address))
+		return;
+
 	/*
 	 * Note, despite being a "bad area", there are quite a few
 	 * acceptable reasons to get here, such as erratum fixups
diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
index ea5188882f49..58dd243637d2 100644
--- a/include/linux/xpfo.h
+++ b/include/linux/xpfo.h
@@ -54,6 +54,8 @@  bool xpfo_enabled(void);
 
 phys_addr_t user_virt_to_phys(unsigned long addr);
 
+bool xpfo_spurious_fault(unsigned long addr);
+
 #else /* !CONFIG_XPFO */
 
 static inline void xpfo_init_single_page(struct page *page) { }
@@ -81,6 +83,8 @@  static inline bool xpfo_enabled(void) { return false; }
 
 static inline phys_addr_t user_virt_to_phys(unsigned long addr) { return 0; }
 
+static inline bool xpfo_spurious_fault(unsigned long addr) { return false; }
+
 #endif /* CONFIG_XPFO */
 
 #endif /* _LINUX_XPFO_H */
diff --git a/mm/xpfo.c b/mm/xpfo.c
index dbf20efb0499..85079377c91d 100644
--- a/mm/xpfo.c
+++ b/mm/xpfo.c
@@ -119,6 +119,16 @@  void xpfo_free_pages(struct page *page, int order)
 	}
 }
 
+static void xpfo_do_map(void *kaddr, struct page *page)
+{
+	spin_lock(&page->xpfo_lock);
+	if (PageXpfoUnmapped(page)) {
+		set_kpte(kaddr, page, PAGE_KERNEL);
+		ClearPageXpfoUnmapped(page);
+	}
+	spin_unlock(&page->xpfo_lock);
+}
+
 void xpfo_kmap(void *kaddr, struct page *page)
 {
 	if (!static_branch_unlikely(&xpfo_inited))
@@ -127,17 +137,12 @@  void xpfo_kmap(void *kaddr, struct page *page)
 	if (!PageXpfoUser(page))
 		return;
 
-	spin_lock(&page->xpfo_lock);
-
 	/*
 	 * The page was previously allocated to user space, so map it back
 	 * into the kernel. No TLB flush required.
 	 */
-	if ((atomic_inc_return(&page->xpfo_mapcount) == 1) &&
-	    TestClearPageXpfoUnmapped(page))
-		set_kpte(kaddr, page, PAGE_KERNEL);
-
-	spin_unlock(&page->xpfo_lock);
+	if (atomic_inc_return(&page->xpfo_mapcount) == 1)
+		xpfo_do_map(kaddr, page);
 }
 EXPORT_SYMBOL(xpfo_kmap);
 
@@ -204,3 +209,34 @@  void xpfo_temp_unmap(const void *addr, size_t size, void **mapping,
 			kunmap_atomic(mapping[i]);
 }
 EXPORT_SYMBOL(xpfo_temp_unmap);
+
+bool xpfo_spurious_fault(unsigned long addr)
+{
+	struct page *page;
+	bool spurious;
+	int mapcount;
+
+	if (!static_branch_unlikely(&xpfo_inited))
+		return false;
+
+	/* XXX Is this sufficient to guard against calling virt_to_page() on a
+	 * virtual address that has no corresponding struct page? */
+	if (!virt_addr_valid(addr))
+		return false;
+
+	page = virt_to_page(addr);
+	mapcount = atomic_read(&page->xpfo_mapcount);
+	spurious = PageXpfoUser(page) && mapcount;
+
+	/* Guarantee forward progress in case xpfo_kmap() raced. */
+	if (spurious && PageXpfoUnmapped(page)) {
+		xpfo_do_map((void *)(addr & PAGE_MASK), page);
+	}
+
+	if (unlikely(!spurious))
+		printk("XPFO non-spurious fault %lx user=%d unmapped=%d mapcount=%d\n",
+			addr, PageXpfoUser(page), PageXpfoUnmapped(page),
+			mapcount);
+
+	return spurious;
+}