[RFC,v3,1/2] Add support for eXclusive Page Frame Ownership (XPFO)

Message ID 20161104144534.14790-2-juerg.haefliger@hpe.com (mailing list archive)
State New, archived

Commit Message

Juerg Haefliger Nov. 4, 2016, 2:45 p.m. UTC
This patch adds support for XPFO which protects against 'ret2dir' kernel
attacks. The basic idea is to enforce exclusive ownership of page frames
by either the kernel or userspace, unless explicitly requested by the
kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (the kernel's page table). When such a page is
reclaimed from userspace, it is mapped back to physmap.

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations,
and a lock to serialize access to the XPFO fields.

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate use cases where the kernel needs
    to access userspace pages; these need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Suggested-by: Vasileios P. Kemerlis <vpk@cs.columbia.edu>
Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com>
---
 arch/x86/Kconfig         |   3 +-
 arch/x86/mm/init.c       |   2 +-
 drivers/ata/libata-sff.c |   4 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h     |  39 +++++++++
 lib/swiotlb.c            |   3 +-
 mm/Makefile              |   1 +
 mm/page_alloc.c          |   2 +
 mm/page_ext.c            |   4 +
 mm/xpfo.c                | 206 +++++++++++++++++++++++++++++++++++++++++++++++
 security/Kconfig         |  19 +++++
 12 files changed, 298 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

Comments

Christoph Hellwig Nov. 4, 2016, 2:50 p.m. UTC | #1
The libata parts here really need to be split out and the proper list
and maintainer need to be Cc'ed.

> diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
> index 051b6158d1b7..58af734be25d 100644
> --- a/drivers/ata/libata-sff.c
> +++ b/drivers/ata/libata-sff.c
> @@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
>  
>  	DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
>  
> -	if (PageHighMem(page)) {
> +	if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
>  		unsigned long flags;
>  
>  		/* FIXME: use a bounce buffer */
> @@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
>  
>  	DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
>  
> -	if (PageHighMem(page)) {
> +	if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
>  		unsigned long flags;
>  
>  		/* FIXME: use bounce buffer */
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h

This is just piling one nasty hack on top of another.  libata should
just use the highmem case unconditionally, as it is the correct thing
to do for all cases.
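
For illustration, that would mean dropping the page_address() fast path
and always taking the local-mapping branch. A rough sketch against the
~4.9 ata_pio_sector() code (untested, shown only to make the suggestion
concrete):

	/* Always behave as if the page had no permanent kernel mapping:
	 * map it locally, do the PIO transfer, unmap. This is correct for
	 * lowmem, highmem, and XPFO-unmapped pages alike. */
	local_irq_save(flags);
	buf = kmap_atomic(page);
	ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size, do_write);
	kunmap_atomic(buf);
	local_irq_restore(flags);

The same shape would apply to __atapi_pio_bytes().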
ZhaoJunmin Zhao(Junmin) Nov. 10, 2016, 5:53 a.m. UTC | #2
> This patch adds support for XPFO which protects against 'ret2dir' kernel
> attacks. The basic idea is to enforce exclusive ownership of page frames
> by either the kernel or userspace, unless explicitly requested by the
> kernel. Whenever a page destined for userspace is allocated, it is
> unmapped from physmap (the kernel's page table). When such a page is
> reclaimed from userspace, it is mapped back to physmap.
>
> Additional fields in the page_ext struct are used for XPFO housekeeping.
> Specifically two flags to distinguish user vs. kernel pages and to tag
> unmapped pages and a reference counter to balance kmap/kunmap operations
> and a lock to serialize access to the XPFO fields.
>
> Known issues/limitations:
>    - Only supports x86-64 (for now)
>    - Only supports 4k pages (for now)
>    - There are most likely some legitimate uses cases where the kernel needs
>      to access userspace which need to be made XPFO-aware
>    - Performance penalty
>
> Reference paper by the original patch authors:
>    http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
>
> Suggested-by: Vasileios P. Kemerlis <vpk@cs.columbia.edu>
> Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com>
> ---
>   arch/x86/Kconfig         |   3 +-
>   arch/x86/mm/init.c       |   2 +-
>   drivers/ata/libata-sff.c |   4 +-
>   include/linux/highmem.h  |  15 +++-
>   include/linux/page_ext.h |   7 ++
>   include/linux/xpfo.h     |  39 +++++++++
>   lib/swiotlb.c            |   3 +-
>   mm/Makefile              |   1 +
>   mm/page_alloc.c          |   2 +
>   mm/page_ext.c            |   4 +
>   mm/xpfo.c                | 206 +++++++++++++++++++++++++++++++++++++++++++++++
>   security/Kconfig         |  19 +++++
>   12 files changed, 298 insertions(+), 7 deletions(-)
>   create mode 100644 include/linux/xpfo.h
>   create mode 100644 mm/xpfo.c
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index bada636d1065..38b334f8fde5 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -165,6 +165,7 @@ config X86
>   	select HAVE_STACK_VALIDATION		if X86_64
>   	select ARCH_USES_HIGH_VMA_FLAGS		if X86_INTEL_MEMORY_PROTECTION_KEYS
>   	select ARCH_HAS_PKEYS			if X86_INTEL_MEMORY_PROTECTION_KEYS
> +	select ARCH_SUPPORTS_XPFO		if X86_64
>
>   config INSTRUCTION_DECODER
>   	def_bool y
> @@ -1361,7 +1362,7 @@ config ARCH_DMA_ADDR_T_64BIT
>
>   config X86_DIRECT_GBPAGES
>   	def_bool y
> -	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
> +	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
>   	---help---
>   	  Certain kernel features effectively disable kernel
>   	  linear 1 GB mappings (even if the CPU otherwise
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index 22af912d66d2..a6fafbae02bb 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -161,7 +161,7 @@ static int page_size_mask;
>
>   static void __init probe_page_size_mask(void)
>   {
> -#if !defined(CONFIG_KMEMCHECK)
> +#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO)
>   	/*
>   	 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
>   	 * use small pages.
> diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
> index 051b6158d1b7..58af734be25d 100644
> --- a/drivers/ata/libata-sff.c
> +++ b/drivers/ata/libata-sff.c
> @@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
>
>   	DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
>
> -	if (PageHighMem(page)) {
> +	if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
>   		unsigned long flags;
>
>   		/* FIXME: use a bounce buffer */
> @@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
>
>   	DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
>
> -	if (PageHighMem(page)) {
> +	if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
>   		unsigned long flags;
>
>   		/* FIXME: use bounce buffer */
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> index bb3f3297062a..7a17c166532f 100644
> --- a/include/linux/highmem.h
> +++ b/include/linux/highmem.h
> @@ -7,6 +7,7 @@
>   #include <linux/mm.h>
>   #include <linux/uaccess.h>
>   #include <linux/hardirq.h>
> +#include <linux/xpfo.h>
>
>   #include <asm/cacheflush.h>
>
> @@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr)
>   #ifndef ARCH_HAS_KMAP
>   static inline void *kmap(struct page *page)
>   {
> +	void *kaddr;
> +
>   	might_sleep();
> -	return page_address(page);
> +	kaddr = page_address(page);
> +	xpfo_kmap(kaddr, page);
> +	return kaddr;
>   }
>
>   static inline void kunmap(struct page *page)
>   {
> +	xpfo_kunmap(page_address(page), page);
>   }
>
>   static inline void *kmap_atomic(struct page *page)
>   {
> +	void *kaddr;
> +
>   	preempt_disable();
>   	pagefault_disable();
> -	return page_address(page);
> +	kaddr = page_address(page);
> +	xpfo_kmap(kaddr, page);
> +	return kaddr;
>   }
>   #define kmap_atomic_prot(page, prot)	kmap_atomic(page)
>
>   static inline void __kunmap_atomic(void *addr)
>   {
> +	xpfo_kunmap(addr, virt_to_page(addr));
>   	pagefault_enable();
>   	preempt_enable();
>   }
> diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
> index 9298c393ddaa..0e451a42e5a3 100644
> --- a/include/linux/page_ext.h
> +++ b/include/linux/page_ext.h
> @@ -29,6 +29,8 @@ enum page_ext_flags {
>   	PAGE_EXT_DEBUG_POISON,		/* Page is poisoned */
>   	PAGE_EXT_DEBUG_GUARD,
>   	PAGE_EXT_OWNER,
> +	PAGE_EXT_XPFO_KERNEL,		/* Page is a kernel page */
> +	PAGE_EXT_XPFO_UNMAPPED,		/* Page is unmapped */
>   #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
>   	PAGE_EXT_YOUNG,
>   	PAGE_EXT_IDLE,
> @@ -44,6 +46,11 @@ enum page_ext_flags {
>    */
>   struct page_ext {
>   	unsigned long flags;
> +#ifdef CONFIG_XPFO
> +	int inited;		/* Map counter and lock initialized */
> +	atomic_t mapcount;	/* Counter for balancing map/unmap requests */
> +	spinlock_t maplock;	/* Lock to serialize map/unmap requests */
> +#endif
>   };
>
>   extern void pgdat_page_ext_init(struct pglist_data *pgdat);
> diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
> new file mode 100644
> index 000000000000..77187578ca33
> --- /dev/null
> +++ b/include/linux/xpfo.h
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P.
> + * Copyright (C) 2016 Brown University. All rights reserved.
> + *
> + * Authors:
> + *   Juerg Haefliger <juerg.haefliger@hpe.com>
> + *   Vasileios P. Kemerlis <vpk@cs.brown.edu>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published by
> + * the Free Software Foundation.
> + */
> +
> +#ifndef _LINUX_XPFO_H
> +#define _LINUX_XPFO_H
> +
> +#ifdef CONFIG_XPFO
> +
> +extern struct page_ext_operations page_xpfo_ops;
> +
> +extern void xpfo_kmap(void *kaddr, struct page *page);
> +extern void xpfo_kunmap(void *kaddr, struct page *page);
> +extern void xpfo_alloc_page(struct page *page, int order, gfp_t gfp);
> +extern void xpfo_free_page(struct page *page, int order);
> +
> +extern bool xpfo_page_is_unmapped(struct page *page);
> +
> +#else /* !CONFIG_XPFO */
> +
> +static inline void xpfo_kmap(void *kaddr, struct page *page) { }
> +static inline void xpfo_kunmap(void *kaddr, struct page *page) { }
> +static inline void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) { }
> +static inline void xpfo_free_page(struct page *page, int order) { }
> +
> +static inline bool xpfo_page_is_unmapped(struct page *page) { return false; }
> +
> +#endif /* CONFIG_XPFO */
> +
> +#endif /* _LINUX_XPFO_H */
> diff --git a/lib/swiotlb.c b/lib/swiotlb.c
> index 22e13a0e19d7..455eff44604e 100644
> --- a/lib/swiotlb.c
> +++ b/lib/swiotlb.c
> @@ -390,8 +390,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
>   {
>   	unsigned long pfn = PFN_DOWN(orig_addr);
>   	unsigned char *vaddr = phys_to_virt(tlb_addr);
> +	struct page *page = pfn_to_page(pfn);
>
> -	if (PageHighMem(pfn_to_page(pfn))) {
> +	if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
>   		/* The buffer does not have a mapping.  Map it in and copy */
>   		unsigned int offset = orig_addr & ~PAGE_MASK;
>   		char *buffer;
> diff --git a/mm/Makefile b/mm/Makefile
> index 295bd7a9f76b..175680f516aa 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -100,3 +100,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
>   obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
>   obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
>   obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
> +obj-$(CONFIG_XPFO) += xpfo.o
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 8fd42aa7c4bd..100e80e008e2 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1045,6 +1045,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
>   	kernel_poison_pages(page, 1 << order, 0);
>   	kernel_map_pages(page, 1 << order, 0);
>   	kasan_free_pages(page, order);
> +	xpfo_free_page(page, order);
>
>   	return true;
>   }
> @@ -1745,6 +1746,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>   	kernel_map_pages(page, 1 << order, 1);
>   	kernel_poison_pages(page, 1 << order, 1);
>   	kasan_alloc_pages(page, order);
> +	xpfo_alloc_page(page, order, gfp_flags);
>   	set_page_owner(page, order, gfp_flags);
>   }
>
> diff --git a/mm/page_ext.c b/mm/page_ext.c
> index 121dcffc4ec1..ba6dbcacc2db 100644
> --- a/mm/page_ext.c
> +++ b/mm/page_ext.c
> @@ -7,6 +7,7 @@
>   #include <linux/kmemleak.h>
>   #include <linux/page_owner.h>
>   #include <linux/page_idle.h>
> +#include <linux/xpfo.h>
>
>   /*
>    * struct page extension
> @@ -68,6 +69,9 @@ static struct page_ext_operations *page_ext_ops[] = {
>   #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
>   	&page_idle_ops,
>   #endif
> +#ifdef CONFIG_XPFO
> +	&page_xpfo_ops,
> +#endif
>   };
>
>   static unsigned long total_usage;
> diff --git a/mm/xpfo.c b/mm/xpfo.c
> new file mode 100644
> index 000000000000..8e3a6a694b6a
> --- /dev/null
> +++ b/mm/xpfo.c
> @@ -0,0 +1,206 @@
> +/*
> + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P.
> + * Copyright (C) 2016 Brown University. All rights reserved.
> + *
> + * Authors:
> + *   Juerg Haefliger <juerg.haefliger@hpe.com>
> + *   Vasileios P. Kemerlis <vpk@cs.brown.edu>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published by
> + * the Free Software Foundation.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/page_ext.h>
> +#include <linux/xpfo.h>
> +
> +#include <asm/tlbflush.h>
> +
> +DEFINE_STATIC_KEY_FALSE(xpfo_inited);
> +
> +static bool need_xpfo(void)
> +{
> +	return true;
> +}
> +
> +static void init_xpfo(void)
> +{
> +	printk(KERN_INFO "XPFO enabled\n");
> +	static_branch_enable(&xpfo_inited);
> +}
> +
> +struct page_ext_operations page_xpfo_ops = {
> +	.need = need_xpfo,
> +	.init = init_xpfo,
> +};
> +
> +/*
> + * Update a single kernel page table entry
> + */
> +static inline void set_kpte(struct page *page, unsigned long kaddr,
> +			    pgprot_t prot) {
> +	unsigned int level;
> +	pte_t *kpte = lookup_address(kaddr, &level);
> +
> +	/* We only support 4k pages for now */
> +	BUG_ON(!kpte || level != PG_LEVEL_4K);
> +
> +	set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot)));
> +}
> +
> +void xpfo_alloc_page(struct page *page, int order, gfp_t gfp)
> +{
> +	int i, flush_tlb = 0;
> +	struct page_ext *page_ext;
> +	unsigned long kaddr;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	for (i = 0; i < (1 << order); i++)  {
> +		page_ext = lookup_page_ext(page + i);
> +
> +		BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags));
> +
> +		/* Initialize the map lock and map counter */
> +		if (!page_ext->inited) {
> +			spin_lock_init(&page_ext->maplock);
> +			atomic_set(&page_ext->mapcount, 0);
> +			page_ext->inited = 1;
> +		}
> +		BUG_ON(atomic_read(&page_ext->mapcount));
> +
> +		if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) {
> +			/*
> +			 * Flush the TLB if the page was previously allocated
> +			 * to the kernel.
> +			 */
> +			if (test_and_clear_bit(PAGE_EXT_XPFO_KERNEL,
> +					       &page_ext->flags))
> +				flush_tlb = 1;
> +		} else {
> +			/* Tag the page as a kernel page */
> +			set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags);
> +		}
> +	}
> +
> +	if (flush_tlb) {
> +		kaddr = (unsigned long)page_address(page);
> +		flush_tlb_kernel_range(kaddr, kaddr + (1 << order) *
> +				       PAGE_SIZE);
> +	}
> +}
> +
> +void xpfo_free_page(struct page *page, int order)
> +{
> +	int i;
> +	struct page_ext *page_ext;
> +	unsigned long kaddr;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	for (i = 0; i < (1 << order); i++) {
> +		page_ext = lookup_page_ext(page + i);
> +
> +		if (!page_ext->inited) {
> +			/*
> +			 * The page was allocated before page_ext was
> +			 * initialized, so it is a kernel page and it needs to
> +			 * be tagged accordingly.
> +			 */
> +			set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags);
> +			continue;
> +		}
> +
> +		/*
> +		 * Map the page back into the kernel if it was previously
> +		 * allocated to user space.
> +		 */
> +		if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED,
> +				       &page_ext->flags)) {
> +			kaddr = (unsigned long)page_address(page + i);
> +			set_kpte(page + i,  kaddr, __pgprot(__PAGE_KERNEL));
> +		}
> +	}
> +}
> +
> +void xpfo_kmap(void *kaddr, struct page *page)
> +{
> +	struct page_ext *page_ext;
> +	unsigned long flags;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	page_ext = lookup_page_ext(page);
> +
> +	/*
> +	 * The page was allocated before page_ext was initialized (which means
> +	 * it's a kernel page) or it's allocated to the kernel, so nothing to
> +	 * do.
> +	 */
> +	if (!page_ext->inited ||
> +	    test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags))
> +		return;
> +
> +	spin_lock_irqsave(&page_ext->maplock, flags);
> +
> +	/*
> +	 * The page was previously allocated to user space, so map it back
> +	 * into the kernel. No TLB flush required.
> +	 */
> +	if ((atomic_inc_return(&page_ext->mapcount) == 1) &&
> +	    test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags))
> +		set_kpte(page, (unsigned long)kaddr, __pgprot(__PAGE_KERNEL));
> +
> +	spin_unlock_irqrestore(&page_ext->maplock, flags);
> +}
> +EXPORT_SYMBOL(xpfo_kmap);
> +
> +void xpfo_kunmap(void *kaddr, struct page *page)
> +{
> +	struct page_ext *page_ext;
> +	unsigned long flags;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	page_ext = lookup_page_ext(page);
> +
> +	/*
> +	 * The page was allocated before page_ext was initialized (which means
> +	 * it's a kernel page) or it's allocated to the kernel, so nothing to
> +	 * do.
> +	 */
> +	if (!page_ext->inited ||
> +	    test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags))
> +		return;
> +
> +	spin_lock_irqsave(&page_ext->maplock, flags);
> +
> +	/*
> +	 * The page is to be allocated back to user space, so unmap it from the
> +	 * kernel, flush the TLB and tag it as a user page.
> +	 */
> +	if (atomic_dec_return(&page_ext->mapcount) == 0) {
> +		BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags));
> +		set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags);
> +		set_kpte(page, (unsigned long)kaddr, __pgprot(0));
> +		__flush_tlb_one((unsigned long)kaddr);
> +	}
> +
> +	spin_unlock_irqrestore(&page_ext->maplock, flags);
> +}
> +EXPORT_SYMBOL(xpfo_kunmap);
> +
> +inline bool xpfo_page_is_unmapped(struct page *page)
> +{
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return false;
> +
> +	return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags);
> +}
> +EXPORT_SYMBOL(xpfo_page_is_unmapped);
> diff --git a/security/Kconfig b/security/Kconfig
> index 118f4549404e..4502e15c8419 100644
> --- a/security/Kconfig
> +++ b/security/Kconfig
> @@ -6,6 +6,25 @@ menu "Security options"
>
>   source security/keys/Kconfig
>
> +config ARCH_SUPPORTS_XPFO
> +	bool
> +
> +config XPFO
> +	bool "Enable eXclusive Page Frame Ownership (XPFO)"
> +	default n
> +	depends on ARCH_SUPPORTS_XPFO
> +	select PAGE_EXTENSION
> +	help
> +	  This option offers protection against 'ret2dir' kernel attacks.
> +	  When enabled, every time a page frame is allocated to user space, it
> +	  is unmapped from the direct mapped RAM region in kernel space
> +	  (physmap). Similarly, when a page frame is freed/reclaimed, it is
> +	  mapped back to physmap.
> +
> +	  There is a slight performance impact when this option is enabled.
> +
> +	  If in doubt, say "N".
> +
>   config SECURITY_DMESG_RESTRICT
>   	bool "Restrict unprivileged access to the kernel syslog"
>   	default n
>

When a physical page is assigned to a process in user space, it should
be unmapped from the kernel physmap. From the code, I can see that the
patch only handles pages in the high memory zone: when the kernel uses a
high memory page, it calls kmap(). So I would like to know how a
physical page coming from the normal zone is handled.

Thanks
Zhaojunmin
Kees Cook Nov. 10, 2016, 7:11 p.m. UTC | #3
On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote:
> This patch adds support for XPFO which protects against 'ret2dir' kernel
> attacks. The basic idea is to enforce exclusive ownership of page frames
> by either the kernel or userspace, unless explicitly requested by the
> kernel. Whenever a page destined for userspace is allocated, it is
> unmapped from physmap (the kernel's page table). When such a page is
> reclaimed from userspace, it is mapped back to physmap.
>
> Additional fields in the page_ext struct are used for XPFO housekeeping.
> Specifically two flags to distinguish user vs. kernel pages and to tag
> unmapped pages and a reference counter to balance kmap/kunmap operations
> and a lock to serialize access to the XPFO fields.

Thanks for keeping on this! I'd really like to see it land and then
get more architectures to support it.

> Known issues/limitations:
>   - Only supports x86-64 (for now)
>   - Only supports 4k pages (for now)
>   - There are most likely some legitimate uses cases where the kernel needs
>     to access userspace which need to be made XPFO-aware
>   - Performance penalty

In the Kconfig you say "slight", but I'm curious what kinds of
benchmarks you've done and if there's a more specific cost we can
declare, just to give people more of an idea what the hit looks like?
(What workloads would trigger a lot of XPFO unmapping, for example?)

Thanks!

-Kees
Kees Cook Nov. 10, 2016, 7:24 p.m. UTC | #4
On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote:
> This patch adds support for XPFO which protects against 'ret2dir' kernel
> attacks. The basic idea is to enforce exclusive ownership of page frames
> by either the kernel or userspace, unless explicitly requested by the
> kernel. Whenever a page destined for userspace is allocated, it is
> unmapped from physmap (the kernel's page table). When such a page is
> reclaimed from userspace, it is mapped back to physmap.
>
> Additional fields in the page_ext struct are used for XPFO housekeeping.
> Specifically two flags to distinguish user vs. kernel pages and to tag
> unmapped pages and a reference counter to balance kmap/kunmap operations
> and a lock to serialize access to the XPFO fields.
>
> Known issues/limitations:
>   - Only supports x86-64 (for now)
>   - Only supports 4k pages (for now)
>   - There are most likely some legitimate uses cases where the kernel needs
>     to access userspace which need to be made XPFO-aware
>   - Performance penalty
>
> Reference paper by the original patch authors:
>   http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Would it be possible to create an lkdtm test that can exercise this protection?
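
For a sense of what such a test might look like: hand a page to user
space, then deliberately read it through its physmap alias, which XPFO
should have unmapped. A hypothetical sketch (function and message names
are illustrative, not an existing lkdtm test):

	#include <linux/mm.h>
	#include <linux/mman.h>

	static void lkdtm_XPFO_READ_USER(void)
	{
		unsigned long uaddr;
		struct page *page;
		unsigned long *kaddr;

		/* Map an anonymous page into user space and fault it in. */
		uaddr = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_WRITE,
				MAP_ANONYMOUS | MAP_PRIVATE, 0);
		if (IS_ERR_VALUE(uaddr))
			return;
		if (get_user_pages_unlocked(uaddr, 1, &page, FOLL_WRITE) != 1)
			goto out;

		/* The page's direct-map alias. With XPFO enabled the PTE
		 * should be cleared, so this read should fault. */
		kaddr = page_address(page);
		pr_info("lkdtm: reading user page via physmap %p\n", kaddr);
		pr_info("lkdtm: read back %lx\n", *kaddr);

		put_page(page);
	out:
		vm_munmap(uaddr, PAGE_SIZE);
	}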

> Suggested-by: Vasileios P. Kemerlis <vpk@cs.columbia.edu>
> Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com>
> ---
> [...]
> diff --git a/security/Kconfig b/security/Kconfig
> index 118f4549404e..4502e15c8419 100644
> --- a/security/Kconfig
> +++ b/security/Kconfig
> @@ -6,6 +6,25 @@ menu "Security options"
>
>  source security/keys/Kconfig
>
> +config ARCH_SUPPORTS_XPFO
> +       bool

Can you include a "help" section here to describe what requirements an
architecture needs to support XPFO? See HAVE_ARCH_SECCOMP_FILTER and
HAVE_ARCH_VMAP_STACK for some examples.
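
Perhaps something along these lines (the wording is only a suggestion):

	config ARCH_SUPPORTS_XPFO
		bool
		help
		  An architecture should select this if it can map and unmap
		  individual 4k pages in the kernel's direct mapping (physmap)
		  and flush the corresponding TLB entries, which is what XPFO
		  needs in order to hand page frames exclusively to either the
		  kernel or user space.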

> +config XPFO
> +       bool "Enable eXclusive Page Frame Ownership (XPFO)"
> +       default n
> +       depends on ARCH_SUPPORTS_XPFO
> +       select PAGE_EXTENSION
> +       help
> +         This option offers protection against 'ret2dir' kernel attacks.
> +         When enabled, every time a page frame is allocated to user space, it
> +         is unmapped from the direct mapped RAM region in kernel space
> +         (physmap). Similarly, when a page frame is freed/reclaimed, it is
> +         mapped back to physmap.
> +
> +         There is a slight performance impact when this option is enabled.
> +
> +         If in doubt, say "N".
> +
>  config SECURITY_DMESG_RESTRICT
>         bool "Restrict unprivileged access to the kernel syslog"
>         default n
> --
> 2.10.1
>

I've added these patches to my kspp tree on kernel.org, so it should
get some 0-day testing now...

Thanks!

-Kees
Juerg Haefliger Nov. 15, 2016, 11:15 a.m. UTC | #5
Sorry for the late reply, I just found your email in my cluttered inbox.

On 11/10/2016 08:11 PM, Kees Cook wrote:
> On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote:
>> This patch adds support for XPFO which protects against 'ret2dir' kernel
>> attacks. The basic idea is to enforce exclusive ownership of page frames
>> by either the kernel or userspace, unless explicitly requested by the
>> kernel. Whenever a page destined for userspace is allocated, it is
>> unmapped from physmap (the kernel's page table). When such a page is
>> reclaimed from userspace, it is mapped back to physmap.
>>
>> Additional fields in the page_ext struct are used for XPFO housekeeping.
>> Specifically two flags to distinguish user vs. kernel pages and to tag
>> unmapped pages and a reference counter to balance kmap/kunmap operations
>> and a lock to serialize access to the XPFO fields.
> 
> Thanks for keeping on this! I'd really like to see it land and then
> get more architectures to support it.

Good to hear :-)


>> Known issues/limitations:
>>   - Only supports x86-64 (for now)
>>   - Only supports 4k pages (for now)
>>   - There are most likely some legitimate uses cases where the kernel needs
>>     to access userspace which need to be made XPFO-aware
>>   - Performance penalty
> 
> In the Kconfig you say "slight", but I'm curious what kinds of
> benchmarks you've done and if there's a more specific cost we can
> declare, just to give people more of an idea what the hit looks like?
> (What workloads would trigger a lot of XPFO unmapping, for example?)

That 'slight' wording is based on the performance numbers published in the referenced paper.

So far I've only run kernel compilation tests. For that workload, the big performance hit comes from
disabling >4k page sizes (around 10%). Adding XPFO on top causes 'only' another 0.5% performance
penalty. I'm currently looking into adding support for larger page sizes to see what the real impact
is and then generate some more relevant numbers.

...Juerg


> Thanks!
> 
> -Kees
>
Juerg Haefliger Nov. 15, 2016, 11:18 a.m. UTC | #6
On 11/10/2016 08:24 PM, Kees Cook wrote:
> On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote:
>> This patch adds support for XPFO which protects against 'ret2dir' kernel
>> attacks. The basic idea is to enforce exclusive ownership of page frames
>> by either the kernel or userspace, unless explicitly requested by the
>> kernel. Whenever a page destined for userspace is allocated, it is
>> unmapped from physmap (the kernel's page table). When such a page is
>> reclaimed from userspace, it is mapped back to physmap.
>>
>> Additional fields in the page_ext struct are used for XPFO housekeeping.
>> Specifically two flags to distinguish user vs. kernel pages and to tag
>> unmapped pages and a reference counter to balance kmap/kunmap operations
>> and a lock to serialize access to the XPFO fields.
>>
>> Known issues/limitations:
>>   - Only supports x86-64 (for now)
>>   - Only supports 4k pages (for now)
>>   - There are most likely some legitimate uses cases where the kernel needs
>>     to access userspace which need to be made XPFO-aware
>>   - Performance penalty
>>
>> Reference paper by the original patch authors:
>>   http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
> 
> Would it be possible to create an lkdtm test that can exercise this protection?

I'll look into it.


>> diff --git a/security/Kconfig b/security/Kconfig
>> index 118f4549404e..4502e15c8419 100644
>> --- a/security/Kconfig
>> +++ b/security/Kconfig
>> @@ -6,6 +6,25 @@ menu "Security options"
>>
>>  source security/keys/Kconfig
>>
>> +config ARCH_SUPPORTS_XPFO
>> +       bool
> 
> Can you include a "help" section here to describe what requirements an
> architecture needs to support XPFO? See HAVE_ARCH_SECCOMP_FILTER and
> HAVE_ARCH_VMAP_STACK or some examples.

Will do.


>> +config XPFO
>> +       bool "Enable eXclusive Page Frame Ownership (XPFO)"
>> +       default n
>> +       depends on ARCH_SUPPORTS_XPFO
>> +       select PAGE_EXTENSION
>> +       help
>> +         This option offers protection against 'ret2dir' kernel attacks.
>> +         When enabled, every time a page frame is allocated to user space, it
>> +         is unmapped from the direct mapped RAM region in kernel space
>> +         (physmap). Similarly, when a page frame is freed/reclaimed, it is
>> +         mapped back to physmap.
>> +
>> +         There is a slight performance impact when this option is enabled.
>> +
>> +         If in doubt, say "N".
>> +
>>  config SECURITY_DMESG_RESTRICT
>>         bool "Restrict unprivileged access to the kernel syslog"
>>         default n
> 
> I've added these patches to my kspp tree on kernel.org, so it should
> get some 0-day testing now...

Very good. Thanks!


> Thanks!

Appreciate the feedback.

...Juerg


> -Kees
>
AKASHI Takahiro Nov. 24, 2016, 10:56 a.m. UTC | #7
Hi,

I'm trying to give it a spin on arm64, but ...

On Fri, Nov 04, 2016 at 03:45:33PM +0100, Juerg Haefliger wrote:
> This patch adds support for XPFO which protects against 'ret2dir' kernel
> attacks. The basic idea is to enforce exclusive ownership of page frames
> by either the kernel or userspace, unless explicitly requested by the
> kernel. Whenever a page destined for userspace is allocated, it is
> unmapped from physmap (the kernel's page table). When such a page is
> reclaimed from userspace, it is mapped back to physmap.
> 
> Additional fields in the page_ext struct are used for XPFO housekeeping.
> Specifically two flags to distinguish user vs. kernel pages and to tag
> unmapped pages and a reference counter to balance kmap/kunmap operations
> and a lock to serialize access to the XPFO fields.
> 
> Known issues/limitations:
>   - Only supports x86-64 (for now)
>   - Only supports 4k pages (for now)
>   - There are most likely some legitimate uses cases where the kernel needs
>     to access userspace which need to be made XPFO-aware
>   - Performance penalty
> 
> Reference paper by the original patch authors:
>   http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
> 
> Suggested-by: Vasileios P. Kemerlis <vpk@cs.columbia.edu>
> Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com>
> ---
> [...]
> +/*
> + * Update a single kernel page table entry
> + */
> +static inline void set_kpte(struct page *page, unsigned long kaddr,
> +			    pgprot_t prot) {
> +	unsigned int level;
> +	pte_t *kpte = lookup_address(kaddr, &level);
> +
> +	/* We only support 4k pages for now */
> +	BUG_ON(!kpte || level != PG_LEVEL_4K);
> +
> +	set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot)));
> +}

As lookup_address() and set_pte_atomic() (and PG_LEVEL_4K) are
arch-specific, would it be better to put the whole definition into an
arch-specific file?
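
Something like the following (completely untested, and assuming the
linear map is always mapped with 4KB pages) might work for arm64,
using the generic page table walkers instead of the x86-only
lookup_address():

	static inline void set_kpte(struct page *page, unsigned long kaddr,
				    pgprot_t prot)
	{
		pgd_t *pgd = pgd_offset_k(kaddr);
		pud_t *pud = pud_offset(pgd, kaddr);
		pmd_t *pmd = pmd_offset(pud, kaddr);
		pte_t *pte = pte_offset_kernel(pmd, kaddr);

		/* Assumes kaddr is covered by a last-level (4KB) entry */
		set_pte(pte, pfn_pte(page_to_pfn(page), prot));
	}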

> +
> +void xpfo_alloc_page(struct page *page, int order, gfp_t gfp)
> +{
> +	int i, flush_tlb = 0;
> +	struct page_ext *page_ext;
> +	unsigned long kaddr;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	for (i = 0; i < (1 << order); i++)  {
> +		page_ext = lookup_page_ext(page + i);
> +
> +		BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags));
> +
> +		/* Initialize the map lock and map counter */
> +		if (!page_ext->inited) {
> +			spin_lock_init(&page_ext->maplock);
> +			atomic_set(&page_ext->mapcount, 0);
> +			page_ext->inited = 1;
> +		}
> +		BUG_ON(atomic_read(&page_ext->mapcount));
> +
> +		if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) {
> +			/*
> +			 * Flush the TLB if the page was previously allocated
> +			 * to the kernel.
> +			 */
> +			if (test_and_clear_bit(PAGE_EXT_XPFO_KERNEL,
> +					       &page_ext->flags))
> +				flush_tlb = 1;
> +		} else {
> +			/* Tag the page as a kernel page */
> +			set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags);
> +		}
> +	}
> +
> +	if (flush_tlb) {
> +		kaddr = (unsigned long)page_address(page);
> +		flush_tlb_kernel_range(kaddr, kaddr + (1 << order) *
> +				       PAGE_SIZE);
> +	}
> +}
> +
> +void xpfo_free_page(struct page *page, int order)
> +{
> +	int i;
> +	struct page_ext *page_ext;
> +	unsigned long kaddr;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	for (i = 0; i < (1 << order); i++) {
> +		page_ext = lookup_page_ext(page + i);
> +
> +		if (!page_ext->inited) {
> +			/*
> +			 * The page was allocated before page_ext was
> +			 * initialized, so it is a kernel page and it needs to
> +			 * be tagged accordingly.
> +			 */
> +			set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags);
> +			continue;
> +		}
> +
> +		/*
> +		 * Map the page back into the kernel if it was previously
> +		 * allocated to user space.
> +		 */
> +		if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED,
> +				       &page_ext->flags)) {
> +			kaddr = (unsigned long)page_address(page + i);
> +			set_kpte(page + i,  kaddr, __pgprot(__PAGE_KERNEL));

Why not PAGE_KERNEL?
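
PAGE_KERNEL is already a pgprot_t and should be available on every
architecture, i.e. (untested):

	set_kpte(page + i, kaddr, PAGE_KERNEL);

That would also avoid the x86-specific __PAGE_KERNEL bit mask here.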

> +		}
> +	}
> +}
> +
> +void xpfo_kmap(void *kaddr, struct page *page)
> +{
> +	struct page_ext *page_ext;
> +	unsigned long flags;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	page_ext = lookup_page_ext(page);
> +
> +	/*
> +	 * The page was allocated before page_ext was initialized (which means
> +	 * it's a kernel page) or it's allocated to the kernel, so nothing to
> +	 * do.
> +	 */
> +	if (!page_ext->inited ||
> +	    test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags))
> +		return;
> +
> +	spin_lock_irqsave(&page_ext->maplock, flags);
> +
> +	/*
> +	 * The page was previously allocated to user space, so map it back
> +	 * into the kernel. No TLB flush required.
> +	 */
> +	if ((atomic_inc_return(&page_ext->mapcount) == 1) &&
> +	    test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags))
> +		set_kpte(page, (unsigned long)kaddr, __pgprot(__PAGE_KERNEL));
> +
> +	spin_unlock_irqrestore(&page_ext->maplock, flags);
> +}
> +EXPORT_SYMBOL(xpfo_kmap);
> +
> +void xpfo_kunmap(void *kaddr, struct page *page)
> +{
> +	struct page_ext *page_ext;
> +	unsigned long flags;
> +
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return;
> +
> +	page_ext = lookup_page_ext(page);
> +
> +	/*
> +	 * The page was allocated before page_ext was initialized (which means
> +	 * it's a kernel page) or it's allocated to the kernel, so nothing to
> +	 * do.
> +	 */
> +	if (!page_ext->inited ||
> +	    test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags))
> +		return;
> +
> +	spin_lock_irqsave(&page_ext->maplock, flags);
> +
> +	/*
> +	 * The page is to be allocated back to user space, so unmap it from the
> +	 * kernel, flush the TLB and tag it as a user page.
> +	 */
> +	if (atomic_dec_return(&page_ext->mapcount) == 0) {
> +		BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags));
> +		set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags);
> +		set_kpte(page, (unsigned long)kaddr, __pgprot(0));
> +		__flush_tlb_one((unsigned long)kaddr);

Again, __flush_tlb_one() is x86-specific.
Use flush_tlb_kernel_range() instead?
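
E.g. something like this (untested):

	flush_tlb_kernel_range((unsigned long)kaddr,
			       (unsigned long)kaddr + PAGE_SIZE);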

Thanks,
-Takahiro AKASHI

> +	}
> +
> +	spin_unlock_irqrestore(&page_ext->maplock, flags);
> +}
> +EXPORT_SYMBOL(xpfo_kunmap);
> +
> +inline bool xpfo_page_is_unmapped(struct page *page)
> +{
> +	if (!static_branch_unlikely(&xpfo_inited))
> +		return false;
> +
> +	return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags);
> +}
> +EXPORT_SYMBOL(xpfo_page_is_unmapped);
> diff --git a/security/Kconfig b/security/Kconfig
> index 118f4549404e..4502e15c8419 100644
> --- a/security/Kconfig
> +++ b/security/Kconfig
> @@ -6,6 +6,25 @@ menu "Security options"
>  
>  source security/keys/Kconfig
>  
> +config ARCH_SUPPORTS_XPFO
> +	bool
> +
> +config XPFO
> +	bool "Enable eXclusive Page Frame Ownership (XPFO)"
> +	default n
> +	depends on ARCH_SUPPORTS_XPFO
> +	select PAGE_EXTENSION
> +	help
> +	  This option offers protection against 'ret2dir' kernel attacks.
> +	  When enabled, every time a page frame is allocated to user space, it
> +	  is unmapped from the direct mapped RAM region in kernel space
> +	  (physmap). Similarly, when a page frame is freed/reclaimed, it is
> +	  mapped back to physmap.
> +
> +	  There is a slight performance impact when this option is enabled.
> +
> +	  If in doubt, say "N".
> +
>  config SECURITY_DMESG_RESTRICT
>  	bool "Restrict unprivileged access to the kernel syslog"
>  	default n
> -- 
> 2.10.1
>
Juerg Haefliger Nov. 28, 2016, 11:15 a.m. UTC | #8
On 11/24/2016 11:56 AM, AKASHI Takahiro wrote:
> Hi,
> 
> I'm trying to give it a spin on arm64, but ...

Thanks for trying this.


>> +/*
>> + * Update a single kernel page table entry
>> + */
>> +static inline void set_kpte(struct page *page, unsigned long kaddr,
>> +			    pgprot_t prot) {
>> +	unsigned int level;
>> +	pte_t *kpte = lookup_address(kaddr, &level);
>> +
>> +	/* We only support 4k pages for now */
>> +	BUG_ON(!kpte || level != PG_LEVEL_4K);
>> +
>> +	set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot)));
>> +}
> 
> As lookup_address() and set_pte_atomic() (and PG_LEVEL_4K) are
> arch-specific, would it be better to put the whole definition into an
> arch-specific file?

Well, yes, but I haven't really looked into splitting out the arch-specific parts yet.


>> +		/*
>> +		 * Map the page back into the kernel if it was previously
>> +		 * allocated to user space.
>> +		 */
>> +		if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED,
>> +				       &page_ext->flags)) {
>> +			kaddr = (unsigned long)page_address(page + i);
>> +			set_kpte(page + i,  kaddr, __pgprot(__PAGE_KERNEL));
> 
> Why not PAGE_KERNEL?

Good catch, thanks!


>> +	/*
>> +	 * The page is to be allocated back to user space, so unmap it from the
>> +	 * kernel, flush the TLB and tag it as a user page.
>> +	 */
>> +	if (atomic_dec_return(&page_ext->mapcount) == 0) {
>> +		BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags));
>> +		set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags);
>> +		set_kpte(page, (unsigned long)kaddr, __pgprot(0));
>> +		__flush_tlb_one((unsigned long)kaddr);
> 
> Again, __flush_tlb_one() is x86-specific.
> Use flush_tlb_kernel_range() instead?

I'll take a look. If you can tell me what the relevant arm64 equivalents are for the arch-specific
functions, that would help tremendously.

Thanks for the comments!

...Juerg



> Thanks,
> -Takahiro AKASHI
AKASHI Takahiro Dec. 9, 2016, 9:02 a.m. UTC | #9
On Thu, Nov 24, 2016 at 07:56:30PM +0900, AKASHI Takahiro wrote:
> Hi,
> 
> I'm trying to give it a spin on arm64, but ...

In my experiments on HiKey, the kernel boot failed with a page fault
in the cache maintenance code:
(a) in __clean_dcache_area_pou() on a 4KB-page kernel,
(b) in __inval_cache_range() on a 64KB-page kernel.
(See the backtraces below for more details.)

This is because, on arm64, cache maintenance operations work by VA (in
particular, on the direct/linear mapping of physical memory). So I
think that naively unmapping a page from physmap in xpfo_kunmap() won't
work well on arm64.
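
One possible direction (just a sketch, I haven't tried it) would be to
temporarily map such a page back around the cache maintenance, similar
to how the libata and swiotlb hunks handle the highmem case, e.g.:

	if (xpfo_page_is_unmapped(page)) {
		void *kaddr = kmap_atomic(page);

		/*
		 * kmap_atomic() -> xpfo_kmap() restores the linear
		 * mapping, so cache maintenance by VA is safe here.
		 */
		__clean_dcache_area_pou(kaddr, PAGE_SIZE);
		kunmap_atomic(kaddr);
	}

But every cache maintenance path that can see user pages would need
such a check, which doesn't look very appealing.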

-Takahiro AKASHI

case (a)
--------
Unable to handle kernel paging request at virtual address ffff800000cba000
pgd = ffff80003ba8c000
*pgd=0000000000000000
task: ffff80003be38000 task.stack: ffff80003be40000
PC is at __clean_dcache_area_pou+0x20/0x38
LR is at sync_icache_aliases+0x2c/0x40
 ...
Call trace:
 ...
__clean_dcache_area_pou+0x20/0x38
__sync_icache_dcache+0x6c/0xa8
alloc_set_pte+0x33c/0x588
filemap_map_pages+0x3a8/0x3b8
handle_mm_fault+0x910/0x1080
do_page_fault+0x2b0/0x358
do_mem_abort+0x44/0xa0
el0_ia+0x18/0x1c

case (b)
--------
Unable to handle kernel paging request at virtual address ffff80002aed0000
pgd = ffff000008f40000
, *pud=000000003dfc0003
, *pmd=000000003dfa0003
, *pte=000000002aed0000
task: ffff800028711900 task.stack: ffff800029020000
PC is at __inval_cache_range+0x3c/0x60
LR is at __swiotlb_map_sg_attrs+0x6c/0x98
 ...

Call trace:
 ...
__inval_cache_range+0x3c/0x60
dw_mci_pre_dma_transfer.isra.7+0xfc/0x190
dw_mci_pre_req+0x50/0x60
mmc_start_req+0x4c/0x420
mmc_blk_issue_rw_rq+0xb0/0x9b8
mmc_blk_issue_rq+0x154/0x518
mmc_queue_thread+0xac/0x158
kthread+0xd0/0xe8
ret_from_fork+0x10/0x20


> 
> On Fri, Nov 04, 2016 at 03:45:33PM +0100, Juerg Haefliger wrote:
> [...]