Message ID | 20161104144534.14790-2-juerg.haefliger@hpe.com (mailing list archive)
---|---
State | New, archived
The libata parts here really need to be split out and the proper list and
maintainer need to be Cc'ed.

> diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
> index 051b6158d1b7..58af734be25d 100644
> --- a/drivers/ata/libata-sff.c
> +++ b/drivers/ata/libata-sff.c
> @@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
>
>  	DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
>
> -	if (PageHighMem(page)) {
> +	if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
>  		unsigned long flags;
>
>  		/* FIXME: use a bounce buffer */
> @@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
>
>  	DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
>
> -	if (PageHighMem(page)) {
> +	if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
>  		unsigned long flags;
>
>  		/* FIXME: use bounce buffer */
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h

This is just piling one nasty hack on top of another.  libata should just use
the highmem case unconditionally, as it is the correct thing to do for all
cases.
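To make that suggestion concrete, "use the highmem case unconditionally" would mean always taking the kmap_atomic() path in ata_pio_sector() (and likewise in __atapi_pio_bytes()), so neither the PageHighMem() test nor the xpfo_page_is_unmapped() addition is needed at all. A minimal sketch of that shape follows; the surrounding context (buf, offset, do_write and the sff_data_xfer() call) is assumed from the function the quoted hunk patches and is not shown in the diff above:

#include <linux/libata.h>
#include <linux/highmem.h>

/*
 * Sketch only (not from the patch): the data-transfer step of
 * ata_pio_sector() with the kmap_atomic() path taken unconditionally.
 * kmap_atomic() is correct for lowmem, highmem and XPFO-unmapped pages
 * alike, so no page-state check is required.
 */
static void ata_pio_xfer_sketch(struct ata_queued_cmd *qc, struct page *page,
				unsigned int offset, int do_write)
{
	struct ata_port *ap = qc->ap;
	unsigned long flags;
	unsigned char *buf;

	/* FIXME: use a bounce buffer */
	local_irq_save(flags);
	buf = kmap_atomic(page);

	/* do the actual data transfer */
	ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size, do_write);

	kunmap_atomic(buf);
	local_irq_restore(flags);
}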
> This patch adds support for XPFO which protects against 'ret2dir' kernel > attacks. The basic idea is to enforce exclusive ownership of page frames > by either the kernel or userspace, unless explicitly requested by the > kernel. Whenever a page destined for userspace is allocated, it is > unmapped from physmap (the kernel's page table). When such a page is > reclaimed from userspace, it is mapped back to physmap. > > Additional fields in the page_ext struct are used for XPFO housekeeping. > Specifically two flags to distinguish user vs. kernel pages and to tag > unmapped pages and a reference counter to balance kmap/kunmap operations > and a lock to serialize access to the XPFO fields. > > Known issues/limitations: > - Only supports x86-64 (for now) > - Only supports 4k pages (for now) > - There are most likely some legitimate uses cases where the kernel needs > to access userspace which need to be made XPFO-aware > - Performance penalty > > Reference paper by the original patch authors: > http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf > > Suggested-by: Vasileios P. Kemerlis <vpk@cs.columbia.edu> > Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com> > --- > arch/x86/Kconfig | 3 +- > arch/x86/mm/init.c | 2 +- > drivers/ata/libata-sff.c | 4 +- > include/linux/highmem.h | 15 +++- > include/linux/page_ext.h | 7 ++ > include/linux/xpfo.h | 39 +++++++++ > lib/swiotlb.c | 3 +- > mm/Makefile | 1 + > mm/page_alloc.c | 2 + > mm/page_ext.c | 4 + > mm/xpfo.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++ > security/Kconfig | 19 +++++ > 12 files changed, 298 insertions(+), 7 deletions(-) > create mode 100644 include/linux/xpfo.h > create mode 100644 mm/xpfo.c > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index bada636d1065..38b334f8fde5 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -165,6 +165,7 @@ config X86 > select HAVE_STACK_VALIDATION if X86_64 > select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS > select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS > + select ARCH_SUPPORTS_XPFO if X86_64 > > config INSTRUCTION_DECODER > def_bool y > @@ -1361,7 +1362,7 @@ config ARCH_DMA_ADDR_T_64BIT > > config X86_DIRECT_GBPAGES > def_bool y > - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK > + depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO > ---help--- > Certain kernel features effectively disable kernel > linear 1 GB mappings (even if the CPU otherwise > diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c > index 22af912d66d2..a6fafbae02bb 100644 > --- a/arch/x86/mm/init.c > +++ b/arch/x86/mm/init.c > @@ -161,7 +161,7 @@ static int page_size_mask; > > static void __init probe_page_size_mask(void) > { > -#if !defined(CONFIG_KMEMCHECK) > +#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO) > /* > * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will > * use small pages. > diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c > index 051b6158d1b7..58af734be25d 100644 > --- a/drivers/ata/libata-sff.c > +++ b/drivers/ata/libata-sff.c > @@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read"); > > - if (PageHighMem(page)) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > unsigned long flags; > > /* FIXME: use a bounce buffer */ > @@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes) > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? 
"write" : "read"); > > - if (PageHighMem(page)) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > unsigned long flags; > > /* FIXME: use bounce buffer */ > diff --git a/include/linux/highmem.h b/include/linux/highmem.h > index bb3f3297062a..7a17c166532f 100644 > --- a/include/linux/highmem.h > +++ b/include/linux/highmem.h > @@ -7,6 +7,7 @@ > #include <linux/mm.h> > #include <linux/uaccess.h> > #include <linux/hardirq.h> > +#include <linux/xpfo.h> > > #include <asm/cacheflush.h> > > @@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr) > #ifndef ARCH_HAS_KMAP > static inline void *kmap(struct page *page) > { > + void *kaddr; > + > might_sleep(); > - return page_address(page); > + kaddr = page_address(page); > + xpfo_kmap(kaddr, page); > + return kaddr; > } > > static inline void kunmap(struct page *page) > { > + xpfo_kunmap(page_address(page), page); > } > > static inline void *kmap_atomic(struct page *page) > { > + void *kaddr; > + > preempt_disable(); > pagefault_disable(); > - return page_address(page); > + kaddr = page_address(page); > + xpfo_kmap(kaddr, page); > + return kaddr; > } > #define kmap_atomic_prot(page, prot) kmap_atomic(page) > > static inline void __kunmap_atomic(void *addr) > { > + xpfo_kunmap(addr, virt_to_page(addr)); > pagefault_enable(); > preempt_enable(); > } > diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h > index 9298c393ddaa..0e451a42e5a3 100644 > --- a/include/linux/page_ext.h > +++ b/include/linux/page_ext.h > @@ -29,6 +29,8 @@ enum page_ext_flags { > PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ > PAGE_EXT_DEBUG_GUARD, > PAGE_EXT_OWNER, > + PAGE_EXT_XPFO_KERNEL, /* Page is a kernel page */ > + PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */ > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > PAGE_EXT_YOUNG, > PAGE_EXT_IDLE, > @@ -44,6 +46,11 @@ enum page_ext_flags { > */ > struct page_ext { > unsigned long flags; > +#ifdef CONFIG_XPFO > + int inited; /* Map counter and lock initialized */ > + atomic_t mapcount; /* Counter for balancing map/unmap requests */ > + spinlock_t maplock; /* Lock to serialize map/unmap requests */ > +#endif > }; > > extern void pgdat_page_ext_init(struct pglist_data *pgdat); > diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h > new file mode 100644 > index 000000000000..77187578ca33 > --- /dev/null > +++ b/include/linux/xpfo.h > @@ -0,0 +1,39 @@ > +/* > + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P. > + * Copyright (C) 2016 Brown University. All rights reserved. > + * > + * Authors: > + * Juerg Haefliger <juerg.haefliger@hpe.com> > + * Vasileios P. Kemerlis <vpk@cs.brown.edu> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License version 2 as published by > + * the Free Software Foundation. 
> + */ > + > +#ifndef _LINUX_XPFO_H > +#define _LINUX_XPFO_H > + > +#ifdef CONFIG_XPFO > + > +extern struct page_ext_operations page_xpfo_ops; > + > +extern void xpfo_kmap(void *kaddr, struct page *page); > +extern void xpfo_kunmap(void *kaddr, struct page *page); > +extern void xpfo_alloc_page(struct page *page, int order, gfp_t gfp); > +extern void xpfo_free_page(struct page *page, int order); > + > +extern bool xpfo_page_is_unmapped(struct page *page); > + > +#else /* !CONFIG_XPFO */ > + > +static inline void xpfo_kmap(void *kaddr, struct page *page) { } > +static inline void xpfo_kunmap(void *kaddr, struct page *page) { } > +static inline void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) { } > +static inline void xpfo_free_page(struct page *page, int order) { } > + > +static inline bool xpfo_page_is_unmapped(struct page *page) { return false; } > + > +#endif /* CONFIG_XPFO */ > + > +#endif /* _LINUX_XPFO_H */ > diff --git a/lib/swiotlb.c b/lib/swiotlb.c > index 22e13a0e19d7..455eff44604e 100644 > --- a/lib/swiotlb.c > +++ b/lib/swiotlb.c > @@ -390,8 +390,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, > { > unsigned long pfn = PFN_DOWN(orig_addr); > unsigned char *vaddr = phys_to_virt(tlb_addr); > + struct page *page = pfn_to_page(pfn); > > - if (PageHighMem(pfn_to_page(pfn))) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > /* The buffer does not have a mapping. Map it in and copy */ > unsigned int offset = orig_addr & ~PAGE_MASK; > char *buffer; > diff --git a/mm/Makefile b/mm/Makefile > index 295bd7a9f76b..175680f516aa 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -100,3 +100,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o > obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o > obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o > obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o > +obj-$(CONFIG_XPFO) += xpfo.o > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 8fd42aa7c4bd..100e80e008e2 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -1045,6 +1045,7 @@ static __always_inline bool free_pages_prepare(struct page *page, > kernel_poison_pages(page, 1 << order, 0); > kernel_map_pages(page, 1 << order, 0); > kasan_free_pages(page, order); > + xpfo_free_page(page, order); > > return true; > } > @@ -1745,6 +1746,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, > kernel_map_pages(page, 1 << order, 1); > kernel_poison_pages(page, 1 << order, 1); > kasan_alloc_pages(page, order); > + xpfo_alloc_page(page, order, gfp_flags); > set_page_owner(page, order, gfp_flags); > } > > diff --git a/mm/page_ext.c b/mm/page_ext.c > index 121dcffc4ec1..ba6dbcacc2db 100644 > --- a/mm/page_ext.c > +++ b/mm/page_ext.c > @@ -7,6 +7,7 @@ > #include <linux/kmemleak.h> > #include <linux/page_owner.h> > #include <linux/page_idle.h> > +#include <linux/xpfo.h> > > /* > * struct page extension > @@ -68,6 +69,9 @@ static struct page_ext_operations *page_ext_ops[] = { > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > &page_idle_ops, > #endif > +#ifdef CONFIG_XPFO > + &page_xpfo_ops, > +#endif > }; > > static unsigned long total_usage; > diff --git a/mm/xpfo.c b/mm/xpfo.c > new file mode 100644 > index 000000000000..8e3a6a694b6a > --- /dev/null > +++ b/mm/xpfo.c > @@ -0,0 +1,206 @@ > +/* > + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P. > + * Copyright (C) 2016 Brown University. All rights reserved. > + * > + * Authors: > + * Juerg Haefliger <juerg.haefliger@hpe.com> > + * Vasileios P. 
Kemerlis <vpk@cs.brown.edu> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License version 2 as published by > + * the Free Software Foundation. > + */ > + > +#include <linux/mm.h> > +#include <linux/module.h> > +#include <linux/page_ext.h> > +#include <linux/xpfo.h> > + > +#include <asm/tlbflush.h> > + > +DEFINE_STATIC_KEY_FALSE(xpfo_inited); > + > +static bool need_xpfo(void) > +{ > + return true; > +} > + > +static void init_xpfo(void) > +{ > + printk(KERN_INFO "XPFO enabled\n"); > + static_branch_enable(&xpfo_inited); > +} > + > +struct page_ext_operations page_xpfo_ops = { > + .need = need_xpfo, > + .init = init_xpfo, > +}; > + > +/* > + * Update a single kernel page table entry > + */ > +static inline void set_kpte(struct page *page, unsigned long kaddr, > + pgprot_t prot) { > + unsigned int level; > + pte_t *kpte = lookup_address(kaddr, &level); > + > + /* We only support 4k pages for now */ > + BUG_ON(!kpte || level != PG_LEVEL_4K); > + > + set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot))); > +} > + > +void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) > +{ > + int i, flush_tlb = 0; > + struct page_ext *page_ext; > + unsigned long kaddr; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + for (i = 0; i < (1 << order); i++) { > + page_ext = lookup_page_ext(page + i); > + > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > + > + /* Initialize the map lock and map counter */ > + if (!page_ext->inited) { > + spin_lock_init(&page_ext->maplock); > + atomic_set(&page_ext->mapcount, 0); > + page_ext->inited = 1; > + } > + BUG_ON(atomic_read(&page_ext->mapcount)); > + > + if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) { > + /* > + * Flush the TLB if the page was previously allocated > + * to the kernel. > + */ > + if (test_and_clear_bit(PAGE_EXT_XPFO_KERNEL, > + &page_ext->flags)) > + flush_tlb = 1; > + } else { > + /* Tag the page as a kernel page */ > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > + } > + } > + > + if (flush_tlb) { > + kaddr = (unsigned long)page_address(page); > + flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * > + PAGE_SIZE); > + } > +} > + > +void xpfo_free_page(struct page *page, int order) > +{ > + int i; > + struct page_ext *page_ext; > + unsigned long kaddr; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + for (i = 0; i < (1 << order); i++) { > + page_ext = lookup_page_ext(page + i); > + > + if (!page_ext->inited) { > + /* > + * The page was allocated before page_ext was > + * initialized, so it is a kernel page and it needs to > + * be tagged accordingly. > + */ > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > + continue; > + } > + > + /* > + * Map the page back into the kernel if it was previously > + * allocated to user space. > + */ > + if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, > + &page_ext->flags)) { > + kaddr = (unsigned long)page_address(page + i); > + set_kpte(page + i, kaddr, __pgprot(__PAGE_KERNEL)); > + } > + } > +} > + > +void xpfo_kmap(void *kaddr, struct page *page) > +{ > + struct page_ext *page_ext; > + unsigned long flags; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + page_ext = lookup_page_ext(page); > + > + /* > + * The page was allocated before page_ext was initialized (which means > + * it's a kernel page) or it's allocated to the kernel, so nothing to > + * do. 
> + */ > + if (!page_ext->inited || > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > + return; > + > + spin_lock_irqsave(&page_ext->maplock, flags); > + > + /* > + * The page was previously allocated to user space, so map it back > + * into the kernel. No TLB flush required. > + */ > + if ((atomic_inc_return(&page_ext->mapcount) == 1) && > + test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)) > + set_kpte(page, (unsigned long)kaddr, __pgprot(__PAGE_KERNEL)); > + > + spin_unlock_irqrestore(&page_ext->maplock, flags); > +} > +EXPORT_SYMBOL(xpfo_kmap); > + > +void xpfo_kunmap(void *kaddr, struct page *page) > +{ > + struct page_ext *page_ext; > + unsigned long flags; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + page_ext = lookup_page_ext(page); > + > + /* > + * The page was allocated before page_ext was initialized (which means > + * it's a kernel page) or it's allocated to the kernel, so nothing to > + * do. > + */ > + if (!page_ext->inited || > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > + return; > + > + spin_lock_irqsave(&page_ext->maplock, flags); > + > + /* > + * The page is to be allocated back to user space, so unmap it from the > + * kernel, flush the TLB and tag it as a user page. > + */ > + if (atomic_dec_return(&page_ext->mapcount) == 0) { > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > + set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags); > + set_kpte(page, (unsigned long)kaddr, __pgprot(0)); > + __flush_tlb_one((unsigned long)kaddr); > + } > + > + spin_unlock_irqrestore(&page_ext->maplock, flags); > +} > +EXPORT_SYMBOL(xpfo_kunmap); > + > +inline bool xpfo_page_is_unmapped(struct page *page) > +{ > + if (!static_branch_unlikely(&xpfo_inited)) > + return false; > + > + return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags); > +} > +EXPORT_SYMBOL(xpfo_page_is_unmapped); > diff --git a/security/Kconfig b/security/Kconfig > index 118f4549404e..4502e15c8419 100644 > --- a/security/Kconfig > +++ b/security/Kconfig > @@ -6,6 +6,25 @@ menu "Security options" > > source security/keys/Kconfig > > +config ARCH_SUPPORTS_XPFO > + bool > + > +config XPFO > + bool "Enable eXclusive Page Frame Ownership (XPFO)" > + default n > + depends on ARCH_SUPPORTS_XPFO > + select PAGE_EXTENSION > + help > + This option offers protection against 'ret2dir' kernel attacks. > + When enabled, every time a page frame is allocated to user space, it > + is unmapped from the direct mapped RAM region in kernel space > + (physmap). Similarly, when a page frame is freed/reclaimed, it is > + mapped back to physmap. > + > + There is a slight performance impact when this option is enabled. > + > + If in doubt, say "N". > + > config SECURITY_DMESG_RESTRICT > bool "Restrict unprivileged access to the kernel syslog" > default n > When a physical page is assigned to a process in user space, it should be unmaped from kernel physmap. From the code, I can see the patch only handle the page in high memory zone. if the kernel use the high memory zone, it will call the kmap. So I would like to know if the physical page is coming from normal zone,how to handle it. Thanks Zhaojunmin
On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote: > This patch adds support for XPFO which protects against 'ret2dir' kernel > attacks. The basic idea is to enforce exclusive ownership of page frames > by either the kernel or userspace, unless explicitly requested by the > kernel. Whenever a page destined for userspace is allocated, it is > unmapped from physmap (the kernel's page table). When such a page is > reclaimed from userspace, it is mapped back to physmap. > > Additional fields in the page_ext struct are used for XPFO housekeeping. > Specifically two flags to distinguish user vs. kernel pages and to tag > unmapped pages and a reference counter to balance kmap/kunmap operations > and a lock to serialize access to the XPFO fields. Thanks for keeping on this! I'd really like to see it land and then get more architectures to support it. > Known issues/limitations: > - Only supports x86-64 (for now) > - Only supports 4k pages (for now) > - There are most likely some legitimate uses cases where the kernel needs > to access userspace which need to be made XPFO-aware > - Performance penalty In the Kconfig you say "slight", but I'm curious what kinds of benchmarks you've done and if there's a more specific cost we can declare, just to give people more of an idea what the hit looks like? (What workloads would trigger a lot of XPFO unmapping, for example?) Thanks! -Kees
On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote: > This patch adds support for XPFO which protects against 'ret2dir' kernel > attacks. The basic idea is to enforce exclusive ownership of page frames > by either the kernel or userspace, unless explicitly requested by the > kernel. Whenever a page destined for userspace is allocated, it is > unmapped from physmap (the kernel's page table). When such a page is > reclaimed from userspace, it is mapped back to physmap. > > Additional fields in the page_ext struct are used for XPFO housekeeping. > Specifically two flags to distinguish user vs. kernel pages and to tag > unmapped pages and a reference counter to balance kmap/kunmap operations > and a lock to serialize access to the XPFO fields. > > Known issues/limitations: > - Only supports x86-64 (for now) > - Only supports 4k pages (for now) > - There are most likely some legitimate uses cases where the kernel needs > to access userspace which need to be made XPFO-aware > - Performance penalty > > Reference paper by the original patch authors: > http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf Would it be possible to create an lkdtm test that can exercise this protection? > Suggested-by: Vasileios P. Kemerlis <vpk@cs.columbia.edu> > Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com> > --- > arch/x86/Kconfig | 3 +- > arch/x86/mm/init.c | 2 +- > drivers/ata/libata-sff.c | 4 +- > include/linux/highmem.h | 15 +++- > include/linux/page_ext.h | 7 ++ > include/linux/xpfo.h | 39 +++++++++ > lib/swiotlb.c | 3 +- > mm/Makefile | 1 + > mm/page_alloc.c | 2 + > mm/page_ext.c | 4 + > mm/xpfo.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++ > security/Kconfig | 19 +++++ > 12 files changed, 298 insertions(+), 7 deletions(-) > create mode 100644 include/linux/xpfo.h > create mode 100644 mm/xpfo.c > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index bada636d1065..38b334f8fde5 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -165,6 +165,7 @@ config X86 > select HAVE_STACK_VALIDATION if X86_64 > select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS > select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS > + select ARCH_SUPPORTS_XPFO if X86_64 > > config INSTRUCTION_DECODER > def_bool y > @@ -1361,7 +1362,7 @@ config ARCH_DMA_ADDR_T_64BIT > > config X86_DIRECT_GBPAGES > def_bool y > - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK > + depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO > ---help--- > Certain kernel features effectively disable kernel > linear 1 GB mappings (even if the CPU otherwise > diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c > index 22af912d66d2..a6fafbae02bb 100644 > --- a/arch/x86/mm/init.c > +++ b/arch/x86/mm/init.c > @@ -161,7 +161,7 @@ static int page_size_mask; > > static void __init probe_page_size_mask(void) > { > -#if !defined(CONFIG_KMEMCHECK) > +#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO) > /* > * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will > * use small pages. > diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c > index 051b6158d1b7..58af734be25d 100644 > --- a/drivers/ata/libata-sff.c > +++ b/drivers/ata/libata-sff.c > @@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? 
"write" : "read"); > > - if (PageHighMem(page)) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > unsigned long flags; > > /* FIXME: use a bounce buffer */ > @@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes) > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read"); > > - if (PageHighMem(page)) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > unsigned long flags; > > /* FIXME: use bounce buffer */ > diff --git a/include/linux/highmem.h b/include/linux/highmem.h > index bb3f3297062a..7a17c166532f 100644 > --- a/include/linux/highmem.h > +++ b/include/linux/highmem.h > @@ -7,6 +7,7 @@ > #include <linux/mm.h> > #include <linux/uaccess.h> > #include <linux/hardirq.h> > +#include <linux/xpfo.h> > > #include <asm/cacheflush.h> > > @@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr) > #ifndef ARCH_HAS_KMAP > static inline void *kmap(struct page *page) > { > + void *kaddr; > + > might_sleep(); > - return page_address(page); > + kaddr = page_address(page); > + xpfo_kmap(kaddr, page); > + return kaddr; > } > > static inline void kunmap(struct page *page) > { > + xpfo_kunmap(page_address(page), page); > } > > static inline void *kmap_atomic(struct page *page) > { > + void *kaddr; > + > preempt_disable(); > pagefault_disable(); > - return page_address(page); > + kaddr = page_address(page); > + xpfo_kmap(kaddr, page); > + return kaddr; > } > #define kmap_atomic_prot(page, prot) kmap_atomic(page) > > static inline void __kunmap_atomic(void *addr) > { > + xpfo_kunmap(addr, virt_to_page(addr)); > pagefault_enable(); > preempt_enable(); > } > diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h > index 9298c393ddaa..0e451a42e5a3 100644 > --- a/include/linux/page_ext.h > +++ b/include/linux/page_ext.h > @@ -29,6 +29,8 @@ enum page_ext_flags { > PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ > PAGE_EXT_DEBUG_GUARD, > PAGE_EXT_OWNER, > + PAGE_EXT_XPFO_KERNEL, /* Page is a kernel page */ > + PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */ > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > PAGE_EXT_YOUNG, > PAGE_EXT_IDLE, > @@ -44,6 +46,11 @@ enum page_ext_flags { > */ > struct page_ext { > unsigned long flags; > +#ifdef CONFIG_XPFO > + int inited; /* Map counter and lock initialized */ > + atomic_t mapcount; /* Counter for balancing map/unmap requests */ > + spinlock_t maplock; /* Lock to serialize map/unmap requests */ > +#endif > }; > > extern void pgdat_page_ext_init(struct pglist_data *pgdat); > diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h > new file mode 100644 > index 000000000000..77187578ca33 > --- /dev/null > +++ b/include/linux/xpfo.h > @@ -0,0 +1,39 @@ > +/* > + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P. > + * Copyright (C) 2016 Brown University. All rights reserved. > + * > + * Authors: > + * Juerg Haefliger <juerg.haefliger@hpe.com> > + * Vasileios P. Kemerlis <vpk@cs.brown.edu> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License version 2 as published by > + * the Free Software Foundation. 
> + */ > + > +#ifndef _LINUX_XPFO_H > +#define _LINUX_XPFO_H > + > +#ifdef CONFIG_XPFO > + > +extern struct page_ext_operations page_xpfo_ops; > + > +extern void xpfo_kmap(void *kaddr, struct page *page); > +extern void xpfo_kunmap(void *kaddr, struct page *page); > +extern void xpfo_alloc_page(struct page *page, int order, gfp_t gfp); > +extern void xpfo_free_page(struct page *page, int order); > + > +extern bool xpfo_page_is_unmapped(struct page *page); > + > +#else /* !CONFIG_XPFO */ > + > +static inline void xpfo_kmap(void *kaddr, struct page *page) { } > +static inline void xpfo_kunmap(void *kaddr, struct page *page) { } > +static inline void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) { } > +static inline void xpfo_free_page(struct page *page, int order) { } > + > +static inline bool xpfo_page_is_unmapped(struct page *page) { return false; } > + > +#endif /* CONFIG_XPFO */ > + > +#endif /* _LINUX_XPFO_H */ > diff --git a/lib/swiotlb.c b/lib/swiotlb.c > index 22e13a0e19d7..455eff44604e 100644 > --- a/lib/swiotlb.c > +++ b/lib/swiotlb.c > @@ -390,8 +390,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, > { > unsigned long pfn = PFN_DOWN(orig_addr); > unsigned char *vaddr = phys_to_virt(tlb_addr); > + struct page *page = pfn_to_page(pfn); > > - if (PageHighMem(pfn_to_page(pfn))) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > /* The buffer does not have a mapping. Map it in and copy */ > unsigned int offset = orig_addr & ~PAGE_MASK; > char *buffer; > diff --git a/mm/Makefile b/mm/Makefile > index 295bd7a9f76b..175680f516aa 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -100,3 +100,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o > obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o > obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o > obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o > +obj-$(CONFIG_XPFO) += xpfo.o > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 8fd42aa7c4bd..100e80e008e2 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -1045,6 +1045,7 @@ static __always_inline bool free_pages_prepare(struct page *page, > kernel_poison_pages(page, 1 << order, 0); > kernel_map_pages(page, 1 << order, 0); > kasan_free_pages(page, order); > + xpfo_free_page(page, order); > > return true; > } > @@ -1745,6 +1746,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, > kernel_map_pages(page, 1 << order, 1); > kernel_poison_pages(page, 1 << order, 1); > kasan_alloc_pages(page, order); > + xpfo_alloc_page(page, order, gfp_flags); > set_page_owner(page, order, gfp_flags); > } > > diff --git a/mm/page_ext.c b/mm/page_ext.c > index 121dcffc4ec1..ba6dbcacc2db 100644 > --- a/mm/page_ext.c > +++ b/mm/page_ext.c > @@ -7,6 +7,7 @@ > #include <linux/kmemleak.h> > #include <linux/page_owner.h> > #include <linux/page_idle.h> > +#include <linux/xpfo.h> > > /* > * struct page extension > @@ -68,6 +69,9 @@ static struct page_ext_operations *page_ext_ops[] = { > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > &page_idle_ops, > #endif > +#ifdef CONFIG_XPFO > + &page_xpfo_ops, > +#endif > }; > > static unsigned long total_usage; > diff --git a/mm/xpfo.c b/mm/xpfo.c > new file mode 100644 > index 000000000000..8e3a6a694b6a > --- /dev/null > +++ b/mm/xpfo.c > @@ -0,0 +1,206 @@ > +/* > + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P. > + * Copyright (C) 2016 Brown University. All rights reserved. > + * > + * Authors: > + * Juerg Haefliger <juerg.haefliger@hpe.com> > + * Vasileios P. 
Kemerlis <vpk@cs.brown.edu> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License version 2 as published by > + * the Free Software Foundation. > + */ > + > +#include <linux/mm.h> > +#include <linux/module.h> > +#include <linux/page_ext.h> > +#include <linux/xpfo.h> > + > +#include <asm/tlbflush.h> > + > +DEFINE_STATIC_KEY_FALSE(xpfo_inited); > + > +static bool need_xpfo(void) > +{ > + return true; > +} > + > +static void init_xpfo(void) > +{ > + printk(KERN_INFO "XPFO enabled\n"); > + static_branch_enable(&xpfo_inited); > +} > + > +struct page_ext_operations page_xpfo_ops = { > + .need = need_xpfo, > + .init = init_xpfo, > +}; > + > +/* > + * Update a single kernel page table entry > + */ > +static inline void set_kpte(struct page *page, unsigned long kaddr, > + pgprot_t prot) { > + unsigned int level; > + pte_t *kpte = lookup_address(kaddr, &level); > + > + /* We only support 4k pages for now */ > + BUG_ON(!kpte || level != PG_LEVEL_4K); > + > + set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot))); > +} > + > +void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) > +{ > + int i, flush_tlb = 0; > + struct page_ext *page_ext; > + unsigned long kaddr; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + for (i = 0; i < (1 << order); i++) { > + page_ext = lookup_page_ext(page + i); > + > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > + > + /* Initialize the map lock and map counter */ > + if (!page_ext->inited) { > + spin_lock_init(&page_ext->maplock); > + atomic_set(&page_ext->mapcount, 0); > + page_ext->inited = 1; > + } > + BUG_ON(atomic_read(&page_ext->mapcount)); > + > + if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) { > + /* > + * Flush the TLB if the page was previously allocated > + * to the kernel. > + */ > + if (test_and_clear_bit(PAGE_EXT_XPFO_KERNEL, > + &page_ext->flags)) > + flush_tlb = 1; > + } else { > + /* Tag the page as a kernel page */ > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > + } > + } > + > + if (flush_tlb) { > + kaddr = (unsigned long)page_address(page); > + flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * > + PAGE_SIZE); > + } > +} > + > +void xpfo_free_page(struct page *page, int order) > +{ > + int i; > + struct page_ext *page_ext; > + unsigned long kaddr; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + for (i = 0; i < (1 << order); i++) { > + page_ext = lookup_page_ext(page + i); > + > + if (!page_ext->inited) { > + /* > + * The page was allocated before page_ext was > + * initialized, so it is a kernel page and it needs to > + * be tagged accordingly. > + */ > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > + continue; > + } > + > + /* > + * Map the page back into the kernel if it was previously > + * allocated to user space. > + */ > + if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, > + &page_ext->flags)) { > + kaddr = (unsigned long)page_address(page + i); > + set_kpte(page + i, kaddr, __pgprot(__PAGE_KERNEL)); > + } > + } > +} > + > +void xpfo_kmap(void *kaddr, struct page *page) > +{ > + struct page_ext *page_ext; > + unsigned long flags; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + page_ext = lookup_page_ext(page); > + > + /* > + * The page was allocated before page_ext was initialized (which means > + * it's a kernel page) or it's allocated to the kernel, so nothing to > + * do. 
> + */ > + if (!page_ext->inited || > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > + return; > + > + spin_lock_irqsave(&page_ext->maplock, flags); > + > + /* > + * The page was previously allocated to user space, so map it back > + * into the kernel. No TLB flush required. > + */ > + if ((atomic_inc_return(&page_ext->mapcount) == 1) && > + test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)) > + set_kpte(page, (unsigned long)kaddr, __pgprot(__PAGE_KERNEL)); > + > + spin_unlock_irqrestore(&page_ext->maplock, flags); > +} > +EXPORT_SYMBOL(xpfo_kmap); > + > +void xpfo_kunmap(void *kaddr, struct page *page) > +{ > + struct page_ext *page_ext; > + unsigned long flags; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + page_ext = lookup_page_ext(page); > + > + /* > + * The page was allocated before page_ext was initialized (which means > + * it's a kernel page) or it's allocated to the kernel, so nothing to > + * do. > + */ > + if (!page_ext->inited || > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > + return; > + > + spin_lock_irqsave(&page_ext->maplock, flags); > + > + /* > + * The page is to be allocated back to user space, so unmap it from the > + * kernel, flush the TLB and tag it as a user page. > + */ > + if (atomic_dec_return(&page_ext->mapcount) == 0) { > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > + set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags); > + set_kpte(page, (unsigned long)kaddr, __pgprot(0)); > + __flush_tlb_one((unsigned long)kaddr); > + } > + > + spin_unlock_irqrestore(&page_ext->maplock, flags); > +} > +EXPORT_SYMBOL(xpfo_kunmap); > + > +inline bool xpfo_page_is_unmapped(struct page *page) > +{ > + if (!static_branch_unlikely(&xpfo_inited)) > + return false; > + > + return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags); > +} > +EXPORT_SYMBOL(xpfo_page_is_unmapped); > diff --git a/security/Kconfig b/security/Kconfig > index 118f4549404e..4502e15c8419 100644 > --- a/security/Kconfig > +++ b/security/Kconfig > @@ -6,6 +6,25 @@ menu "Security options" > > source security/keys/Kconfig > > +config ARCH_SUPPORTS_XPFO > + bool Can you include a "help" section here to describe what requirements an architecture needs to support XPFO? See HAVE_ARCH_SECCOMP_FILTER and HAVE_ARCH_VMAP_STACK or some examples. > +config XPFO > + bool "Enable eXclusive Page Frame Ownership (XPFO)" > + default n > + depends on ARCH_SUPPORTS_XPFO > + select PAGE_EXTENSION > + help > + This option offers protection against 'ret2dir' kernel attacks. > + When enabled, every time a page frame is allocated to user space, it > + is unmapped from the direct mapped RAM region in kernel space > + (physmap). Similarly, when a page frame is freed/reclaimed, it is > + mapped back to physmap. > + > + There is a slight performance impact when this option is enabled. > + > + If in doubt, say "N". > + > config SECURITY_DMESG_RESTRICT > bool "Restrict unprivileged access to the kernel syslog" > default n > -- > 2.10.1 > I've added these patches to my kspp tree on kernel.org, so it should get some 0-day testing now... Thanks! -Kees
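On the lkdtm question, one way such a test could look is: map an anonymous page into user space, fault it in, and then dereference its direct-map (physmap) alias, which should fault once XPFO has unmapped it. A rough sketch under those assumptions (the function is hypothetical and not part of this series; the get_user_pages_fast() signature assumed is the current 4.x one):

#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/err.h>
#include <linux/printk.h>

/*
 * Hypothetical lkdtm-style probe (not part of this series): allocate a
 * user page, then read it through its direct-map (physmap) alias. With
 * CONFIG_XPFO=y the physmap PTE has been cleared, so the read below is
 * expected to fault.
 */
static void xpfo_read_user_page_sketch(void)
{
	unsigned long uaddr;
	struct page *page;
	void *physmap_alias;

	uaddr = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, 0);
	if (IS_ERR_VALUE(uaddr))
		return;

	/* Fault the page in and take a reference (one page, writable). */
	if (get_user_pages_fast(uaddr, 1, 1, &page) != 1)
		goto out_unmap;

	physmap_alias = page_address(page);
	pr_info("lkdtm: reading user page via physmap alias %p\n",
		physmap_alias);

	/* Expected to fault when CONFIG_XPFO=y. */
	pr_info("lkdtm: read back 0x%lx\n", *(unsigned long *)physmap_alias);

	put_page(page);
out_unmap:
	vm_munmap(uaddr, PAGE_SIZE);
}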
Sorry for the late reply, I just found your email in my cluttered inbox. On 11/10/2016 08:11 PM, Kees Cook wrote: > On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote: >> This patch adds support for XPFO which protects against 'ret2dir' kernel >> attacks. The basic idea is to enforce exclusive ownership of page frames >> by either the kernel or userspace, unless explicitly requested by the >> kernel. Whenever a page destined for userspace is allocated, it is >> unmapped from physmap (the kernel's page table). When such a page is >> reclaimed from userspace, it is mapped back to physmap. >> >> Additional fields in the page_ext struct are used for XPFO housekeeping. >> Specifically two flags to distinguish user vs. kernel pages and to tag >> unmapped pages and a reference counter to balance kmap/kunmap operations >> and a lock to serialize access to the XPFO fields. > > Thanks for keeping on this! I'd really like to see it land and then > get more architectures to support it. Good to hear :-) >> Known issues/limitations: >> - Only supports x86-64 (for now) >> - Only supports 4k pages (for now) >> - There are most likely some legitimate uses cases where the kernel needs >> to access userspace which need to be made XPFO-aware >> - Performance penalty > > In the Kconfig you say "slight", but I'm curious what kinds of > benchmarks you've done and if there's a more specific cost we can > declare, just to give people more of an idea what the hit looks like? > (What workloads would trigger a lot of XPFO unmapping, for example?) That 'slight' wording is based on the performance numbers published in the referenced paper. So far I've only run kernel compilation tests. For that workload, the big performance hit comes from disabling >4k page sizes (around 10%). Adding XPFO on top causes 'only' another 0.5% performance penalty. I'm currently looking into adding support for larger page sizes to see what the real impact is and then generate some more relevant numbers. ...Juerg > Thanks! > > -Kees >
On 11/10/2016 08:24 PM, Kees Cook wrote: > On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefliger@hpe.com> wrote: >> This patch adds support for XPFO which protects against 'ret2dir' kernel >> attacks. The basic idea is to enforce exclusive ownership of page frames >> by either the kernel or userspace, unless explicitly requested by the >> kernel. Whenever a page destined for userspace is allocated, it is >> unmapped from physmap (the kernel's page table). When such a page is >> reclaimed from userspace, it is mapped back to physmap. >> >> Additional fields in the page_ext struct are used for XPFO housekeeping. >> Specifically two flags to distinguish user vs. kernel pages and to tag >> unmapped pages and a reference counter to balance kmap/kunmap operations >> and a lock to serialize access to the XPFO fields. >> >> Known issues/limitations: >> - Only supports x86-64 (for now) >> - Only supports 4k pages (for now) >> - There are most likely some legitimate uses cases where the kernel needs >> to access userspace which need to be made XPFO-aware >> - Performance penalty >> >> Reference paper by the original patch authors: >> http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf > > Would it be possible to create an lkdtm test that can exercise this protection? I'll look into it. >> diff --git a/security/Kconfig b/security/Kconfig >> index 118f4549404e..4502e15c8419 100644 >> --- a/security/Kconfig >> +++ b/security/Kconfig >> @@ -6,6 +6,25 @@ menu "Security options" >> >> source security/keys/Kconfig >> >> +config ARCH_SUPPORTS_XPFO >> + bool > > Can you include a "help" section here to describe what requirements an > architecture needs to support XPFO? See HAVE_ARCH_SECCOMP_FILTER and > HAVE_ARCH_VMAP_STACK or some examples. Will do. >> +config XPFO >> + bool "Enable eXclusive Page Frame Ownership (XPFO)" >> + default n >> + depends on ARCH_SUPPORTS_XPFO >> + select PAGE_EXTENSION >> + help >> + This option offers protection against 'ret2dir' kernel attacks. >> + When enabled, every time a page frame is allocated to user space, it >> + is unmapped from the direct mapped RAM region in kernel space >> + (physmap). Similarly, when a page frame is freed/reclaimed, it is >> + mapped back to physmap. >> + >> + There is a slight performance impact when this option is enabled. >> + >> + If in doubt, say "N". >> + >> config SECURITY_DMESG_RESTRICT >> bool "Restrict unprivileged access to the kernel syslog" >> default n > > I've added these patches to my kspp tree on kernel.org, so it should > get some 0-day testing now... Very good. Thanks! > Thanks! Appreciate the feedback. ...Juerg > -Kees >
Hi, I'm trying to give it a spin on arm64, but ... On Fri, Nov 04, 2016 at 03:45:33PM +0100, Juerg Haefliger wrote: > This patch adds support for XPFO which protects against 'ret2dir' kernel > attacks. The basic idea is to enforce exclusive ownership of page frames > by either the kernel or userspace, unless explicitly requested by the > kernel. Whenever a page destined for userspace is allocated, it is > unmapped from physmap (the kernel's page table). When such a page is > reclaimed from userspace, it is mapped back to physmap. > > Additional fields in the page_ext struct are used for XPFO housekeeping. > Specifically two flags to distinguish user vs. kernel pages and to tag > unmapped pages and a reference counter to balance kmap/kunmap operations > and a lock to serialize access to the XPFO fields. > > Known issues/limitations: > - Only supports x86-64 (for now) > - Only supports 4k pages (for now) > - There are most likely some legitimate uses cases where the kernel needs > to access userspace which need to be made XPFO-aware > - Performance penalty > > Reference paper by the original patch authors: > http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf > > Suggested-by: Vasileios P. Kemerlis <vpk@cs.columbia.edu> > Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com> > --- > arch/x86/Kconfig | 3 +- > arch/x86/mm/init.c | 2 +- > drivers/ata/libata-sff.c | 4 +- > include/linux/highmem.h | 15 +++- > include/linux/page_ext.h | 7 ++ > include/linux/xpfo.h | 39 +++++++++ > lib/swiotlb.c | 3 +- > mm/Makefile | 1 + > mm/page_alloc.c | 2 + > mm/page_ext.c | 4 + > mm/xpfo.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++ > security/Kconfig | 19 +++++ > 12 files changed, 298 insertions(+), 7 deletions(-) > create mode 100644 include/linux/xpfo.h > create mode 100644 mm/xpfo.c > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index bada636d1065..38b334f8fde5 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -165,6 +165,7 @@ config X86 > select HAVE_STACK_VALIDATION if X86_64 > select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS > select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS > + select ARCH_SUPPORTS_XPFO if X86_64 > > config INSTRUCTION_DECODER > def_bool y > @@ -1361,7 +1362,7 @@ config ARCH_DMA_ADDR_T_64BIT > > config X86_DIRECT_GBPAGES > def_bool y > - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK > + depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO > ---help--- > Certain kernel features effectively disable kernel > linear 1 GB mappings (even if the CPU otherwise > diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c > index 22af912d66d2..a6fafbae02bb 100644 > --- a/arch/x86/mm/init.c > +++ b/arch/x86/mm/init.c > @@ -161,7 +161,7 @@ static int page_size_mask; > > static void __init probe_page_size_mask(void) > { > -#if !defined(CONFIG_KMEMCHECK) > +#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO) > /* > * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will > * use small pages. > diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c > index 051b6158d1b7..58af734be25d 100644 > --- a/drivers/ata/libata-sff.c > +++ b/drivers/ata/libata-sff.c > @@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? 
"write" : "read"); > > - if (PageHighMem(page)) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > unsigned long flags; > > /* FIXME: use a bounce buffer */ > @@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes) > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read"); > > - if (PageHighMem(page)) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > unsigned long flags; > > /* FIXME: use bounce buffer */ > diff --git a/include/linux/highmem.h b/include/linux/highmem.h > index bb3f3297062a..7a17c166532f 100644 > --- a/include/linux/highmem.h > +++ b/include/linux/highmem.h > @@ -7,6 +7,7 @@ > #include <linux/mm.h> > #include <linux/uaccess.h> > #include <linux/hardirq.h> > +#include <linux/xpfo.h> > > #include <asm/cacheflush.h> > > @@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr) > #ifndef ARCH_HAS_KMAP > static inline void *kmap(struct page *page) > { > + void *kaddr; > + > might_sleep(); > - return page_address(page); > + kaddr = page_address(page); > + xpfo_kmap(kaddr, page); > + return kaddr; > } > > static inline void kunmap(struct page *page) > { > + xpfo_kunmap(page_address(page), page); > } > > static inline void *kmap_atomic(struct page *page) > { > + void *kaddr; > + > preempt_disable(); > pagefault_disable(); > - return page_address(page); > + kaddr = page_address(page); > + xpfo_kmap(kaddr, page); > + return kaddr; > } > #define kmap_atomic_prot(page, prot) kmap_atomic(page) > > static inline void __kunmap_atomic(void *addr) > { > + xpfo_kunmap(addr, virt_to_page(addr)); > pagefault_enable(); > preempt_enable(); > } > diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h > index 9298c393ddaa..0e451a42e5a3 100644 > --- a/include/linux/page_ext.h > +++ b/include/linux/page_ext.h > @@ -29,6 +29,8 @@ enum page_ext_flags { > PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ > PAGE_EXT_DEBUG_GUARD, > PAGE_EXT_OWNER, > + PAGE_EXT_XPFO_KERNEL, /* Page is a kernel page */ > + PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */ > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > PAGE_EXT_YOUNG, > PAGE_EXT_IDLE, > @@ -44,6 +46,11 @@ enum page_ext_flags { > */ > struct page_ext { > unsigned long flags; > +#ifdef CONFIG_XPFO > + int inited; /* Map counter and lock initialized */ > + atomic_t mapcount; /* Counter for balancing map/unmap requests */ > + spinlock_t maplock; /* Lock to serialize map/unmap requests */ > +#endif > }; > > extern void pgdat_page_ext_init(struct pglist_data *pgdat); > diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h > new file mode 100644 > index 000000000000..77187578ca33 > --- /dev/null > +++ b/include/linux/xpfo.h > @@ -0,0 +1,39 @@ > +/* > + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P. > + * Copyright (C) 2016 Brown University. All rights reserved. > + * > + * Authors: > + * Juerg Haefliger <juerg.haefliger@hpe.com> > + * Vasileios P. Kemerlis <vpk@cs.brown.edu> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License version 2 as published by > + * the Free Software Foundation. 
> + */ > + > +#ifndef _LINUX_XPFO_H > +#define _LINUX_XPFO_H > + > +#ifdef CONFIG_XPFO > + > +extern struct page_ext_operations page_xpfo_ops; > + > +extern void xpfo_kmap(void *kaddr, struct page *page); > +extern void xpfo_kunmap(void *kaddr, struct page *page); > +extern void xpfo_alloc_page(struct page *page, int order, gfp_t gfp); > +extern void xpfo_free_page(struct page *page, int order); > + > +extern bool xpfo_page_is_unmapped(struct page *page); > + > +#else /* !CONFIG_XPFO */ > + > +static inline void xpfo_kmap(void *kaddr, struct page *page) { } > +static inline void xpfo_kunmap(void *kaddr, struct page *page) { } > +static inline void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) { } > +static inline void xpfo_free_page(struct page *page, int order) { } > + > +static inline bool xpfo_page_is_unmapped(struct page *page) { return false; } > + > +#endif /* CONFIG_XPFO */ > + > +#endif /* _LINUX_XPFO_H */ > diff --git a/lib/swiotlb.c b/lib/swiotlb.c > index 22e13a0e19d7..455eff44604e 100644 > --- a/lib/swiotlb.c > +++ b/lib/swiotlb.c > @@ -390,8 +390,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, > { > unsigned long pfn = PFN_DOWN(orig_addr); > unsigned char *vaddr = phys_to_virt(tlb_addr); > + struct page *page = pfn_to_page(pfn); > > - if (PageHighMem(pfn_to_page(pfn))) { > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > /* The buffer does not have a mapping. Map it in and copy */ > unsigned int offset = orig_addr & ~PAGE_MASK; > char *buffer; > diff --git a/mm/Makefile b/mm/Makefile > index 295bd7a9f76b..175680f516aa 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -100,3 +100,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o > obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o > obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o > obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o > +obj-$(CONFIG_XPFO) += xpfo.o > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 8fd42aa7c4bd..100e80e008e2 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -1045,6 +1045,7 @@ static __always_inline bool free_pages_prepare(struct page *page, > kernel_poison_pages(page, 1 << order, 0); > kernel_map_pages(page, 1 << order, 0); > kasan_free_pages(page, order); > + xpfo_free_page(page, order); > > return true; > } > @@ -1745,6 +1746,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, > kernel_map_pages(page, 1 << order, 1); > kernel_poison_pages(page, 1 << order, 1); > kasan_alloc_pages(page, order); > + xpfo_alloc_page(page, order, gfp_flags); > set_page_owner(page, order, gfp_flags); > } > > diff --git a/mm/page_ext.c b/mm/page_ext.c > index 121dcffc4ec1..ba6dbcacc2db 100644 > --- a/mm/page_ext.c > +++ b/mm/page_ext.c > @@ -7,6 +7,7 @@ > #include <linux/kmemleak.h> > #include <linux/page_owner.h> > #include <linux/page_idle.h> > +#include <linux/xpfo.h> > > /* > * struct page extension > @@ -68,6 +69,9 @@ static struct page_ext_operations *page_ext_ops[] = { > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > &page_idle_ops, > #endif > +#ifdef CONFIG_XPFO > + &page_xpfo_ops, > +#endif > }; > > static unsigned long total_usage; > diff --git a/mm/xpfo.c b/mm/xpfo.c > new file mode 100644 > index 000000000000..8e3a6a694b6a > --- /dev/null > +++ b/mm/xpfo.c > @@ -0,0 +1,206 @@ > +/* > + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P. > + * Copyright (C) 2016 Brown University. All rights reserved. > + * > + * Authors: > + * Juerg Haefliger <juerg.haefliger@hpe.com> > + * Vasileios P. 
Kemerlis <vpk@cs.brown.edu> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License version 2 as published by > + * the Free Software Foundation. > + */ > + > +#include <linux/mm.h> > +#include <linux/module.h> > +#include <linux/page_ext.h> > +#include <linux/xpfo.h> > + > +#include <asm/tlbflush.h> > + > +DEFINE_STATIC_KEY_FALSE(xpfo_inited); > + > +static bool need_xpfo(void) > +{ > + return true; > +} > + > +static void init_xpfo(void) > +{ > + printk(KERN_INFO "XPFO enabled\n"); > + static_branch_enable(&xpfo_inited); > +} > + > +struct page_ext_operations page_xpfo_ops = { > + .need = need_xpfo, > + .init = init_xpfo, > +}; > + > +/* > + * Update a single kernel page table entry > + */ > +static inline void set_kpte(struct page *page, unsigned long kaddr, > + pgprot_t prot) { > + unsigned int level; > + pte_t *kpte = lookup_address(kaddr, &level); > + > + /* We only support 4k pages for now */ > + BUG_ON(!kpte || level != PG_LEVEL_4K); > + > + set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot))); > +} As lookup_address() and set_pte_atomic() (and PG_LEVEL_4K), are arch-specific, would it be better to put the whole definition into arch-specific part? > + > +void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) > +{ > + int i, flush_tlb = 0; > + struct page_ext *page_ext; > + unsigned long kaddr; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + for (i = 0; i < (1 << order); i++) { > + page_ext = lookup_page_ext(page + i); > + > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > + > + /* Initialize the map lock and map counter */ > + if (!page_ext->inited) { > + spin_lock_init(&page_ext->maplock); > + atomic_set(&page_ext->mapcount, 0); > + page_ext->inited = 1; > + } > + BUG_ON(atomic_read(&page_ext->mapcount)); > + > + if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) { > + /* > + * Flush the TLB if the page was previously allocated > + * to the kernel. > + */ > + if (test_and_clear_bit(PAGE_EXT_XPFO_KERNEL, > + &page_ext->flags)) > + flush_tlb = 1; > + } else { > + /* Tag the page as a kernel page */ > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > + } > + } > + > + if (flush_tlb) { > + kaddr = (unsigned long)page_address(page); > + flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * > + PAGE_SIZE); > + } > +} > + > +void xpfo_free_page(struct page *page, int order) > +{ > + int i; > + struct page_ext *page_ext; > + unsigned long kaddr; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + for (i = 0; i < (1 << order); i++) { > + page_ext = lookup_page_ext(page + i); > + > + if (!page_ext->inited) { > + /* > + * The page was allocated before page_ext was > + * initialized, so it is a kernel page and it needs to > + * be tagged accordingly. > + */ > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > + continue; > + } > + > + /* > + * Map the page back into the kernel if it was previously > + * allocated to user space. > + */ > + if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, > + &page_ext->flags)) { > + kaddr = (unsigned long)page_address(page + i); > + set_kpte(page + i, kaddr, __pgprot(__PAGE_KERNEL)); Why not PAGE_KERNEL? 
> + } > + } > +} > + > +void xpfo_kmap(void *kaddr, struct page *page) > +{ > + struct page_ext *page_ext; > + unsigned long flags; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + page_ext = lookup_page_ext(page); > + > + /* > + * The page was allocated before page_ext was initialized (which means > + * it's a kernel page) or it's allocated to the kernel, so nothing to > + * do. > + */ > + if (!page_ext->inited || > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > + return; > + > + spin_lock_irqsave(&page_ext->maplock, flags); > + > + /* > + * The page was previously allocated to user space, so map it back > + * into the kernel. No TLB flush required. > + */ > + if ((atomic_inc_return(&page_ext->mapcount) == 1) && > + test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)) > + set_kpte(page, (unsigned long)kaddr, __pgprot(__PAGE_KERNEL)); > + > + spin_unlock_irqrestore(&page_ext->maplock, flags); > +} > +EXPORT_SYMBOL(xpfo_kmap); > + > +void xpfo_kunmap(void *kaddr, struct page *page) > +{ > + struct page_ext *page_ext; > + unsigned long flags; > + > + if (!static_branch_unlikely(&xpfo_inited)) > + return; > + > + page_ext = lookup_page_ext(page); > + > + /* > + * The page was allocated before page_ext was initialized (which means > + * it's a kernel page) or it's allocated to the kernel, so nothing to > + * do. > + */ > + if (!page_ext->inited || > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > + return; > + > + spin_lock_irqsave(&page_ext->maplock, flags); > + > + /* > + * The page is to be allocated back to user space, so unmap it from the > + * kernel, flush the TLB and tag it as a user page. > + */ > + if (atomic_dec_return(&page_ext->mapcount) == 0) { > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > + set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags); > + set_kpte(page, (unsigned long)kaddr, __pgprot(0)); > + __flush_tlb_one((unsigned long)kaddr); Again __flush_tlb_one() is x86-specific. flush_tlb_kernel_range() instead? Thanks, -Takahiro AKASHI > + } > + > + spin_unlock_irqrestore(&page_ext->maplock, flags); > +} > +EXPORT_SYMBOL(xpfo_kunmap); > + > +inline bool xpfo_page_is_unmapped(struct page *page) > +{ > + if (!static_branch_unlikely(&xpfo_inited)) > + return false; > + > + return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags); > +} > +EXPORT_SYMBOL(xpfo_page_is_unmapped); > diff --git a/security/Kconfig b/security/Kconfig > index 118f4549404e..4502e15c8419 100644 > --- a/security/Kconfig > +++ b/security/Kconfig > @@ -6,6 +6,25 @@ menu "Security options" > > source security/keys/Kconfig > > +config ARCH_SUPPORTS_XPFO > + bool > + > +config XPFO > + bool "Enable eXclusive Page Frame Ownership (XPFO)" > + default n > + depends on ARCH_SUPPORTS_XPFO > + select PAGE_EXTENSION > + help > + This option offers protection against 'ret2dir' kernel attacks. > + When enabled, every time a page frame is allocated to user space, it > + is unmapped from the direct mapped RAM region in kernel space > + (physmap). Similarly, when a page frame is freed/reclaimed, it is > + mapped back to physmap. > + > + There is a slight performance impact when this option is enabled. > + > + If in doubt, say "N". > + > config SECURITY_DMESG_RESTRICT > bool "Restrict unprivileged access to the kernel syslog" > default n > -- > 2.10.1 >
On 11/24/2016 11:56 AM, AKASHI Takahiro wrote:
> Hi,
>
> I'm trying to give it a spin on arm64, but ...

Thanks for trying this.

>> +/*
>> + * Update a single kernel page table entry
>> + */
>> +static inline void set_kpte(struct page *page, unsigned long kaddr,
>> +			    pgprot_t prot) {
>> +	unsigned int level;
>> +	pte_t *kpte = lookup_address(kaddr, &level);
>> +
>> +	/* We only support 4k pages for now */
>> +	BUG_ON(!kpte || level != PG_LEVEL_4K);
>> +
>> +	set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot)));
>> +}
>
> As lookup_address() and set_pte_atomic() (and PG_LEVEL_4K), are arch-specific,
> would it be better to put the whole definition into arch-specific part?

Well yes but I haven't really looked into splitting up the arch specific stuff.

>> +		/*
>> +		 * Map the page back into the kernel if it was previously
>> +		 * allocated to user space.
>> +		 */
>> +		if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED,
>> +				       &page_ext->flags)) {
>> +			kaddr = (unsigned long)page_address(page + i);
>> +			set_kpte(page + i, kaddr, __pgprot(__PAGE_KERNEL));
>
> Why not PAGE_KERNEL?

Good catch, thanks!

>> +	/*
>> +	 * The page is to be allocated back to user space, so unmap it from the
>> +	 * kernel, flush the TLB and tag it as a user page.
>> +	 */
>> +	if (atomic_dec_return(&page_ext->mapcount) == 0) {
>> +		BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags));
>> +		set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags);
>> +		set_kpte(page, (unsigned long)kaddr, __pgprot(0));
>> +		__flush_tlb_one((unsigned long)kaddr);
>
> Again __flush_tlb_one() is x86-specific.
> flush_tlb_kernel_range() instead?

I'll take a look. If you can tell me what the relevant arm64 equivalents are
for the arch-specific functions, that would help tremendously.

Thanks for the comments!
...Juerg

> Thanks,
> -Takahiro AKASHI
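[Since the reply above asks what the arm64 equivalents might look like: below is a rough, untested sketch of how an arm64-specific set_kpte() could be written from the generic page-table walkers (pgd_offset_k(), pud_offset(), pmd_offset(), pte_offset_kernel()) and arm64's set_pte(). It assumes the linear map is built from 4k page mappings only (no block/section mappings), mirroring the PG_LEVEL_4K restriction in the x86 version, and it is not part of the posted patch. The corresponding TLB invalidation would use the generic flush_tlb_kernel_range() rather than the x86-only __flush_tlb_one().]

/*
 * Rough arm64 sketch of set_kpte(), untested and not part of the patch.
 * Assumes the kernel linear mapping uses 4k pages only (no block
 * mappings), analogous to the x86 PG_LEVEL_4K restriction.
 */
static inline void set_kpte(struct page *page, unsigned long kaddr,
			    pgprot_t prot)
{
	pgd_t *pgd = pgd_offset_k(kaddr);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
	pud = pud_offset(pgd, kaddr);
	BUG_ON(pud_none(*pud) || pud_bad(*pud));
	pmd = pmd_offset(pud, kaddr);
	/* pmd_bad() catches section (block) mappings, which aren't handled */
	BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd));

	pte = pte_offset_kernel(pmd, kaddr);
	set_pte(pte, pfn_pte(page_to_pfn(page), prot));
}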
On Thu, Nov 24, 2016 at 07:56:30PM +0900, AKASHI Takahiro wrote:
> Hi,
>
> I'm trying to give it a spin on arm64, but ...

In my experiment on a HiKey board, the kernel failed to boot, hitting a page
fault in the cache maintenance code:
(a) in __clean_dcache_area_pou() on a 4KB-page kernel,
(b) in __inval_cache_range() on a 64KB-page kernel.
(See the backtraces below for details.)

This is because, on arm64, cache maintenance operations work by VA, in
particular on the direct/linear mapping of physical memory. So I think that
naively unmapping a page from physmap in xpfo_kunmap() won't work well on
arm64.

-Takahiro AKASHI

case (a)
--------
Unable to handle kernel paging request at virtual address ffff800000cba000
pgd = ffff80003ba8c000
*pgd=0000000000000000
task: ffff80003be38000 task.stack: ffff80003be40000
PC is at __clean_dcache_area_pou+0x20/0x38
LR is at sync_icache_aliases+0x2c/0x40
...
Call trace:
...
__clean_dcache_area_pou+0x20/0x38
__sync_icache_dcache+0x6c/0xa8
alloc_set_pte+0x33c/0x588
filemap_map_pages+0x3a8/0x3b8
handle_mm_fault+0x910/0x1080
do_page_fault+0x2b0/0x358
do_mem_abort+0x44/0xa0
el0_ia+0x18/0x1c

case (b)
--------
Unable to handle kernel paging request at virtual address ffff80002aed0000
pgd = ffff000008f40000
*pud=000000003dfc0003, *pmd=000000003dfa0003, *pte=000000002aed0000
task: ffff800028711900 task.stack: ffff800029020000
PC is at __inval_cache_range+0x3c/0x60
LR is at __swiotlb_map_sg_attrs+0x6c/0x98
...
Call trace:
...
__inval_cache_range+0x3c/0x60
dw_mci_pre_dma_transfer.isra.7+0xfc/0x190
dw_mci_pre_req+0x50/0x60
mmc_start_req+0x4c/0x420
mmc_blk_issue_rw_rq+0xb0/0x9b8
mmc_blk_issue_rq+0x154/0x518
mmc_queue_thread+0xac/0x158
kthread+0xd0/0xe8
ret_from_fork+0x10/0x20

>
> On Fri, Nov 04, 2016 at 03:45:33PM +0100, Juerg Haefliger wrote:
> > This patch adds support for XPFO which protects against 'ret2dir' kernel
> > attacks. The basic idea is to enforce exclusive ownership of page frames
> > by either the kernel or userspace, unless explicitly requested by the
> > kernel. Whenever a page destined for userspace is allocated, it is
> > unmapped from physmap (the kernel's page table). When such a page is
> > reclaimed from userspace, it is mapped back to physmap.
> >
> > Additional fields in the page_ext struct are used for XPFO housekeeping.
> > Specifically two flags to distinguish user vs. kernel pages and to tag
> > unmapped pages and a reference counter to balance kmap/kunmap operations
> > and a lock to serialize access to the XPFO fields.
> >
> > Known issues/limitations:
> >   - Only supports x86-64 (for now)
> >   - Only supports 4k pages (for now)
> >   - There are most likely some legitimate uses cases where the kernel needs
> >     to access userspace which need to be made XPFO-aware
> >   - Performance penalty
> >
> > Reference paper by the original patch authors:
> >   http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
> >
> > Suggested-by: Vasileios P.
Kemerlis <vpk@cs.columbia.edu> > > Signed-off-by: Juerg Haefliger <juerg.haefliger@hpe.com> > > --- > > arch/x86/Kconfig | 3 +- > > arch/x86/mm/init.c | 2 +- > > drivers/ata/libata-sff.c | 4 +- > > include/linux/highmem.h | 15 +++- > > include/linux/page_ext.h | 7 ++ > > include/linux/xpfo.h | 39 +++++++++ > > lib/swiotlb.c | 3 +- > > mm/Makefile | 1 + > > mm/page_alloc.c | 2 + > > mm/page_ext.c | 4 + > > mm/xpfo.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++ > > security/Kconfig | 19 +++++ > > 12 files changed, 298 insertions(+), 7 deletions(-) > > create mode 100644 include/linux/xpfo.h > > create mode 100644 mm/xpfo.c > > > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > > index bada636d1065..38b334f8fde5 100644 > > --- a/arch/x86/Kconfig > > +++ b/arch/x86/Kconfig > > @@ -165,6 +165,7 @@ config X86 > > select HAVE_STACK_VALIDATION if X86_64 > > select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS > > select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS > > + select ARCH_SUPPORTS_XPFO if X86_64 > > > > config INSTRUCTION_DECODER > > def_bool y > > @@ -1361,7 +1362,7 @@ config ARCH_DMA_ADDR_T_64BIT > > > > config X86_DIRECT_GBPAGES > > def_bool y > > - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK > > + depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO > > ---help--- > > Certain kernel features effectively disable kernel > > linear 1 GB mappings (even if the CPU otherwise > > diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c > > index 22af912d66d2..a6fafbae02bb 100644 > > --- a/arch/x86/mm/init.c > > +++ b/arch/x86/mm/init.c > > @@ -161,7 +161,7 @@ static int page_size_mask; > > > > static void __init probe_page_size_mask(void) > > { > > -#if !defined(CONFIG_KMEMCHECK) > > +#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO) > > /* > > * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will > > * use small pages. > > diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c > > index 051b6158d1b7..58af734be25d 100644 > > --- a/drivers/ata/libata-sff.c > > +++ b/drivers/ata/libata-sff.c > > @@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) > > > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read"); > > > > - if (PageHighMem(page)) { > > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > > unsigned long flags; > > > > /* FIXME: use a bounce buffer */ > > @@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes) > > > > DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? 
"write" : "read"); > > > > - if (PageHighMem(page)) { > > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > > unsigned long flags; > > > > /* FIXME: use bounce buffer */ > > diff --git a/include/linux/highmem.h b/include/linux/highmem.h > > index bb3f3297062a..7a17c166532f 100644 > > --- a/include/linux/highmem.h > > +++ b/include/linux/highmem.h > > @@ -7,6 +7,7 @@ > > #include <linux/mm.h> > > #include <linux/uaccess.h> > > #include <linux/hardirq.h> > > +#include <linux/xpfo.h> > > > > #include <asm/cacheflush.h> > > > > @@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr) > > #ifndef ARCH_HAS_KMAP > > static inline void *kmap(struct page *page) > > { > > + void *kaddr; > > + > > might_sleep(); > > - return page_address(page); > > + kaddr = page_address(page); > > + xpfo_kmap(kaddr, page); > > + return kaddr; > > } > > > > static inline void kunmap(struct page *page) > > { > > + xpfo_kunmap(page_address(page), page); > > } > > > > static inline void *kmap_atomic(struct page *page) > > { > > + void *kaddr; > > + > > preempt_disable(); > > pagefault_disable(); > > - return page_address(page); > > + kaddr = page_address(page); > > + xpfo_kmap(kaddr, page); > > + return kaddr; > > } > > #define kmap_atomic_prot(page, prot) kmap_atomic(page) > > > > static inline void __kunmap_atomic(void *addr) > > { > > + xpfo_kunmap(addr, virt_to_page(addr)); > > pagefault_enable(); > > preempt_enable(); > > } > > diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h > > index 9298c393ddaa..0e451a42e5a3 100644 > > --- a/include/linux/page_ext.h > > +++ b/include/linux/page_ext.h > > @@ -29,6 +29,8 @@ enum page_ext_flags { > > PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ > > PAGE_EXT_DEBUG_GUARD, > > PAGE_EXT_OWNER, > > + PAGE_EXT_XPFO_KERNEL, /* Page is a kernel page */ > > + PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */ > > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > > PAGE_EXT_YOUNG, > > PAGE_EXT_IDLE, > > @@ -44,6 +46,11 @@ enum page_ext_flags { > > */ > > struct page_ext { > > unsigned long flags; > > +#ifdef CONFIG_XPFO > > + int inited; /* Map counter and lock initialized */ > > + atomic_t mapcount; /* Counter for balancing map/unmap requests */ > > + spinlock_t maplock; /* Lock to serialize map/unmap requests */ > > +#endif > > }; > > > > extern void pgdat_page_ext_init(struct pglist_data *pgdat); > > diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h > > new file mode 100644 > > index 000000000000..77187578ca33 > > --- /dev/null > > +++ b/include/linux/xpfo.h > > @@ -0,0 +1,39 @@ > > +/* > > + * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P. > > + * Copyright (C) 2016 Brown University. All rights reserved. > > + * > > + * Authors: > > + * Juerg Haefliger <juerg.haefliger@hpe.com> > > + * Vasileios P. Kemerlis <vpk@cs.brown.edu> > > + * > > + * This program is free software; you can redistribute it and/or modify it > > + * under the terms of the GNU General Public License version 2 as published by > > + * the Free Software Foundation. 
> > + */ > > + > > +#ifndef _LINUX_XPFO_H > > +#define _LINUX_XPFO_H > > + > > +#ifdef CONFIG_XPFO > > + > > +extern struct page_ext_operations page_xpfo_ops; > > + > > +extern void xpfo_kmap(void *kaddr, struct page *page); > > +extern void xpfo_kunmap(void *kaddr, struct page *page); > > +extern void xpfo_alloc_page(struct page *page, int order, gfp_t gfp); > > +extern void xpfo_free_page(struct page *page, int order); > > + > > +extern bool xpfo_page_is_unmapped(struct page *page); > > + > > +#else /* !CONFIG_XPFO */ > > + > > +static inline void xpfo_kmap(void *kaddr, struct page *page) { } > > +static inline void xpfo_kunmap(void *kaddr, struct page *page) { } > > +static inline void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) { } > > +static inline void xpfo_free_page(struct page *page, int order) { } > > + > > +static inline bool xpfo_page_is_unmapped(struct page *page) { return false; } > > + > > +#endif /* CONFIG_XPFO */ > > + > > +#endif /* _LINUX_XPFO_H */ > > diff --git a/lib/swiotlb.c b/lib/swiotlb.c > > index 22e13a0e19d7..455eff44604e 100644 > > --- a/lib/swiotlb.c > > +++ b/lib/swiotlb.c > > @@ -390,8 +390,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, > > { > > unsigned long pfn = PFN_DOWN(orig_addr); > > unsigned char *vaddr = phys_to_virt(tlb_addr); > > + struct page *page = pfn_to_page(pfn); > > > > - if (PageHighMem(pfn_to_page(pfn))) { > > + if (PageHighMem(page) || xpfo_page_is_unmapped(page)) { > > /* The buffer does not have a mapping. Map it in and copy */ > > unsigned int offset = orig_addr & ~PAGE_MASK; > > char *buffer; > > diff --git a/mm/Makefile b/mm/Makefile > > index 295bd7a9f76b..175680f516aa 100644 > > --- a/mm/Makefile > > +++ b/mm/Makefile > > @@ -100,3 +100,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o > > obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o > > obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o > > obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o > > +obj-$(CONFIG_XPFO) += xpfo.o > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > > index 8fd42aa7c4bd..100e80e008e2 100644 > > --- a/mm/page_alloc.c > > +++ b/mm/page_alloc.c > > @@ -1045,6 +1045,7 @@ static __always_inline bool free_pages_prepare(struct page *page, > > kernel_poison_pages(page, 1 << order, 0); > > kernel_map_pages(page, 1 << order, 0); > > kasan_free_pages(page, order); > > + xpfo_free_page(page, order); > > > > return true; > > } > > @@ -1745,6 +1746,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, > > kernel_map_pages(page, 1 << order, 1); > > kernel_poison_pages(page, 1 << order, 1); > > kasan_alloc_pages(page, order); > > + xpfo_alloc_page(page, order, gfp_flags); > > set_page_owner(page, order, gfp_flags); > > } > > > > diff --git a/mm/page_ext.c b/mm/page_ext.c > > index 121dcffc4ec1..ba6dbcacc2db 100644 > > --- a/mm/page_ext.c > > +++ b/mm/page_ext.c > > @@ -7,6 +7,7 @@ > > #include <linux/kmemleak.h> > > #include <linux/page_owner.h> > > #include <linux/page_idle.h> > > +#include <linux/xpfo.h> > > > > /* > > * struct page extension > > @@ -68,6 +69,9 @@ static struct page_ext_operations *page_ext_ops[] = { > > #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) > > &page_idle_ops, > > #endif > > +#ifdef CONFIG_XPFO > > + &page_xpfo_ops, > > +#endif > > }; > > > > static unsigned long total_usage; > > diff --git a/mm/xpfo.c b/mm/xpfo.c > > new file mode 100644 > > index 000000000000..8e3a6a694b6a > > --- /dev/null > > +++ b/mm/xpfo.c > > @@ -0,0 +1,206 @@ > > +/* > > + * Copyright (C) 
2016 Hewlett Packard Enterprise Development, L.P. > > + * Copyright (C) 2016 Brown University. All rights reserved. > > + * > > + * Authors: > > + * Juerg Haefliger <juerg.haefliger@hpe.com> > > + * Vasileios P. Kemerlis <vpk@cs.brown.edu> > > + * > > + * This program is free software; you can redistribute it and/or modify it > > + * under the terms of the GNU General Public License version 2 as published by > > + * the Free Software Foundation. > > + */ > > + > > +#include <linux/mm.h> > > +#include <linux/module.h> > > +#include <linux/page_ext.h> > > +#include <linux/xpfo.h> > > + > > +#include <asm/tlbflush.h> > > + > > +DEFINE_STATIC_KEY_FALSE(xpfo_inited); > > + > > +static bool need_xpfo(void) > > +{ > > + return true; > > +} > > + > > +static void init_xpfo(void) > > +{ > > + printk(KERN_INFO "XPFO enabled\n"); > > + static_branch_enable(&xpfo_inited); > > +} > > + > > +struct page_ext_operations page_xpfo_ops = { > > + .need = need_xpfo, > > + .init = init_xpfo, > > +}; > > + > > +/* > > + * Update a single kernel page table entry > > + */ > > +static inline void set_kpte(struct page *page, unsigned long kaddr, > > + pgprot_t prot) { > > + unsigned int level; > > + pte_t *kpte = lookup_address(kaddr, &level); > > + > > + /* We only support 4k pages for now */ > > + BUG_ON(!kpte || level != PG_LEVEL_4K); > > + > > + set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot))); > > +} > > As lookup_address() and set_pte_atomic() (and PG_LEVEL_4K), are arch-specific, > would it be better to put the whole definition into arch-specific part? > > > + > > +void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) > > +{ > > + int i, flush_tlb = 0; > > + struct page_ext *page_ext; > > + unsigned long kaddr; > > + > > + if (!static_branch_unlikely(&xpfo_inited)) > > + return; > > + > > + for (i = 0; i < (1 << order); i++) { > > + page_ext = lookup_page_ext(page + i); > > + > > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > > + > > + /* Initialize the map lock and map counter */ > > + if (!page_ext->inited) { > > + spin_lock_init(&page_ext->maplock); > > + atomic_set(&page_ext->mapcount, 0); > > + page_ext->inited = 1; > > + } > > + BUG_ON(atomic_read(&page_ext->mapcount)); > > + > > + if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) { > > + /* > > + * Flush the TLB if the page was previously allocated > > + * to the kernel. > > + */ > > + if (test_and_clear_bit(PAGE_EXT_XPFO_KERNEL, > > + &page_ext->flags)) > > + flush_tlb = 1; > > + } else { > > + /* Tag the page as a kernel page */ > > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > > + } > > + } > > + > > + if (flush_tlb) { > > + kaddr = (unsigned long)page_address(page); > > + flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * > > + PAGE_SIZE); > > + } > > +} > > + > > +void xpfo_free_page(struct page *page, int order) > > +{ > > + int i; > > + struct page_ext *page_ext; > > + unsigned long kaddr; > > + > > + if (!static_branch_unlikely(&xpfo_inited)) > > + return; > > + > > + for (i = 0; i < (1 << order); i++) { > > + page_ext = lookup_page_ext(page + i); > > + > > + if (!page_ext->inited) { > > + /* > > + * The page was allocated before page_ext was > > + * initialized, so it is a kernel page and it needs to > > + * be tagged accordingly. > > + */ > > + set_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags); > > + continue; > > + } > > + > > + /* > > + * Map the page back into the kernel if it was previously > > + * allocated to user space. 
> > + */ > > + if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, > > + &page_ext->flags)) { > > + kaddr = (unsigned long)page_address(page + i); > > + set_kpte(page + i, kaddr, __pgprot(__PAGE_KERNEL)); > > Why not PAGE_KERNEL? > > > + } > > + } > > +} > > + > > +void xpfo_kmap(void *kaddr, struct page *page) > > +{ > > + struct page_ext *page_ext; > > + unsigned long flags; > > + > > + if (!static_branch_unlikely(&xpfo_inited)) > > + return; > > + > > + page_ext = lookup_page_ext(page); > > + > > + /* > > + * The page was allocated before page_ext was initialized (which means > > + * it's a kernel page) or it's allocated to the kernel, so nothing to > > + * do. > > + */ > > + if (!page_ext->inited || > > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > > + return; > > + > > + spin_lock_irqsave(&page_ext->maplock, flags); > > + > > + /* > > + * The page was previously allocated to user space, so map it back > > + * into the kernel. No TLB flush required. > > + */ > > + if ((atomic_inc_return(&page_ext->mapcount) == 1) && > > + test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)) > > + set_kpte(page, (unsigned long)kaddr, __pgprot(__PAGE_KERNEL)); > > + > > + spin_unlock_irqrestore(&page_ext->maplock, flags); > > +} > > +EXPORT_SYMBOL(xpfo_kmap); > > + > > +void xpfo_kunmap(void *kaddr, struct page *page) > > +{ > > + struct page_ext *page_ext; > > + unsigned long flags; > > + > > + if (!static_branch_unlikely(&xpfo_inited)) > > + return; > > + > > + page_ext = lookup_page_ext(page); > > + > > + /* > > + * The page was allocated before page_ext was initialized (which means > > + * it's a kernel page) or it's allocated to the kernel, so nothing to > > + * do. > > + */ > > + if (!page_ext->inited || > > + test_bit(PAGE_EXT_XPFO_KERNEL, &page_ext->flags)) > > + return; > > + > > + spin_lock_irqsave(&page_ext->maplock, flags); > > + > > + /* > > + * The page is to be allocated back to user space, so unmap it from the > > + * kernel, flush the TLB and tag it as a user page. > > + */ > > + if (atomic_dec_return(&page_ext->mapcount) == 0) { > > + BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags)); > > + set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags); > > + set_kpte(page, (unsigned long)kaddr, __pgprot(0)); > > + __flush_tlb_one((unsigned long)kaddr); > > Again __flush_tlb_one() is x86-specific. > flush_tlb_kernel_range() instead? > > Thanks, > -Takahiro AKASHI > > > + } > > + > > + spin_unlock_irqrestore(&page_ext->maplock, flags); > > +} > > +EXPORT_SYMBOL(xpfo_kunmap); > > + > > +inline bool xpfo_page_is_unmapped(struct page *page) > > +{ > > + if (!static_branch_unlikely(&xpfo_inited)) > > + return false; > > + > > + return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags); > > +} > > +EXPORT_SYMBOL(xpfo_page_is_unmapped); > > diff --git a/security/Kconfig b/security/Kconfig > > index 118f4549404e..4502e15c8419 100644 > > --- a/security/Kconfig > > +++ b/security/Kconfig > > @@ -6,6 +6,25 @@ menu "Security options" > > > > source security/keys/Kconfig > > > > +config ARCH_SUPPORTS_XPFO > > + bool > > + > > +config XPFO > > + bool "Enable eXclusive Page Frame Ownership (XPFO)" > > + default n > > + depends on ARCH_SUPPORTS_XPFO > > + select PAGE_EXTENSION > > + help > > + This option offers protection against 'ret2dir' kernel attacks. > > + When enabled, every time a page frame is allocated to user space, it > > + is unmapped from the direct mapped RAM region in kernel space > > + (physmap). 
Similarly, when a page frame is freed/reclaimed, it is > > + mapped back to physmap. > > + > > + There is a slight performance impact when this option is enabled. > > + > > + If in doubt, say "N". > > + > > config SECURITY_DMESG_RESTRICT > > bool "Restrict unprivileged access to the kernel syslog" > > default n > > -- > > 2.10.1 > >
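[To illustrate the failure mode reported above: arm64's I/D-cache synchronization in __sync_icache_dcache() performs cache maintenance on the page's linear-map address, which XPFO may have removed. One possible direction, sketched below, is to take a temporary kernel mapping around the cache maintenance so that xpfo_kmap()/xpfo_kunmap() restore and tear down the PTE as needed. The function name sync_icache_aliases_xpfo() is hypothetical, __clean_dcache_area_pou() and flush_icache_range() are the real arm64 helpers seen in the backtrace, and this is not something the posted patch implements. The DMA path in case (b) would need similar treatment, or bounce buffering, in the swiotlb/dma-mapping code.]

/*
 * Hypothetical sketch only -- not part of the posted patch.  Re-map the
 * page for the duration of the VA-based cache maintenance so that an
 * XPFO-unmapped page does not fault in __clean_dcache_area_pou().
 */
static void sync_icache_aliases_xpfo(struct page *page, unsigned long len)
{
	/* kmap_atomic() goes through xpfo_kmap() and restores the PTE */
	void *kaddr = kmap_atomic(page);

	__clean_dcache_area_pou(kaddr, len);
	flush_icache_range((unsigned long)kaddr, (unsigned long)kaddr + len);

	/* kunmap_atomic() re-unmaps the page if it is user-owned */
	kunmap_atomic(kaddr);
}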