Message ID | 20190129003422.9328-7-rick.p.edgecombe@intel.com (mailing list archive)
---|---
State | New, archived
Series | Merge text_poke fixes and executable lockdowns
On Mon, Jan 28, 2019 at 04:34:08PM -0800, Rick Edgecombe wrote:
> From: Nadav Amit <namit@vmware.com>
>
> text_poke() can potentially compromise the security as it sets temporary

s/the //

> PTEs in the fixmap. These PTEs might be used to rewrite the kernel code
> from other cores accidentally or maliciously, if an attacker gains the
> ability to write onto kernel memory.

Eww, sneaky. That would be a really nasty attack.

> Moreover, since remote TLBs are not flushed after the temporary PTEs are
> removed, the time-window in which the code is writable is not limited if
> the fixmap PTEs - maliciously or accidentally - are cached in the TLB.
> To address these potential security hazards, we use a temporary mm for
> patching the code.
>
> Finally, text_poke() is also not conservative enough when mapping pages,
> as it always tries to map 2 pages, even when a single one is sufficient.
> So try to be more conservative, and do not map more than needed.
>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Nadav Amit <namit@vmware.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
>  arch/x86/include/asm/fixmap.h |   2 -
>  arch/x86/kernel/alternative.c | 106 +++++++++++++++++++++++++++-------
>  arch/x86/xen/mmu_pv.c         |   2 -
>  3 files changed, 84 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
> index 50ba74a34a37..9da8cccdf3fb 100644
> --- a/arch/x86/include/asm/fixmap.h
> +++ b/arch/x86/include/asm/fixmap.h
> @@ -103,8 +103,6 @@ enum fixed_addresses {
>  #ifdef CONFIG_PARAVIRT
>  	FIX_PARAVIRT_BOOTMAP,
>  #endif
> -	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
> -	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */

Two fewer fixmap slots - good riddance. :)

>  #ifdef CONFIG_X86_INTEL_MID
>  	FIX_LNW_VRTC,
>  #endif
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index ae05fbb50171..76d482a2b716 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -11,6 +11,7 @@
>  #include <linux/stop_machine.h>
>  #include <linux/slab.h>
>  #include <linux/kdebug.h>
> +#include <linux/mmu_context.h>
>  #include <asm/text-patching.h>
>  #include <asm/alternative.h>
>  #include <asm/sections.h>
> @@ -683,41 +684,102 @@ __ro_after_init unsigned long poking_addr;
>
>  static void *__text_poke(void *addr, const void *opcode, size_t len)
>  {
> +	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
> +	temporary_mm_state_t prev;
> +	struct page *pages[2] = {NULL};
>  	unsigned long flags;
> -	char *vaddr;
> -	struct page *pages[2];
> -	int i;
> +	pte_t pte, *ptep;
> +	spinlock_t *ptl;
> +	pgprot_t prot;
>
>  	/*
> -	 * While boot memory allocator is runnig we cannot use struct
> -	 * pages as they are not yet initialized.
> +	 * While boot memory allocator is running we cannot use struct pages as
> +	 * they are not yet initialized.
>  	 */
>  	BUG_ON(!after_bootmem);
>
>  	if (!core_kernel_text((unsigned long)addr)) {
>  		pages[0] = vmalloc_to_page(addr);
> -		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
> +		if (cross_page_boundary)
> +			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
>  	} else {
>  		pages[0] = virt_to_page(addr);
>  		WARN_ON(!PageReserved(pages[0]));
> -		pages[1] = virt_to_page(addr + PAGE_SIZE);
> +		if (cross_page_boundary)
> +			pages[1] = virt_to_page(addr + PAGE_SIZE);
>  	}
> -	BUG_ON(!pages[0]);
> +	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

checkpatch fires a lot for this patchset and I think we should tone down
the BUG_ON() use.

WARNING: Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()
#116: FILE: arch/x86/kernel/alternative.c:711:
+	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

While the BUG_ON() below makes sense, this one here could be a WARN_ON()
or so.

Which raises the next question: AFAICT, nothing looks at text_poke*()'s
retval. So why are we even bothering returning something?

> +
>  	local_irq_save(flags);
> -	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
> -	if (pages[1])
> -		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
> -	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
> -	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
> -	clear_fixmap(FIX_TEXT_POKE0);
> -	if (pages[1])
> -		clear_fixmap(FIX_TEXT_POKE1);
> -	local_flush_tlb();
> -	sync_core();
> -	/* Could also do a CLFLUSH here to speed up CPU recovery; but
> -	   that causes hangs on some VIA CPUs. */
> -	for (i = 0; i < len; i++)
> -		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
> +
> +	/*
> +	 * The lock is not really needed, but this allows to avoid open-coding.
> +	 */
> +	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
> +
> +	/*
> +	 * This must not fail; preallocated in poking_init().
> +	 */
> +	VM_BUG_ON(!ptep);
> +
> +	/*
> +	 * flush_tlb_mm_range() would be called when the poking_mm is not
> +	 * loaded. When PCID is in use, the flush would be deferred to the time
> +	 * the poking_mm is loaded again. Set the PTE as non-global to prevent
> +	 * it from being used when we are done.
> +	 */
> +	prot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

So _KERNPG_TABLE | _PAGE_NX as this is a pagetable page, AFAICT.
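To make the suggested alternative concrete, a WARN_ON()-based bail-out at
that spot could look roughly like this (illustrative sketch only; it
presumes callers can cope with a NULL return, which is exactly what the
rest of the thread disputes):

	if (WARN_ON(!pages[0] || (cross_page_boundary && !pages[1]))) {
		/* Recovery path: refuse to patch rather than crash. */
		return NULL;
	}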
On Tue, Feb 05, 2019 at 10:58:53AM +0100, Borislav Petkov wrote:
> > @@ -683,41 +684,102 @@ __ro_after_init unsigned long poking_addr;
> >
> >  static void *__text_poke(void *addr, const void *opcode, size_t len)
> >  {
> > +	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
> > +	temporary_mm_state_t prev;
> > +	struct page *pages[2] = {NULL};
> >  	unsigned long flags;
> > -	char *vaddr;
> > -	struct page *pages[2];
> > -	int i;
> > +	pte_t pte, *ptep;
> > +	spinlock_t *ptl;
> > +	pgprot_t prot;
> >
> >  	/*
> > -	 * While boot memory allocator is runnig we cannot use struct
> > -	 * pages as they are not yet initialized.
> > +	 * While boot memory allocator is running we cannot use struct pages as
> > +	 * they are not yet initialized.
> >  	 */
> >  	BUG_ON(!after_bootmem);
> >
> >  	if (!core_kernel_text((unsigned long)addr)) {
> >  		pages[0] = vmalloc_to_page(addr);
> > -		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
> > +		if (cross_page_boundary)
> > +			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
> >  	} else {
> >  		pages[0] = virt_to_page(addr);
> >  		WARN_ON(!PageReserved(pages[0]));
> > -		pages[1] = virt_to_page(addr + PAGE_SIZE);
> > +		if (cross_page_boundary)
> > +			pages[1] = virt_to_page(addr + PAGE_SIZE);
> >  	}
> > -	BUG_ON(!pages[0]);
> > +	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
>
> checkpatch fires a lot for this patchset and I think we should tone down
> the BUG_ON() use.

I've been pushing for BUG_ON() in this patch set; sod checkpatch.

Maybe not this BUG_ON() in particular, but a number of them introduced
here are really situations where we can't do anything sane. This
BUG_ON() in particular is the choice between corrupted text or an
instantly dead machine; what would you do?

In general, text_poke() cannot fail:

 - suppose changing a single jump label requires poking multiple sites
   (not uncommon), we fail halfway through and then have to undo the
   first pokes, but those pokes fail again.

 - this then leaves us no way forward and no way back, we've got
   inconsistent text state -> FAIL.

So even an 'early' fail (like here) doesn't work in the rollback
scenario if you combine them.

So while in general I agree with BUG_ON() being undesirable, I think
liberal sprinkling in text_poke() is fine; you really _REALLY_ want this
to work or fail loudly. Text corruption is just painful.
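To make the rollback hazard concrete, a sketch of the failure mode
(struct poke_entry and try_text_poke() are hypothetical stand-ins for
illustration, not the actual jump-label code):

	struct poke_entry {
		void		*addr;
		const void	*old, *new;
		size_t		len;
	};

	/* Hypothetical: patch all sites of one jump label, undoing on failure. */
	static int poke_sites(struct poke_entry *e, int nr)
	{
		int i;

		for (i = 0; i < nr; i++)
			if (try_text_poke(e[i].addr, e[i].new, e[i].len))
				goto rollback;
		return 0;

	rollback:
		while (i--)
			/*
			 * If an undo poke fails as well, the text is left
			 * half-patched: no way forward, no way back.
			 */
			if (try_text_poke(e[i].addr, e[i].old, e[i].len))
				return -EFAULT;
		return -EAGAIN;
	}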
On Tue, Feb 05, 2019 at 12:31:46PM +0100, Peter Zijlstra wrote:
> ...
>
> So while in general I agree with BUG_ON() being undesirable, I think
> liberal sprinkling in text_poke() is fine; you really _REALLY_ want this
> to work or fail loudly. Text corruption is just painful.

Ok. It would be good to have the gist of this sentiment in a comment
above it so that it is absolutely clear why we're doing it.

And since text_poke() can't fail, it doesn't need a retval either.
AFAICT, nothing is actually using it.
On Tue, Feb 05, 2019 at 01:35:33PM +0100, Borislav Petkov wrote:
> On Tue, Feb 05, 2019 at 12:31:46PM +0100, Peter Zijlstra wrote:
> > ...
> >
> > So while in general I agree with BUG_ON() being undesirable, I think
> > liberal sprinkling in text_poke() is fine; you really _REALLY_ want this
> > to work or fail loudly. Text corruption is just painful.
>
> Ok. It would be good to have the gist of this sentiment in a comment
> above it so that it is absolutely clear why we're doing it.
>
> And since text_poke() can't fail, it doesn't need a retval either.
> AFAICT, nothing is actually using it.

See patch 12, which removes the return value (after fixing the few users
that currently 'rely' on it).
On Tue, Feb 05, 2019 at 12:31:46PM +0100, Peter Zijlstra wrote:
> In general, text_poke() cannot fail:
>
> - suppose changing a single jump label requires poking multiple sites
>   (not uncommon), we fail halfway through and then have to undo the
>   first pokes, but those pokes fail again.
>
> - this then leaves us no way forward and no way back, we've got
>   inconsistent text state -> FAIL.

Note that this exact fail scenario still exists in the CPU hotplug code.
See kernel/cpu.c:cpuhp_thread_fun():

	/*
	 * If we fail on a rollback, we're up a creek without no
	 * paddle, no way forward, no way back. We loose, thanks for
	 * playing.
	 */
	WARN_ON_ONCE(st->rollback);
> On Feb 5, 2019, at 4:35 AM, Borislav Petkov <bp@alien8.de> wrote:
>
> On Tue, Feb 05, 2019 at 12:31:46PM +0100, Peter Zijlstra wrote:
>> ...
>>
>> So while in general I agree with BUG_ON() being undesirable, I think
>> liberal sprinkling in text_poke() is fine; you really _REALLY_ want this
>> to work or fail loudly. Text corruption is just painful.
>
> Ok. It would be good to have the gist of this sentiment in a comment
> above it so that it is absolutely clear why we're doing it.

I added a short comment for v3 above each BUG_ON().

> And since text_poke() can't fail, it doesn't need a retval either.
> AFAICT, nothing is actually using it.

As Peter said, this is addressed in a separate patch (one patch per
logical change).
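Presumably along these lines (illustrative wording only, not necessarily
the exact v3 comment):

	/*
	 * text_poke() must not fail: leaving the kernel text half-patched
	 * is unrecoverable, so fail loudly instead.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));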
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 50ba74a34a37..9da8cccdf3fb 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -103,8 +103,6 @@ enum fixed_addresses {
 #ifdef CONFIG_PARAVIRT
 	FIX_PARAVIRT_BOOTMAP,
 #endif
-	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
-	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
 #ifdef CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 #endif
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ae05fbb50171..76d482a2b716 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -11,6 +11,7 @@
 #include <linux/stop_machine.h>
 #include <linux/slab.h>
 #include <linux/kdebug.h>
+#include <linux/mmu_context.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -683,41 +684,102 @@ __ro_after_init unsigned long poking_addr;
 
 static void *__text_poke(void *addr, const void *opcode, size_t len)
 {
+	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
+	temporary_mm_state_t prev;
+	struct page *pages[2] = {NULL};
 	unsigned long flags;
-	char *vaddr;
-	struct page *pages[2];
-	int i;
+	pte_t pte, *ptep;
+	spinlock_t *ptl;
+	pgprot_t prot;
 
 	/*
-	 * While boot memory allocator is runnig we cannot use struct
-	 * pages as they are not yet initialized.
+	 * While boot memory allocator is running we cannot use struct pages as
+	 * they are not yet initialized.
 	 */
 	BUG_ON(!after_bootmem);
 
 	if (!core_kernel_text((unsigned long)addr)) {
 		pages[0] = vmalloc_to_page(addr);
-		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+		if (cross_page_boundary)
+			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
 	} else {
 		pages[0] = virt_to_page(addr);
 		WARN_ON(!PageReserved(pages[0]));
-		pages[1] = virt_to_page(addr + PAGE_SIZE);
+		if (cross_page_boundary)
+			pages[1] = virt_to_page(addr + PAGE_SIZE);
 	}
-	BUG_ON(!pages[0]);
+	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
+
 	local_irq_save(flags);
-	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
-	if (pages[1])
-		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
-	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
-	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
-	clear_fixmap(FIX_TEXT_POKE0);
-	if (pages[1])
-		clear_fixmap(FIX_TEXT_POKE1);
-	local_flush_tlb();
-	sync_core();
-	/* Could also do a CLFLUSH here to speed up CPU recovery; but
-	   that causes hangs on some VIA CPUs. */
-	for (i = 0; i < len; i++)
-		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+
+	/*
+	 * The lock is not really needed, but this allows to avoid open-coding.
+	 */
+	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+
+	/*
+	 * This must not fail; preallocated in poking_init().
+	 */
+	VM_BUG_ON(!ptep);
+
+	/*
+	 * flush_tlb_mm_range() would be called when the poking_mm is not
+	 * loaded. When PCID is in use, the flush would be deferred to the time
+	 * the poking_mm is loaded again. Set the PTE as non-global to prevent
+	 * it from being used when we are done.
+	 */
+	prot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
+
+	pte = mk_pte(pages[0], prot);
+	set_pte_at(poking_mm, poking_addr, ptep, pte);
+
+	if (cross_page_boundary) {
+		pte = mk_pte(pages[1], prot);
+		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+	}
+
+	/*
+	 * Loading the temporary mm behaves as a compiler barrier, which
+	 * guarantees that the PTE will be set at the time memcpy() is done.
+	 */
+	prev = use_temporary_mm(poking_mm);
+
+	kasan_disable_current();
+	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+	kasan_enable_current();
+
+	/*
+	 * Ensure that the PTE is only cleared after the instructions of memcpy
+	 * were issued by using a compiler barrier.
+	 */
+	barrier();
+
+	pte_clear(poking_mm, poking_addr, ptep);
+	if (cross_page_boundary)
+		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+
+	/*
+	 * Loading the previous page-table hierarchy requires a serializing
+	 * instruction that already allows the core to see the updated version.
+	 * Xen-PV is assumed to serialize execution in a similar manner.
+	 */
+	unuse_temporary_mm(prev);
+
+	/*
+	 * Flushing the TLB might involve IPIs, which would require enabled
+	 * IRQs, but not if the mm is not used, as it is in this point.
+	 */
+	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
+			   PAGE_SHIFT, false);
+
+	pte_unmap_unlock(ptep, ptl);
+	/*
+	 * If the text doesn't match what we just wrote; something is
+	 * fundamentally screwy, there's nothing we can really do about that.
+	 */
+	BUG_ON(memcmp(addr, opcode, len));
+
 	local_irq_restore(flags);
 	return addr;
 }
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 0f4fe206dcc2..82b181fcefe5 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2319,8 +2319,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #elif defined(CONFIG_X86_VSYSCALL_EMULATION)
 	case VSYSCALL_PAGE:
 #endif
-	case FIX_TEXT_POKE0:
-	case FIX_TEXT_POKE1:
 		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;
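For context, the temporary-mm helpers used above come from an earlier
patch in this series; conceptually they look roughly like this
(simplified sketch, assuming a switch_mm_irqs_off()-based implementation
as described in the thread, not the exact series code):

	typedef struct {
		struct mm_struct *mm;
	} temporary_mm_state_t;

	/*
	 * Switch this CPU to 'mm'. Must be called with IRQs disabled so the
	 * temporary mapping stays CPU-local and cannot leak via a context
	 * switch.
	 */
	static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
	{
		temporary_mm_state_t state;

		lockdep_assert_irqs_disabled();
		state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
		switch_mm_irqs_off(NULL, mm, current);
		return state;
	}

	static inline void unuse_temporary_mm(temporary_mm_state_t prev)
	{
		lockdep_assert_irqs_disabled();
		switch_mm_irqs_off(NULL, prev.mm, current);
	}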