diff mbox series

[v3,4/6] powerpc: Introduce temporary mm

Message ID 20200827052659.24922-5-cmr@codefail.de (mailing list archive)
State New, archived
Headers show
Series Use per-CPU temporary mappings for patching | expand

Commit Message

Christopher M. Riedl Aug. 27, 2020, 5:26 a.m. UTC
x86 supports the notion of a temporary mm which restricts access to
temporary PTEs to a single CPU. A temporary mm is useful for situations
where a CPU needs to perform sensitive operations (such as patching a
STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
said mappings to other CPUs. A side benefit is that other CPU TLBs do
not need to be flushed when the temporary mm is torn down.

Mappings in the temporary mm can be set in the userspace portion of the
address-space.

Interrupts must be disabled while the temporary mm is in use. HW
breakpoints, which may have been set by userspace as watchpoints on
addresses now within the temporary mm, are saved and disabled when
loading the temporary mm. The HW breakpoints are restored when unloading
the temporary mm. All HW breakpoints are indiscriminately disabled while
the temporary mm is in use.

Based on x86 implementation:

commit cefa929c034e
("x86/mm: Introduce temporary mm structs")

Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
---
 arch/powerpc/include/asm/debug.h |  1 +
 arch/powerpc/kernel/process.c    |  5 +++
 arch/powerpc/lib/code-patching.c | 65 ++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+)

Comments

Jann Horn Aug. 27, 2020, 2:15 p.m. UTC | #1
On Thu, Aug 27, 2020 at 7:24 AM Christopher M. Riedl <cmr@codefail.de> wrote:
> x86 supports the notion of a temporary mm which restricts access to
> temporary PTEs to a single CPU. A temporary mm is useful for situations
> where a CPU needs to perform sensitive operations (such as patching a
> STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
> said mappings to other CPUs. A side benefit is that other CPU TLBs do
> not need to be flushed when the temporary mm is torn down.
>
> Mappings in the temporary mm can be set in the userspace portion of the
> address-space.
[...]
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
[...]
> @@ -44,6 +45,70 @@ int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
>  }
>
>  #ifdef CONFIG_STRICT_KERNEL_RWX
> +
> +struct temp_mm {
> +       struct mm_struct *temp;
> +       struct mm_struct *prev;
> +       bool is_kernel_thread;
> +       struct arch_hw_breakpoint brk[HBP_NUM_MAX];
> +};
> +
> +static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm)
> +{
> +       temp_mm->temp = mm;
> +       temp_mm->prev = NULL;
> +       temp_mm->is_kernel_thread = false;
> +       memset(&temp_mm->brk, 0, sizeof(temp_mm->brk));
> +}
> +
> +static inline void use_temporary_mm(struct temp_mm *temp_mm)
> +{
> +       lockdep_assert_irqs_disabled();
> +
> +       temp_mm->is_kernel_thread = current->mm == NULL;

(That's a somewhat misleading variable name - kernel threads can have
a non-NULL ->mm, too.)

> +       if (temp_mm->is_kernel_thread)
> +               temp_mm->prev = current->active_mm;
> +       else
> +               temp_mm->prev = current->mm;

Why the branch? Shouldn't current->active_mm work in both cases?


> +       /*
> +        * Hash requires a non-NULL current->mm to allocate a userspace address
> +        * when handling a page fault. Does not appear to hurt in Radix either.
> +        */
> +       current->mm = temp_mm->temp;

This looks dangerous to me. There are various places that attempt to
find all userspace tasks that use a given mm by iterating through all
tasks on the system and comparing each task's ->mm pointer to
current's. Things like current_is_single_threaded() as part of various
security checks, mm_update_next_owner(), zap_threads(), and so on. So
if this is reachable from userspace task context (which I think it
is?), I don't think we're allowed to switch out the ->mm pointer here.


> +       switch_mm_irqs_off(NULL, temp_mm->temp, current);

switch_mm_irqs_off() calls switch_mmu_context(), which in the nohash
implementation increments next->context.active and decrements
prev->context.active if prev is non-NULL, right? So this would
increase temp_mm->temp->context.active...

> +       if (ppc_breakpoint_available()) {
> +               struct arch_hw_breakpoint null_brk = {0};
> +               int i = 0;
> +
> +               for (; i < nr_wp_slots(); ++i) {
> +                       __get_breakpoint(i, &temp_mm->brk[i]);
> +                       if (temp_mm->brk[i].type != 0)
> +                               __set_breakpoint(i, &null_brk);
> +               }
> +       }
> +}
> +
> +static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
> +{
> +       lockdep_assert_irqs_disabled();
> +
> +       if (temp_mm->is_kernel_thread)
> +               current->mm = NULL;
> +       else
> +               current->mm = temp_mm->prev;
> +       switch_mm_irqs_off(NULL, temp_mm->prev, current);

... whereas this would increase temp_mm->prev->context.active. As far
as I can tell, that'll mean that both the original mm and the patching
mm will have their .active counts permanently too high after
use_temporary_mm()+unuse_temporary_mm()?

> +       if (ppc_breakpoint_available()) {
> +               int i = 0;
> +
> +               for (; i < nr_wp_slots(); ++i)
> +                       if (temp_mm->brk[i].type != 0)
> +                               __set_breakpoint(i, &temp_mm->brk[i]);
> +       }
> +}
Christopher M. Riedl Sept. 7, 2020, 12:15 a.m. UTC | #2
On Thu Aug 27, 2020 at 11:15 AM CDT, Jann Horn wrote:
> On Thu, Aug 27, 2020 at 7:24 AM Christopher M. Riedl <cmr@codefail.de>
> wrote:
> > x86 supports the notion of a temporary mm which restricts access to
> > temporary PTEs to a single CPU. A temporary mm is useful for situations
> > where a CPU needs to perform sensitive operations (such as patching a
> > STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
> > said mappings to other CPUs. A side benefit is that other CPU TLBs do
> > not need to be flushed when the temporary mm is torn down.
> >
> > Mappings in the temporary mm can be set in the userspace portion of the
> > address-space.
> [...]
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> [...]
> > @@ -44,6 +45,70 @@ int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
> >  }
> >
> >  #ifdef CONFIG_STRICT_KERNEL_RWX
> > +
> > +struct temp_mm {
> > +       struct mm_struct *temp;
> > +       struct mm_struct *prev;
> > +       bool is_kernel_thread;
> > +       struct arch_hw_breakpoint brk[HBP_NUM_MAX];
> > +};
> > +
> > +static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm)
> > +{
> > +       temp_mm->temp = mm;
> > +       temp_mm->prev = NULL;
> > +       temp_mm->is_kernel_thread = false;
> > +       memset(&temp_mm->brk, 0, sizeof(temp_mm->brk));
> > +}
> > +
> > +static inline void use_temporary_mm(struct temp_mm *temp_mm)
> > +{
> > +       lockdep_assert_irqs_disabled();
> > +
> > +       temp_mm->is_kernel_thread = current->mm == NULL;
>
> (That's a somewhat misleading variable name - kernel threads can have
> a non-NULL ->mm, too.)
>

Oh I didn't know that, in that case yes this is not a good name. I am
considering some changes (based on your comments about current->mm
below) which would make this variable superfluous.

> > +       if (temp_mm->is_kernel_thread)
> > +               temp_mm->prev = current->active_mm;
> > +       else
> > +               temp_mm->prev = current->mm;
>
> Why the branch? Shouldn't current->active_mm work in both cases?
>
>

Yes you are correct.

> > +       /*
> > +        * Hash requires a non-NULL current->mm to allocate a userspace address
> > +        * when handling a page fault. Does not appear to hurt in Radix either.
> > +        */
> > +       current->mm = temp_mm->temp;
>
> This looks dangerous to me. There are various places that attempt to
> find all userspace tasks that use a given mm by iterating through all
> tasks on the system and comparing each task's ->mm pointer to
> current's. Things like current_is_single_threaded() as part of various
> security checks, mm_update_next_owner(), zap_threads(), and so on. So
> if this is reachable from userspace task context (which I think it
> is?), I don't think we're allowed to switch out the ->mm pointer here.
>
>

Thanks for pointing this out! I took a step back and looked at this
again in more detail. The only reason for reassigning the ->mm pointer
is that when patching we need to hash the page and allocate an SLB 
entry w/ the hash MMU. That codepath includes a check to ensure that
->mm is not NULL. Overwriting ->mm temporarily and restoring it is
pretty crappy in retrospect. I _think_ a better approach is to just call
the hashing and allocate SLB functions from `map_patch` directly - this
both removes the need to overwrite ->mm (since the functions take an mm
parameter) and it avoids taking two exceptions when doing the actual
patching.

This works fine on Power9 and a Power8 at least but needs some testing
on PPC32 before I can send a v4.

> > +       switch_mm_irqs_off(NULL, temp_mm->temp, current);
>
> switch_mm_irqs_off() calls switch_mmu_context(), which in the nohash
> implementation increments next->context.active and decrements
> prev->context.active if prev is non-NULL, right? So this would
> increase temp_mm->temp->context.active...
>
> > +       if (ppc_breakpoint_available()) {
> > +               struct arch_hw_breakpoint null_brk = {0};
> > +               int i = 0;
> > +
> > +               for (; i < nr_wp_slots(); ++i) {
> > +                       __get_breakpoint(i, &temp_mm->brk[i]);
> > +                       if (temp_mm->brk[i].type != 0)
> > +                               __set_breakpoint(i, &null_brk);
> > +               }
> > +       }
> > +}
> > +
> > +static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
> > +{
> > +       lockdep_assert_irqs_disabled();
> > +
> > +       if (temp_mm->is_kernel_thread)
> > +               current->mm = NULL;
> > +       else
> > +               current->mm = temp_mm->prev;
> > +       switch_mm_irqs_off(NULL, temp_mm->prev, current);
>
> ... whereas this would increase temp_mm->prev->context.active. As far
> as I can tell, that'll mean that both the original mm and the patching
> mm will have their .active counts permanently too high after
> use_temporary_mm()+unuse_temporary_mm()?
>

Yes you are correct. Hmm, I can't immediately recall why prev=NULL here,
and I can't find anything in the various powerpc
switch_mm_irqs_off/switch_mmu_context implementations that would break
by setting prev=actual previous mm here. I will fix this for v4. Thanks!

> > +       if (ppc_breakpoint_available()) {
> > +               int i = 0;
> > +
> > +               for (; i < nr_wp_slots(); ++i)
> > +                       if (temp_mm->brk[i].type != 0)
> > +                               __set_breakpoint(i, &temp_mm->brk[i]);
> > +       }
> > +}
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index ec57daf87f40..827350c9bcf3 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -46,6 +46,7 @@  static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
 void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk);
+void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk);
 bool ppc_breakpoint_available(void);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 016bd831908e..0758a8db6342 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -843,6 +843,11 @@  static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk)
 	return 0;
 }
 
+void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk)
+{
+	memcpy(brk, this_cpu_ptr(&current_brk[nr]), sizeof(*brk));
+}
+
 void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
 {
 	memcpy(this_cpu_ptr(&current_brk[nr]), brk, sizeof(*brk));
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 85d3fdca9452..89b37ece6d2f 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -17,6 +17,7 @@ 
 #include <asm/code-patching.h>
 #include <asm/setup.h>
 #include <asm/inst.h>
+#include <asm/mmu_context.h>
 
 static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr,
 			       struct ppc_inst *patch_addr)
@@ -44,6 +45,70 @@  int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
 }
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
+
+struct temp_mm {
+	struct mm_struct *temp;
+	struct mm_struct *prev;
+	bool is_kernel_thread;
+	struct arch_hw_breakpoint brk[HBP_NUM_MAX];
+};
+
+static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm)
+{
+	temp_mm->temp = mm;
+	temp_mm->prev = NULL;
+	temp_mm->is_kernel_thread = false;
+	memset(&temp_mm->brk, 0, sizeof(temp_mm->brk));
+}
+
+static inline void use_temporary_mm(struct temp_mm *temp_mm)
+{
+	lockdep_assert_irqs_disabled();
+
+	temp_mm->is_kernel_thread = current->mm == NULL;
+	if (temp_mm->is_kernel_thread)
+		temp_mm->prev = current->active_mm;
+	else
+		temp_mm->prev = current->mm;
+
+	/*
+	 * Hash requires a non-NULL current->mm to allocate a userspace address
+	 * when handling a page fault. Does not appear to hurt in Radix either.
+	 */
+	current->mm = temp_mm->temp;
+	switch_mm_irqs_off(NULL, temp_mm->temp, current);
+
+	if (ppc_breakpoint_available()) {
+		struct arch_hw_breakpoint null_brk = {0};
+		int i = 0;
+
+		for (; i < nr_wp_slots(); ++i) {
+			__get_breakpoint(i, &temp_mm->brk[i]);
+			if (temp_mm->brk[i].type != 0)
+				__set_breakpoint(i, &null_brk);
+		}
+	}
+}
+
+static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
+{
+	lockdep_assert_irqs_disabled();
+
+	if (temp_mm->is_kernel_thread)
+		current->mm = NULL;
+	else
+		current->mm = temp_mm->prev;
+	switch_mm_irqs_off(NULL, temp_mm->prev, current);
+
+	if (ppc_breakpoint_available()) {
+		int i = 0;
+
+		for (; i < nr_wp_slots(); ++i)
+			if (temp_mm->brk[i].type != 0)
+				__set_breakpoint(i, &temp_mm->brk[i]);
+	}
+}
+
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
 
 #ifdef CONFIG_LKDTM