
[v2,05/20] x86/alternative: initializing temporary mm for patching

Message ID 20190129003422.9328-6-rick.p.edgecombe@intel.com (mailing list archive)
State New, archived
Series: Merge text_poke fixes and executable lockdowns

Commit Message

Rick Edgecombe Jan. 29, 2019, 12:34 a.m. UTC
From: Nadav Amit <namit@vmware.com>

To prevent improper use of the PTEs that are used for text patching, we
want to use a temporary mm struct. We initailize it by copying the init
mm.

The address that will be used for patching is taken from the lower area
that is usually used for the task memory. Doing so prevents the need to
frequently synchronize the temporary-mm (e.g., when BPF programs are
installed), since different PGDs are used for the task memory.

Finally, we randomize the address of the PTEs to harden against exploits
that use these PTEs.

Cc: Kees Cook <keescook@chromium.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Tested-by: Masami Hiramatsu <mhiramat@kernel.org>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
 arch/x86/include/asm/pgtable.h       |  3 +++
 arch/x86/include/asm/text-patching.h |  2 ++
 arch/x86/kernel/alternative.c        |  3 +++
 arch/x86/mm/init_64.c                | 36 ++++++++++++++++++++++++++++
 init/main.c                          |  3 +++
 5 files changed, 47 insertions(+)
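
For context, poking_mm and poking_addr set up here are consumed by the
text_poke() rework later in this series. A rough, illustrative sketch of that
flow (simplified and paraphrased, not part of this patch; error handling,
cross-page writes and the memory-ordering comments are omitted, and the
function name is made up for this sketch):

/*
 * Illustrative sketch only: roughly how later patches in the series use
 * poking_mm/poking_addr to patch kernel text through a temporary mapping.
 */
static void *text_poke_sketch(void *addr, const void *opcode, size_t len)
{
	struct page *page = virt_to_page(addr);	/* assumes a direct-map address */
	temp_mm_state_t prev;
	unsigned long flags;
	spinlock_t *ptl;
	pte_t *ptep;

	/* The PTE and page-table pages were pre-allocated in poking_init(). */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	local_irq_save(flags);

	/* Map the target page writable at the randomized poking address. */
	set_pte_at(poking_mm, poking_addr, ptep, mk_pte(page, PAGE_KERNEL));

	/* The mapping is visible only on this CPU, via the temporary mm. */
	prev = use_temporary_mm(poking_mm);
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);

	/* Tear the mapping down again before anything else can use it. */
	pte_clear(poking_mm, poking_addr, ptep);
	unuse_temporary_mm(prev);
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + PAGE_SIZE,
			   PAGE_SHIFT, false);

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}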

Comments

Borislav Petkov Feb. 5, 2019, 9:18 a.m. UTC | #1
Just nitpicks:

Subject: [PATCH v2 05/20] x86/alternative: initializing temporary mm for patching

s/initailizing/Initialize/

On Mon, Jan 28, 2019 at 04:34:07PM -0800, Rick Edgecombe wrote:
> From: Nadav Amit <namit@vmware.com>
> 
> To prevent improper use of the PTEs that are used for text patching, we
> want to use a temporary mm struct. We initailize it by copying the init

Please remove the "we" from commit messages and use impartial, passive
formulations.

Thx.
Nadav Amit Feb. 11, 2019, 12:39 a.m. UTC | #2
> On Jan 28, 2019, at 4:34 PM, Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
> 
> From: Nadav Amit <namit@vmware.com>
> 
> To prevent improper use of the PTEs that are used for text patching, we
> want to use a temporary mm struct. We initailize it by copying the init
> mm.
> 
> The address that will be used for patching is taken from the lower area
> that is usually used for the task memory. Doing so prevents the need to
> frequently synchronize the temporary-mm (e.g., when BPF programs are
> installed), since different PGDs are used for the task memory.
> 
> Finally, we randomize the address of the PTEs to harden against exploits
> that use these PTEs.
> 
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
> Tested-by: Masami Hiramatsu <mhiramat@kernel.org>
> Suggested-by: Andy Lutomirski <luto@kernel.org>
> Signed-off-by: Nadav Amit <namit@vmware.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> arch/x86/include/asm/pgtable.h       |  3 +++
> arch/x86/include/asm/text-patching.h |  2 ++
> arch/x86/kernel/alternative.c        |  3 +++
> arch/x86/mm/init_64.c                | 36 ++++++++++++++++++++++++++++
> init/main.c                          |  3 +++
> 5 files changed, 47 insertions(+)
> 
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 40616e805292..e8f630d9a2ed 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void)
> 	/* Default trampoline pgd value */
> 	trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
> }
> +
> +void __init poking_init(void);
> +
> # ifdef CONFIG_RANDOMIZE_MEMORY
> void __meminit init_trampoline(void);
> # else
> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
> index f8fc8e86cf01..a75eed841eed 100644
> --- a/arch/x86/include/asm/text-patching.h
> +++ b/arch/x86/include/asm/text-patching.h
> @@ -39,5 +39,7 @@ extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
> extern int poke_int3_handler(struct pt_regs *regs);
> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
> extern int after_bootmem;
> +extern __ro_after_init struct mm_struct *poking_mm;
> +extern __ro_after_init unsigned long poking_addr;
> 
> #endif /* _ASM_X86_TEXT_PATCHING_H */
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 12fddbc8c55b..ae05fbb50171 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -678,6 +678,9 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
> 	return addr;
> }
> 
> +__ro_after_init struct mm_struct *poking_mm;
> +__ro_after_init unsigned long poking_addr;
> +
> static void *__text_poke(void *addr, const void *opcode, size_t len)
> {
> 	unsigned long flags;
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index bccff68e3267..125c8c48aa24 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -53,6 +53,7 @@
> #include <asm/init.h>
> #include <asm/uv/uv.h>
> #include <asm/setup.h>
> +#include <asm/text-patching.h>
> 
> #include "mm_internal.h"
> 
> @@ -1383,6 +1384,41 @@ unsigned long memory_block_size_bytes(void)
> 	return memory_block_size_probed;
> }
> 
> +/*
> + * Initialize an mm_struct to be used during poking and a pointer to be used
> + * during patching.
> + */
> +void __init poking_init(void)
> +{
> +	spinlock_t *ptl;
> +	pte_t *ptep;
> +
> +	poking_mm = copy_init_mm();
> +	BUG_ON(!poking_mm);
> +
> +	/*
> +	 * Randomize the poking address, but make sure that the following page
> +	 * will be mapped at the same PMD. We need 2 pages, so find space for 3,
> +	 * and adjust the address if the PMD ends after the first one.
> +	 */
> +	poking_addr = TASK_UNMAPPED_BASE;
> +	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
> +		poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
> +			(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
> +
> +	if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
> +		poking_addr += PAGE_SIZE;

Further thinking about it, I think that allocating the virtual address for
poking from user address-range is problematic. The user can set watchpoints
on different addresses, cause some static-keys to be enabled/disabled, and
monitor the signals to derandomize the poking address.

Andy, I think you were pushing this change. Can I go back to use a vmalloc’d
address instead, or do you have a better solution? I prefer not to
save/restore DR7, of course.
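
For background on the watchpoint concern: a tracer can program a tracee's
hardware debug registers through the standard ptrace() debug-register
interface, so candidate addresses in the user range can be watched cheaply.
A minimal userspace sketch follows; it is illustrative only, and "guess" is a
hypothetical candidate for the randomized poking address:

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/*
 * Illustrative sketch: arm a 1-byte write watchpoint on a guessed address in
 * a traced child.
 */
static long set_write_watchpoint(pid_t pid, unsigned long guess)
{
	/* DR0: linear address to watch. */
	if (ptrace(PTRACE_POKEUSER, pid,
		   offsetof(struct user, u_debugreg[0]), guess))
		return -1;

	/*
	 * DR7: local-enable breakpoint 0 (bit 0), R/W0 = 01 (break on data
	 * writes), LEN0 = 00 (1 byte).
	 */
	return ptrace(PTRACE_POKEUSER, pid,
		      offsetof(struct user, u_debugreg[7]),
		      0x1UL | (0x1UL << 16));
}

Whether a watchpoint armed this way actually observes the kernel's poke
depends on the DR handling discussed below, which is what the follow-up
patch in this thread addresses.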
Andy Lutomirski Feb. 11, 2019, 5:18 a.m. UTC | #3
On Feb 10, 2019, at 4:39 PM, Nadav Amit <nadav.amit@gmail.com> wrote:

>> On Jan 28, 2019, at 4:34 PM, Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
>> 
>> [ full quote of the commit message and patch trimmed; see #2 above ]
>> 
>> +    /*
>> +     * Randomize the poking address, but make sure that the following page
>> +     * will be mapped at the same PMD. We need 2 pages, so find space for 3,
>> +     * and adjust the address if the PMD ends after the first one.
>> +     */
>> +    poking_addr = TASK_UNMAPPED_BASE;
>> +    if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
>> +        poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
>> +            (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
>> +
>> +    if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
>> +        poking_addr += PAGE_SIZE;
> 
> Further thinking about it, I think that allocating the virtual address for
> poking from user address-range is problematic. The user can set watchpoints
> on different addresses, cause some static-keys to be enabled/disabled, and
> monitor the signals to derandomize the poking address.
> 

Hmm, I hadn’t thought about watchpoints. I’m not sure how much we care about possible derandomization like this, but we certainly don’t want to send signals or otherwise malfunction.

> Andy, I think you were pushing this change. Can I go back to use a vmalloc’d
> address instead, or do you have a better solution?

Hmm. If we use a vmalloc address, we have to make sure it’s not actually allocated. I suppose we could allocate one once at boot and use that.  We also have the problem that the usual APIs for handling “user” addresses might assume they’re actually in the user range, although this seems unlikely to be a problem in practice.  More seriously, though, the code that manipulates per-mm paging structures assumes that *all* of the structures up to the top level are per-mm, and, if we use anything less than a private pgd, this isn’t the case.

> I prefer not to
> save/restore DR7, of course.
> 

I suspect we may want to use the temporary mm concept for EFI, too, so we may want to just suck it up and save/restore DR7.  But only if a watchpoint is in use, of course. I have an old patch I could dust off that tracks DR7 to make things like this efficient.
Nadav Amit Feb. 11, 2019, 6:04 p.m. UTC | #4
> On Feb 10, 2019, at 9:18 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> 
> 
> 
> On Feb 10, 2019, at 4:39 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> 
>>> On Jan 28, 2019, at 4:34 PM, Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
>>> 
>>> [ full quote of the commit message and patch trimmed; see #2 above ]
>>> 
>>> +    /*
>>> +     * Randomize the poking address, but make sure that the following page
>>> +     * will be mapped at the same PMD. We need 2 pages, so find space for 3,
>>> +     * and adjust the address if the PMD ends after the first one.
>>> +     */
>>> +    poking_addr = TASK_UNMAPPED_BASE;
>>> +    if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
>>> +        poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
>>> +            (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
>>> +
>>> +    if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
>>> +        poking_addr += PAGE_SIZE;
>> 
>> Further thinking about it, I think that allocating the virtual address for
>> poking from user address-range is problematic. The user can set watchpoints
>> on different addresses, cause some static-keys to be enabled/disabled, and
>> monitor the signals to derandomize the poking address.
> 
> Hmm, I hadn’t thought about watchpoints. I’m not sure how much we care
> about possible derandomization like this, but we certainly don’t want to
> send signals or otherwise malfunction.
> 
>> Andy, I think you were pushing this change. Can I go back to use a vmalloc’d
>> address instead, or do you have a better solution?
> 
> Hmm. If we use a vmalloc address, we have to make sure it’s not actually
> allocated. I suppose we could allocate one once at boot and use that. We
> also have the problem that the usual APIs for handling “user” addresses
> might assume they’re actually in the user range, although this seems
> unlikely to be a problem in practice. More seriously, though, the code
> that manipulates per-mm paging structures assumes that *all* of the
> structures up to the top level are per-mm, and, if we use anything less
> than a private pgd, this isn’t the case.

I forgot that I only had this conversation in my mind ;-)

Well, I did write some code that kept some vmalloc’d area private, and it
did require more synchronization between the pgd’s. It is still possible
to use another top-level PGD, but … (continued below)

> 
>> I prefer not to
>> save/restore DR7, of course.
> 
> I suspect we may want to use the temporary mm concept for EFI, too, so we
> may want to just suck it up and save/restore DR7. But only if a watchpoint
> is in use, of course. I have an old patch I could dust off that tracks DR7
> to make things like this efficient.

… but, if this is the case, then I will just make (un)use_temporary_mm()
save/restore DR7. I guess you are ok with such a solution. I will
incorporate it into Rick’s v3.
Andy Lutomirski Feb. 11, 2019, 7:07 p.m. UTC | #5
On Mon, Feb 11, 2019 at 10:05 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Feb 10, 2019, at 9:18 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >
> >
> >
> > On Feb 10, 2019, at 4:39 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >
> >>> On Jan 28, 2019, at 4:34 PM, Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
> >>>
> >>> [ full quote of the commit message and patch trimmed; see #2 above ]
> >>>
> >>> +    /*
> >>> +     * Randomize the poking address, but make sure that the following page
> >>> +     * will be mapped at the same PMD. We need 2 pages, so find space for 3,
> >>> +     * and adjust the address if the PMD ends after the first one.
> >>> +     */
> >>> +    poking_addr = TASK_UNMAPPED_BASE;
> >>> +    if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
> >>> +        poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
> >>> +            (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
> >>> +
> >>> +    if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
> >>> +        poking_addr += PAGE_SIZE;
> >>
> >> Further thinking about it, I think that allocating the virtual address for
> >> poking from user address-range is problematic. The user can set watchpoints
> >> on different addresses, cause some static-keys to be enabled/disabled, and
> >> monitor the signals to derandomize the poking address.
> >
> > Hmm, I hadn’t thought about watchpoints. I’m not sure how much we care
> > about possible derandomization like this, but we certainly don’t want to
> > send signals or otherwise malfunction.
> >
> >> Andy, I think you were pushing this change. Can I go back to use a vmalloc’d
> >> address instead, or do you have a better solution?
> >
> > Hmm. If we use a vmalloc address, we have to make sure it’s not actually
> > allocated. I suppose we could allocate one once at boot and use that. We
> > also have the problem that the usual APIs for handling “user” addresses
> > might assume they’re actually in the user range, although this seems
> > unlikely to be a problem in practice. More seriously, though, the code
> > that manipulates per-mm paging structures assumes that *all* of the
> > structures up to the top level are per-mm, and, if we use anything less
> > than a private pgd, this isn’t the case.
>
> I forgot that I only had this conversation in my mind ;-)
>
> Well, I did write some code that kept some vmalloc’d area private, and it
> did require more synchronization between the pgd’s. It is still possible
> to use another top-level PGD, but … (continued below)
>
> >
> >> I prefer not to
> >> save/restore DR7, of course.
> >
> > I suspect we may want to use the temporary mm concept for EFI, too, so we
> > may want to just suck it up and save/restore DR7. But only if a watchpoint
> > is in use, of course. I have an old patch I could dust off that tracks DR7
> > to make things like this efficient.
>
> … but, if this is the case, then I will just make (un)use_temporary_mm()
> save/restore DR7. I guess you are ok with such a solution. I will
> incorporate it into Rick’s v3.
>

I'm certainly amenable to other solutions, but this one does seem the
least messy.  I looked at my old patch, and it doesn't do what you
want.  I'd suggest you just add a percpu variable like cpu_dr7 and rig
up some accessors so that it stays up to date.  Then you can skip the
dr7 writes if there are no watchpoints set.

Also, EFI is probably a less interesting example than rare_write.
With rare_write, especially the dynamically allocated variants that
people keep coming up with, we'll need a swath of address space fully
as large as the vmalloc area. and getting *that* right while still
using the kernel address range might be more of a mess than we really
want to deal with.

--Andy
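
For reference, the x86 hw_breakpoint code already keeps the kind of per-CPU
DR7 shadow Andy describes, and Nadav's follow-up patch below builds on it.
The sketch here paraphrases that shadow from memory (not a verbatim copy of
asm/debugreg.h), and the local_write_dr7() accessor is a hypothetical example
of the "rig up some accessors" idea, not existing kernel code:

/* Paraphrased sketch of the existing per-CPU DR7 shadow. */
DECLARE_PER_CPU(unsigned long, cpu_dr7);

static inline bool hw_breakpoint_active(void)
{
	/* Any enable bit set in the shadow means a watchpoint is armed. */
	return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
}

/*
 * Hypothetical accessor: keep the shadow in sync whenever DR7 is written,
 * so DR7 writes can be skipped entirely when no watchpoint is set.
 */
static inline void local_write_dr7(unsigned long dr7)
{
	this_cpu_write(cpu_dr7, dr7);
	set_debugreg(dr7, 7);
}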
Nadav Amit Feb. 11, 2019, 7:18 p.m. UTC | #6
> On Feb 11, 2019, at 11:07 AM, Andy Lutomirski <luto@kernel.org> wrote:
> 
> I'm certainly amenable to other solutions, but this one does seem the
> least messy.  I looked at my old patch, and it doesn't do what you
> want.  I'd suggest you just add a percpu variable like cpu_dr7 and rig
> up some accessors so that it stays up to date.  Then you can skip the
> dr7 writes if there are no watchpoints set.
> 
> Also, EFI is probably a less interesting example than rare_write.
> With rare_write, especially the dynamically allocated variants that
> people keep coming up with, we'll need a swath of address space fully
> as large as the vmalloc area. and getting *that* right while still
> using the kernel address range might be more of a mess than we really
> want to deal with.

As long as you feel comfortable with this solution, I’m fine with it.

Here is what I have (untested). I prefer to save/restore all the DRs,
because IIRC DR6 indications are updated even if breakpoints are disabled
(in DR7). And anyhow, that is the standard interface.


-- >8 --

From: Nadav Amit <namit@vmware.com>
Date: Mon, 11 Feb 2019 03:07:08 -0800
Subject: [PATCH] mm: save DRs when loading temporary mm

Signed-off-by: Nadav Amit <namit@vmware.com>
---
 arch/x86/include/asm/mmu_context.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d684b954f3c0..4f92ec3df149 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/mpx.h>
+#include <asm/debugreg.h>
 
 extern atomic64_t last_mm_ctx_id;
 
@@ -358,6 +359,7 @@ static inline unsigned long __get_current_cr3_fast(void)
 
 typedef struct {
 	struct mm_struct *prev;
+	unsigned short bp_enabled : 1;
 } temp_mm_state_t;
 
 /*
@@ -380,6 +382,15 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
 	lockdep_assert_irqs_disabled();
 	state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	switch_mm_irqs_off(NULL, mm, current);
+
+	/*
+	 * If breakpoints are enabled, disable them while the temporary mm is
+	 * used - they do not belong and might cause wrong signals or crashes.
+	 */
+	state.bp_enabled = hw_breakpoint_active();
+	if (state.bp_enabled)
+		hw_breakpoint_disable();
+
 	return state;
 }
 
@@ -387,6 +398,13 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev)
 {
 	lockdep_assert_irqs_disabled();
 	switch_mm_irqs_off(NULL, prev.prev, current);
+
+	/*
+	 * Restore the breakpoints if they were disabled before the temporary mm
+	 * was loaded.
+	 */
+	if (prev.bp_enabled)
+		hw_breakpoint_restore();
 }
 
 #endif /* _ASM_X86_MMU_CONTEXT_H */
Andy Lutomirski Feb. 11, 2019, 10:47 p.m. UTC | #7
On Mon, Feb 11, 2019 at 11:18 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Feb 11, 2019, at 11:07 AM, Andy Lutomirski <luto@kernel.org> wrote:
> >
> > I'm certainly amenable to other solutions, but this one does seem the
> > least messy.  I looked at my old patch, and it doesn't do what you
> > want.  I'd suggest you just add a percpu variable like cpu_dr7 and rig
> > up some accessors so that it stays up to date.  Then you can skip the
> > dr7 writes if there are no watchpoints set.
> >
> > Also, EFI is probably a less interesting example than rare_write.
> > With rare_write, especially the dynamically allocated variants that
> > people keep coming up with, we'll need a swath of address space fully
> > as large as the vmalloc area. and getting *that* right while still
> > using the kernel address range might be more of a mess than we really
> > want to deal with.
>
> As long as you feel comfortable with this solution, I’m fine with it.
>
> Here is what I have (untested). I prefer to save/restore all the DRs,
> because IIRC DR6 indications are updated even if breakpoints are disabled
> (in DR7). And anyhow, that is the standard interface.

Seems reasonable, but:

>
>
> -- >8 --
>
> From: Nadav Amit <namit@vmware.com>
> Date: Mon, 11 Feb 2019 03:07:08 -0800
> Subject: [PATCH] mm: save DRs when loading temporary mm
>
> Signed-off-by: Nadav Amit <namit@vmware.com>
> ---
>  arch/x86/include/asm/mmu_context.h | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
>
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index d684b954f3c0..4f92ec3df149 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -13,6 +13,7 @@
>  #include <asm/tlbflush.h>
>  #include <asm/paravirt.h>
>  #include <asm/mpx.h>
> +#include <asm/debugreg.h>
>
>  extern atomic64_t last_mm_ctx_id;
>
> @@ -358,6 +359,7 @@ static inline unsigned long __get_current_cr3_fast(void)
>
>  typedef struct {
>         struct mm_struct *prev;
> +       unsigned short bp_enabled : 1;
>  } temp_mm_state_t;
>
>  /*
> @@ -380,6 +382,15 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
>         lockdep_assert_irqs_disabled();
>         state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>         switch_mm_irqs_off(NULL, mm, current);
> +
> +       /*
> +        * If breakpoints are enabled, disable them while the temporary mm is
> +        * used - they do not belong and might cause wrong signals or crashes.
> +        */

Maybe clarify this?  Add some mention that the specific problem is
that user code could set a watchpoint on an address that is also used
in the temporary mm.

Arguably we should not disable *kernel* breakpoints a la perf, but
that seems like quite a minor issue, at least as long as
use_temporary_mm() doesn't get wider use.  But a comment that this
also disables perf breakpoints and that this could be undesirable
might be in order as well.
Nadav Amit Feb. 12, 2019, 6:23 p.m. UTC | #8
> On Feb 11, 2019, at 2:47 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
> On Mon, Feb 11, 2019 at 11:18 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>> 
>> +
>> +       /*
>> +        * If breakpoints are enabled, disable them while the temporary mm is
>> +        * used - they do not belong and might cause wrong signals or crashes.
>> +        */
> 
> Maybe clarify this?  Add some mention that the specific problem is
> that user code could set a watchpoint on an address that is also used
> in the temporary mm.
> 
> Arguably we should not disable *kernel* breakpoints a la perf, but
> that seems like quite a minor issue, at least as long as
> use_temporary_mm() doesn't get wider use.  But a comment that this
> also disables perf breakpoints and that this could be undesirable
> might be in order as well.

I think that in the future there may also be security benefits for disabling
breakpoints when you are in a sensitive code-block, for instance when you
poke text, to prevent the control flow from being hijacked (by exploiting a
bug in the debug exception handler). Some additional steps need to be taken
for that to be beneficial so I leave it out of the comment for now.

Anyhow, how about this:

-- >8 --

From: Nadav Amit <namit@vmware.com>
Date: Mon, 11 Feb 2019 03:07:08 -0800
Subject: [PATCH] x86/mm: Save DRs when loading a temporary mm

Prevent user watchpoints from mistakenly firing while the temporary mm
is being used. As the addresses of the temporary mm might overlap those of
the user process, this is necessary to prevent wrong signals or worse things
from happening.

Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 arch/x86/include/asm/mmu_context.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d684b954f3c0..0d6c72ece750 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/mpx.h>
+#include <asm/debugreg.h>
 
 extern atomic64_t last_mm_ctx_id;
 
@@ -358,6 +359,7 @@ static inline unsigned long __get_current_cr3_fast(void)
 
 typedef struct {
 	struct mm_struct *prev;
+	unsigned short bp_enabled : 1;
 } temp_mm_state_t;
 
 /*
@@ -380,6 +382,22 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
 	lockdep_assert_irqs_disabled();
 	state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	switch_mm_irqs_off(NULL, mm, current);
+
+	/*
+	 * If breakpoints are enabled, disable them while the temporary mm is
+	 * used. Userspace might set up watchpoints on addresses that are used
+	 * in the temporary mm, which would lead to wrong signals being sent or
+	 * crashes.
+	 *
+	 * Note that breakpoints are not disabled selectively, which also causes
+	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
+	 * undesirable, but still seems reasonable as the code that runs in the
+	 * temporary mm should be short.
+	 */
+	state.bp_enabled = hw_breakpoint_active();
+	if (state.bp_enabled)
+		hw_breakpoint_disable();
+
 	return state;
 }
 
@@ -387,6 +405,13 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev)
 {
 	lockdep_assert_irqs_disabled();
 	switch_mm_irqs_off(NULL, prev.prev, current);
+
+	/*
+	 * Restore the breakpoints if they were disabled before the temporary mm
+	 * was loaded.
+	 */
+	if (prev.bp_enabled)
+		hw_breakpoint_restore();
 }
 
 #endif /* _ASM_X86_MMU_CONTEXT_H */

Patch

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e805292..e8f630d9a2ed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void)
 	/* Default trampoline pgd value */
 	trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
 }
+
+void __init poking_init(void);
+
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
 # else
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index f8fc8e86cf01..a75eed841eed 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -39,5 +39,7 @@ extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
 extern int after_bootmem;
+extern __ro_after_init struct mm_struct *poking_mm;
+extern __ro_after_init unsigned long poking_addr;
 
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 12fddbc8c55b..ae05fbb50171 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -678,6 +678,9 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
 	return addr;
 }
 
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
 static void *__text_poke(void *addr, const void *opcode, size_t len)
 {
 	unsigned long flags;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bccff68e3267..125c8c48aa24 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@ 
 #include <asm/init.h>
 #include <asm/uv/uv.h>
 #include <asm/setup.h>
+#include <asm/text-patching.h>
 
 #include "mm_internal.h"
 
@@ -1383,6 +1384,41 @@ unsigned long memory_block_size_bytes(void)
 	return memory_block_size_probed;
 }
 
+/*
+ * Initialize an mm_struct to be used during poking and a pointer to be used
+ * during patching.
+ */
+void __init poking_init(void)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	poking_mm = copy_init_mm();
+	BUG_ON(!poking_mm);
+
+	/*
+	 * Randomize the poking address, but make sure that the following page
+	 * will be mapped at the same PMD. We need 2 pages, so find space for 3,
+	 * and adjust the address if the PMD ends after the first one.
+	 */
+	poking_addr = TASK_UNMAPPED_BASE;
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+		poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
+			(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
+
+	if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
+		poking_addr += PAGE_SIZE;
+
+	/*
+	 * We need to trigger the allocation of the page-tables that will be
+	 * needed for poking now. Later, poking may be performed in an atomic
+	 * section, which might cause allocation to fail.
+	 */
+	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+	BUG_ON(!ptep);
+	pte_unmap_unlock(ptep, ptl);
+}
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/init/main.c b/init/main.c
index e2e80ca3165a..f5947ba53bb4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -496,6 +496,8 @@ void __init __weak thread_stack_cache_init(void)
 
 void __init __weak mem_encrypt_init(void) { }
 
+void __init __weak poking_init(void) { }
+
 bool initcall_debug;
 core_param(initcall_debug, initcall_debug, bool, 0644);
 
@@ -730,6 +732,7 @@ asmlinkage __visible void __init start_kernel(void)
 	taskstats_init_early();
 	delayacct_init();
 
+	poking_init();
 	check_bugs();
 
 	acpi_subsystem_init();