diff mbox series

[RFC,v2,17/19] heki: x86: Update permissions counters during text patching

Message ID 20231113022326.24388-18-mic@digikod.net (mailing list archive)
State Handled Elsewhere
Headers show
Series Hypervisor-Enforced Kernel Integrity | expand

Commit Message

Mickaël Salaün Nov. 13, 2023, 2:23 a.m. UTC
From: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>

X86 uses a function called __text_poke() to modify executable code. This
patching function is used by many features such as KProbes and FTrace.

Update the permissions counters for the text page so that write
permissions can be temporarily established in the EPT to modify the
instructions in that page.

Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
Cc: Mickaël Salaün <mic@digikod.net>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
---

Changes since v1:
* New patch
---
 arch/x86/kernel/alternative.c |  5 ++++
 arch/x86/mm/heki.c            | 49 +++++++++++++++++++++++++++++++++++
 include/linux/heki.h          | 14 ++++++++++
 3 files changed, 68 insertions(+)

Comments

Peter Zijlstra Nov. 13, 2023, 8:19 a.m. UTC | #1
On Sun, Nov 12, 2023 at 09:23:24PM -0500, Mickaël Salaün wrote:
> From: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
> 
> X86 uses a function called __text_poke() to modify executable code. This
> patching function is used by many features such as KProbes and FTrace.
> 
> Update the permissions counters for the text page so that write
> permissions can be temporarily established in the EPT to modify the
> instructions in that page.
> 
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: H. Peter Anvin <hpa@zytor.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
> Cc: Mickaël Salaün <mic@digikod.net>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Sean Christopherson <seanjc@google.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
> Cc: Wanpeng Li <wanpengli@tencent.com>
> Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
> ---
> 
> Changes since v1:
> * New patch
> ---
>  arch/x86/kernel/alternative.c |  5 ++++
>  arch/x86/mm/heki.c            | 49 +++++++++++++++++++++++++++++++++++
>  include/linux/heki.h          | 14 ++++++++++
>  3 files changed, 68 insertions(+)
> 
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 517ee01503be..64fd8757ba5c 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -18,6 +18,7 @@
>  #include <linux/mmu_context.h>
>  #include <linux/bsearch.h>
>  #include <linux/sync_core.h>
> +#include <linux/heki.h>
>  #include <asm/text-patching.h>
>  #include <asm/alternative.h>
>  #include <asm/sections.h>
> @@ -1801,6 +1802,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
>  	 */
>  	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
>  
> +	heki_text_poke_start(pages, cross_page_boundary ? 2 : 1, pgprot);
>  	/*
>  	 * The lock is not really needed, but this allows to avoid open-coding.
>  	 */
> @@ -1865,7 +1867,10 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
>  	}
>  
>  	local_irq_restore(flags);
> +
>  	pte_unmap_unlock(ptep, ptl);
> +	heki_text_poke_end(pages, cross_page_boundary ? 2 : 1, pgprot);
> +
>  	return addr;
>  }

This makes no sense, we already use a custom CR3 with userspace alias
for the actual pages to write to, why are you then frobbing permissions
on that *again* ?
Madhavan T. Venkataraman Nov. 27, 2023, 4:48 p.m. UTC | #2
Apologies for the late reply. I was on vacation. Please see my response below:

On 11/13/23 02:19, Peter Zijlstra wrote:
> On Sun, Nov 12, 2023 at 09:23:24PM -0500, Mickaël Salaün wrote:
>> From: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
>>
>> X86 uses a function called __text_poke() to modify executable code. This
>> patching function is used by many features such as KProbes and FTrace.
>>
>> Update the permissions counters for the text page so that write
>> permissions can be temporarily established in the EPT to modify the
>> instructions in that page.
>>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: H. Peter Anvin <hpa@zytor.com>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Kees Cook <keescook@chromium.org>
>> Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
>> Cc: Mickaël Salaün <mic@digikod.net>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Sean Christopherson <seanjc@google.com>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
>> Cc: Wanpeng Li <wanpengli@tencent.com>
>> Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
>> ---
>>
>> Changes since v1:
>> * New patch
>> ---
>>  arch/x86/kernel/alternative.c |  5 ++++
>>  arch/x86/mm/heki.c            | 49 +++++++++++++++++++++++++++++++++++
>>  include/linux/heki.h          | 14 ++++++++++
>>  3 files changed, 68 insertions(+)
>>
>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>> index 517ee01503be..64fd8757ba5c 100644
>> --- a/arch/x86/kernel/alternative.c
>> +++ b/arch/x86/kernel/alternative.c
>> @@ -18,6 +18,7 @@
>>  #include <linux/mmu_context.h>
>>  #include <linux/bsearch.h>
>>  #include <linux/sync_core.h>
>> +#include <linux/heki.h>
>>  #include <asm/text-patching.h>
>>  #include <asm/alternative.h>
>>  #include <asm/sections.h>
>> @@ -1801,6 +1802,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
>>  	 */
>>  	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
>>  
>> +	heki_text_poke_start(pages, cross_page_boundary ? 2 : 1, pgprot);
>>  	/*
>>  	 * The lock is not really needed, but this allows to avoid open-coding.
>>  	 */
>> @@ -1865,7 +1867,10 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
>>  	}
>>  
>>  	local_irq_restore(flags);
>> +
>>  	pte_unmap_unlock(ptep, ptl);
>> +	heki_text_poke_end(pages, cross_page_boundary ? 2 : 1, pgprot);
>> +
>>  	return addr;
>>  }
> 
> This makes no sense, we already use a custom CR3 with userspace alias
> for the actual pages to write to, why are you then frobbing permissions
> on that *again* ?

Today, the permissions for a guest page in the extended page table (EPT) are RWX (unless permissions are
restricted for some specific reason like for shadow page table pages). In this Heki feature, we don't allow
RWX by default in the EPT. We only allow those permissions in the EPT that the guest page actually needs.
E.g., for a text page, it is R_X in both the guest page table and the EPT.

For text patching, the above code establishes an alternate mapping in the guest page table that is RW_ so
that the text can be patched. That needs to be reflected in the EPT so that the EPT permissions will change
from R_X to RWX. In other words, RWX is allowed only as necessary. At the end of patching, the EPT permissions
are restored to R_X.

Does that address your comment?

Madhavan
Peter Zijlstra Nov. 27, 2023, 8:08 p.m. UTC | #3
On Mon, Nov 27, 2023 at 10:48:29AM -0600, Madhavan T. Venkataraman wrote:
> Apologies for the late reply. I was on vacation. Please see my response below:
> 
> On 11/13/23 02:19, Peter Zijlstra wrote:
> > On Sun, Nov 12, 2023 at 09:23:24PM -0500, Mickaël Salaün wrote:
> >> From: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
> >>
> >> X86 uses a function called __text_poke() to modify executable code. This
> >> patching function is used by many features such as KProbes and FTrace.
> >>
> >> Update the permissions counters for the text page so that write
> >> permissions can be temporarily established in the EPT to modify the
> >> instructions in that page.
> >>
> >> Cc: Borislav Petkov <bp@alien8.de>
> >> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> >> Cc: H. Peter Anvin <hpa@zytor.com>
> >> Cc: Ingo Molnar <mingo@redhat.com>
> >> Cc: Kees Cook <keescook@chromium.org>
> >> Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
> >> Cc: Mickaël Salaün <mic@digikod.net>
> >> Cc: Paolo Bonzini <pbonzini@redhat.com>
> >> Cc: Sean Christopherson <seanjc@google.com>
> >> Cc: Thomas Gleixner <tglx@linutronix.de>
> >> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
> >> Cc: Wanpeng Li <wanpengli@tencent.com>
> >> Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
> >> ---
> >>
> >> Changes since v1:
> >> * New patch
> >> ---
> >>  arch/x86/kernel/alternative.c |  5 ++++
> >>  arch/x86/mm/heki.c            | 49 +++++++++++++++++++++++++++++++++++
> >>  include/linux/heki.h          | 14 ++++++++++
> >>  3 files changed, 68 insertions(+)
> >>
> >> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> >> index 517ee01503be..64fd8757ba5c 100644
> >> --- a/arch/x86/kernel/alternative.c
> >> +++ b/arch/x86/kernel/alternative.c
> >> @@ -18,6 +18,7 @@
> >>  #include <linux/mmu_context.h>
> >>  #include <linux/bsearch.h>
> >>  #include <linux/sync_core.h>
> >> +#include <linux/heki.h>
> >>  #include <asm/text-patching.h>
> >>  #include <asm/alternative.h>
> >>  #include <asm/sections.h>
> >> @@ -1801,6 +1802,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
> >>  	 */
> >>  	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
> >>  
> >> +	heki_text_poke_start(pages, cross_page_boundary ? 2 : 1, pgprot);
> >>  	/*
> >>  	 * The lock is not really needed, but this allows to avoid open-coding.
> >>  	 */
> >> @@ -1865,7 +1867,10 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
> >>  	}
> >>  
> >>  	local_irq_restore(flags);
> >> +
> >>  	pte_unmap_unlock(ptep, ptl);
> >> +	heki_text_poke_end(pages, cross_page_boundary ? 2 : 1, pgprot);
> >> +
> >>  	return addr;
> >>  }
> > 
> > This makes no sense, we already use a custom CR3 with userspace alias
> > for the actual pages to write to, why are you then frobbing permissions
> > on that *again* ?
> 
> Today, the permissions for a guest page in the extended page table
> (EPT) are RWX (unless permissions are restricted for some specific
> reason like for shadow page table pages). In this Heki feature, we
> don't allow RWX by default in the EPT. We only allow those permissions
> in the EPT that the guest page actually needs.  E.g., for a text page,
> it is R_X in both the guest page table and the EPT.

To what end? If you always mirror what the guest does, you've not
actually gained anything.

> For text patching, the above code establishes an alternate mapping in
> the guest page table that is RW_ so that the text can be patched. That
> needs to be reflected in the EPT so that the EPT permissions will
> change from R_X to RWX. In other words, RWX is allowed only as
> necessary. At the end of patching, the EPT permissions are restored to
> R_X.
> 
> Does that address your comment?

No, if you want to mirror the native PTEs why don't you hook into the
paravirt page-table muck and get all that for free?

Also, this is the user range, are you saying you're also playing these
daft games with user maps?
Madhavan T. Venkataraman Nov. 29, 2023, 9:07 p.m. UTC | #4
On 11/27/23 14:08, Peter Zijlstra wrote:
> On Mon, Nov 27, 2023 at 10:48:29AM -0600, Madhavan T. Venkataraman wrote:
>> Apologies for the late reply. I was on vacation. Please see my response below:
>>
>> On 11/13/23 02:19, Peter Zijlstra wrote:
>>> On Sun, Nov 12, 2023 at 09:23:24PM -0500, Mickaël Salaün wrote:
>>>> From: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
>>>>
>>>> X86 uses a function called __text_poke() to modify executable code. This
>>>> patching function is used by many features such as KProbes and FTrace.
>>>>
>>>> Update the permissions counters for the text page so that write
>>>> permissions can be temporarily established in the EPT to modify the
>>>> instructions in that page.
>>>>
>>>> Cc: Borislav Petkov <bp@alien8.de>
>>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>>>> Cc: H. Peter Anvin <hpa@zytor.com>
>>>> Cc: Ingo Molnar <mingo@redhat.com>
>>>> Cc: Kees Cook <keescook@chromium.org>
>>>> Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
>>>> Cc: Mickaël Salaün <mic@digikod.net>
>>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>>>> Cc: Sean Christopherson <seanjc@google.com>
>>>> Cc: Thomas Gleixner <tglx@linutronix.de>
>>>> Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
>>>> Cc: Wanpeng Li <wanpengli@tencent.com>
>>>> Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
>>>> ---
>>>>
>>>> Changes since v1:
>>>> * New patch
>>>> ---
>>>>  arch/x86/kernel/alternative.c |  5 ++++
>>>>  arch/x86/mm/heki.c            | 49 +++++++++++++++++++++++++++++++++++
>>>>  include/linux/heki.h          | 14 ++++++++++
>>>>  3 files changed, 68 insertions(+)
>>>>
>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>>>> index 517ee01503be..64fd8757ba5c 100644
>>>> --- a/arch/x86/kernel/alternative.c
>>>> +++ b/arch/x86/kernel/alternative.c
>>>> @@ -18,6 +18,7 @@
>>>>  #include <linux/mmu_context.h>
>>>>  #include <linux/bsearch.h>
>>>>  #include <linux/sync_core.h>
>>>> +#include <linux/heki.h>
>>>>  #include <asm/text-patching.h>
>>>>  #include <asm/alternative.h>
>>>>  #include <asm/sections.h>
>>>> @@ -1801,6 +1802,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
>>>>  	 */
>>>>  	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
>>>>  
>>>> +	heki_text_poke_start(pages, cross_page_boundary ? 2 : 1, pgprot);
>>>>  	/*
>>>>  	 * The lock is not really needed, but this allows to avoid open-coding.
>>>>  	 */
>>>> @@ -1865,7 +1867,10 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
>>>>  	}
>>>>  
>>>>  	local_irq_restore(flags);
>>>> +
>>>>  	pte_unmap_unlock(ptep, ptl);
>>>> +	heki_text_poke_end(pages, cross_page_boundary ? 2 : 1, pgprot);
>>>> +
>>>>  	return addr;
>>>>  }
>>>
>>> This makes no sense, we already use a custom CR3 with userspace alias
>>> for the actual pages to write to, why are you then frobbing permissions
>>> on that *again* ?
>>
>> Today, the permissions for a guest page in the extended page table
>> (EPT) are RWX (unless permissions are restricted for some specific
>> reason like for shadow page table pages). In this Heki feature, we
>> don't allow RWX by default in the EPT. We only allow those permissions
>> in the EPT that the guest page actually needs.  E.g., for a text page,
>> it is R_X in both the guest page table and the EPT.
> 
> To what end? If you always mirror what the guest does, you've not
> actually gained anything.
> 
>> For text patching, the above code establishes an alternate mapping in
>> the guest page table that is RW_ so that the text can be patched. That
>> needs to be reflected in the EPT so that the EPT permissions will
>> change from R_X to RWX. In other words, RWX is allowed only as
>> necessary. At the end of patching, the EPT permissions are restored to
>> R_X.
>>
>> Does that address your comment?
> 
> No, if you want to mirror the native PTEs why don't you hook into the
> paravirt page-table muck and get all that for free?
> 
> Also, this is the user range, are you saying you're also playing these
> daft games with user maps?

I think that we should have done a better job of communicating the threat model in Heki and
how we are trying to address it. I will correct that here. I think this will help answer
many questions. Some of these questions also came up in the LPC when we presented this.
Apologies for the slightly long answer. It is for everyone's benefit. Bear with me.

Threat Model
------------

In the threat model in Heki, the attacker is a user space attacker who exploits
a kernel vulnerability to gain more privileges or bypass the kernel's access
control and self-protection mechanisms. 

In the context of the guest page table, one of the things that the threat model translates
to is a hacker gaining access to a guest page with RWX permissions. E.g., by adding execute
permissions to a writable page or by adding write permissions to an executable page.

Today, the permissions for a guest page in the extended page table are RWX by
default. So, if a hacker manages to establish RWX for a page in the guest page
table, then that is all he needs to do some damage.

How to defeat the threat
------------------------

To defeat this, we need to establish the correct permissions for a guest page
in the extended page table as well. That is, R_X for a text page, R__ for a
read-only page and RW_ for a writable page. The only exception is a guest page
that is mapped via multiple mappings with different permissions in each
mapping. In that case, the collective permissions across all mappings needs
to be established in the extended page table so that all mappings can work.

Mechanism
---------

To achieve all this, Heki finds all the kernel mappings at the end of kernel
boot and reflects their permissions in the extended page table before
kicking off the init process.

During runtime, permissions on a guest page can change because of genuine
kernel operations:
	- vmap/vunmap
	- text patching for FTrace, Kprobes, etc
	- set_memory_*()

In each of these cases as well, the permissions need to be reflected in the
extended page table. In summary, the extended page table permissions mirror
the guest page table ones.

Authentication
--------------

The above approach addresses the case where a hacker tries to directly
modify a guest page table entry. It doesn't matter since the extended page
table permissions are not changed.

Now, the question is - what if a hacker manages to use the Heki primitives
and establish the permissions he wants in the extended page table? All of
this work is for nothing!!

The answer is - authentication. If an entity outside the guest can validate
or authenticate each guest request to change extended page table permissions,
then it can tell a genuine request from an attack. We are planning to make the
VMM that entity. In the case of a genuine request, the VMM will call the
hypervisor and establish the correct permissions in the extended page table.
If the VMM thinks that it is an attack, it will have the hypervisor send an
exception to the guest.

In the current version (RFC V2), we don't have authentication in place. The
VMM is not in the picture yet. This is WIP. We are hoping to implement some
authentication in V3 and improve it as we go forward. So, in V2, we only have
the mechanisms in place.

So, I agree that it is kinda hard to see the value of Heki without authentication.

Kernel Lockdown
---------------

But, we must provide at least some security in V2. Otherwise, it is useless.

So, we have implemented what we call a kernel lockdown. At the end of kernel
boot, Heki establishes permissions in the extended page table as mentioned
before. Also, it adds an immutable attribute for kernel text and kernel RO data.
Beyond that point, guest requests that attempt to modify permissions on any of
the immutable pages will be denied.

This means that features like FTrace and KProbes will not work on kernel text
in V2. This is a temporary limitation. Once authentication is in place, the
limitation will go away.

Additional information
----------------------

The following primitives call Heki functions to reflect guest page table
permissions in the extended page table:

heki_arch_late_init()
	This calls heki_protect().
	
	This is called at the end of kernel boot to protect all guest kernel
	mappings at that point.

vmap_range_noflush()
vmap_small_pages_range_noflush()
	These functions call heki_map().

	These are the lowest level functions called from different places
	to map something in the kernel address space.

set_memory_nx()
set_memory_rw()
	These functions call heki_change_page_attr_set().

set_memory_x()
set_memory_ro()
set_memory_rox()
	These functions call heki_change_page_attr_clear().

__text_poke()
	This function is called by various features to patch text.
	This calls heki_text_poke_start() and heki_text_poke_end().

	heki_text_poke_start() is called to add write permissions to the
	extended page table so that text can be patched. heki_text_poke_end()
	is called to revert write permissions in the extended page table.

User mappings
-------------

All of this work is only for protecting guest kernel mappings. So, the idea is
- the VMM+Hypervisor protect the integrity of the kernel and the kernel
protects the integrity of user land. So, user pages continue to have RWX in the
extended page table. The MBEC feature makes it possible to have separate execute
permission bits in the page table entry for user and kernel.

One final thing
---------------

Peter mentioned the following:

"if you want to mirror the native PTEs why don't you hook into the
paravirt page-table muck and get all that for free?"

We did consider using a shadow page table kind of approach so that guest page table
modifications can be intercepted and reflected in the page table entry. We did not
do this for two reasons:

- there are bits in the page table entry that are not permission bits. We would like
  the guest kernel to be able to modify them directly.

- we cannot tell a genuine request from an attack.

That said, we are still considering making the guest page table read only for added
security. This is WIP. I do not have details yet.

If I have not addressed any comment, please let me know.

As always, thanks for your comments.

Madhavan T Venkataraman
Peter Zijlstra Nov. 30, 2023, 11:33 a.m. UTC | #5
On Wed, Nov 29, 2023 at 03:07:15PM -0600, Madhavan T. Venkataraman wrote:

> Kernel Lockdown
> ---------------
> 
> But, we must provide at least some security in V2. Otherwise, it is useless.
> 
> So, we have implemented what we call a kernel lockdown. At the end of kernel
> boot, Heki establishes permissions in the extended page table as mentioned
> before. Also, it adds an immutable attribute for kernel text and kernel RO data.
> Beyond that point, guest requests that attempt to modify permissions on any of
> the immutable pages will be denied.
> 
> This means that features like FTrace and KProbes will not work on kernel text
> in V2. This is a temporary limitation. Once authentication is in place, the
> limitation will go away.

So either you're saying your patch 17 / text_poke is broken (so why
include it ?!?) or your statement above is incorrect. Pick one.


> __text_poke()
> 	This function is called by various features to patch text.
> 	This calls heki_text_poke_start() and heki_text_poke_end().
> 
> 	heki_text_poke_start() is called to add write permissions to the
> 	extended page table so that text can be patched. heki_text_poke_end()
> 	is called to revert write permissions in the extended page table.

This, if text_poke works, then static_call / jump_label / ftrace and
everything else should work, they all rely on this.


> Peter mentioned the following:
> 
> "if you want to mirror the native PTEs why don't you hook into the
> paravirt page-table muck and get all that for free?"
> 
> We did consider using a shadow page table kind of approach so that guest page table
> modifications can be intercepted and reflected in the page table entry. We did not
> do this for two reasons:
> 
> - there are bits in the page table entry that are not permission bits. We would like
>   the guest kernel to be able to modify them directly.

This statement makes no sense.

> - we cannot tell a genuine request from an attack.

Why not? How is an explicit call different from an explicit call in a
paravirt hook?

From a maintenance pov we already hate paravirt with a passion, but it
is ever so much better than sprinkling yet another pile of crap
(heki_*) around.
Edgecombe, Rick P Dec. 1, 2023, 12:45 a.m. UTC | #6
On Wed, 2023-11-29 at 15:07 -0600, Madhavan T. Venkataraman wrote:
> Threat Model
> ------------
> 
> In the threat model in Heki, the attacker is a user space attacker
> who exploits
> a kernel vulnerability to gain more privileges or bypass the kernel's
> access
> control and self-protection mechanisms. 
> 
> In the context of the guest page table, one of the things that the
> threat model translates
> to is a hacker gaining access to a guest page with RWX permissions.
> E.g., by adding execute
> permissions to a writable page or by adding write permissions to an
> executable page.
> 
> Today, the permissions for a guest page in the extended page table
> are RWX by
> default. So, if a hacker manages to establish RWX for a page in the
> guest page
> table, then that is all he needs to do some damage.

I had a few random comments from watching the plumbers talk online:

Is there really a big difference between a page that is RWX, and a RW
page that is about to become RX? I realize that there is an addition of
timing, but when executable code is getting loaded it can be written to
then and later executed. I think that gap could be addressed in two
different ways, both pretty difficult:
 1. Verifying the loaded code before it gets marked 
    executable. This is difficult because the kernel does lots of 
    tweaks on the code it is loading (alternatives, etc). It can't 
    just check a signature.
 2. Loading the code in a protected environment. In this model the 
    (for example) module signature would be checked, then the code 
    would be loaded in some sort of protected environment. This way 
    integrity of the loaded code would be enforced. But extracting 
    module loading into a separate domain would be difficult. 
    Various scattered features all have their hands in the loading.

Secondly, I wonder if another way to look at the memory parts of HEKI
could be that this is a way to protect certain page table bits from
stay writes. The RWX bits in the EPT are not directly writable, so more
steps are needed to change things than just a stray write (instead the
helpers involved in the operations need to be called). If that is a
fair way of looking at it, then I wonder how HEKI compares to a
solution like this security-wise:
https://lore.kernel.org/lkml/20210830235927.6443-1-rick.p.edgecombe@intel.com/

Functional-wise it had the benefit of working on bare metal and
supporting the normal kernel features.
Madhavan T. Venkataraman Dec. 6, 2023, 4:37 p.m. UTC | #7
On 11/30/23 05:33, Peter Zijlstra wrote:
> On Wed, Nov 29, 2023 at 03:07:15PM -0600, Madhavan T. Venkataraman wrote:
> 
>> Kernel Lockdown
>> ---------------
>>
>> But, we must provide at least some security in V2. Otherwise, it is useless.
>>
>> So, we have implemented what we call a kernel lockdown. At the end of kernel
>> boot, Heki establishes permissions in the extended page table as mentioned
>> before. Also, it adds an immutable attribute for kernel text and kernel RO data.
>> Beyond that point, guest requests that attempt to modify permissions on any of
>> the immutable pages will be denied.
>>
>> This means that features like FTrace and KProbes will not work on kernel text
>> in V2. This is a temporary limitation. Once authentication is in place, the
>> limitation will go away.
> 
> So either you're saying your patch 17 / text_poke is broken (so why
> include it ?!?) or your statement above is incorrect. Pick one.
> 

It has been included so that people can be aware of the changes.

I will remove the text_poke() changes from the patchset and send it later when
I have some authentication in place. It will make sense then.

> 
>> __text_poke()
>> 	This function is called by various features to patch text.
>> 	This calls heki_text_poke_start() and heki_text_poke_end().
>>
>> 	heki_text_poke_start() is called to add write permissions to the
>> 	extended page table so that text can be patched. heki_text_poke_end()
>> 	is called to revert write permissions in the extended page table.
> 
> This, if text_poke works, then static_call / jump_label / ftrace and
> everything else should work, they all rely on this.
> 
> 
>> Peter mentioned the following:
>>
>> "if you want to mirror the native PTEs why don't you hook into the
>> paravirt page-table muck and get all that for free?"
>>
>> We did consider using a shadow page table kind of approach so that guest page table
>> modifications can be intercepted and reflected in the page table entry. We did not
>> do this for two reasons:
>>
>> - there are bits in the page table entry that are not permission bits. We would like
>>   the guest kernel to be able to modify them directly.
> 
> This statement makes no sense.
> 
>> - we cannot tell a genuine request from an attack.
> 
> Why not? How is an explicit call different from an explicit call in a
> paravirt hook?
> 
>>From a maintenance pov we already hate paravirt with a passion, but it
> is ever so much better than sprinkling yet another pile of crap
> (heki_*) around.

I only said that the idea was considered.

We can resume the discussion on this topic when I send the text_poke() changes in a later
version of the Heki patchset.

Madhavan
Madhavan T. Venkataraman Dec. 6, 2023, 4:41 p.m. UTC | #8
On 11/30/23 18:45, Edgecombe, Rick P wrote:
> On Wed, 2023-11-29 at 15:07 -0600, Madhavan T. Venkataraman wrote:
>> Threat Model
>> ------------
>>
>> In the threat model in Heki, the attacker is a user space attacker
>> who exploits
>> a kernel vulnerability to gain more privileges or bypass the kernel's
>> access
>> control and self-protection mechanisms. 
>>
>> In the context of the guest page table, one of the things that the
>> threat model translates
>> to is a hacker gaining access to a guest page with RWX permissions.
>> E.g., by adding execute
>> permissions to a writable page or by adding write permissions to an
>> executable page.
>>
>> Today, the permissions for a guest page in the extended page table
>> are RWX by
>> default. So, if a hacker manages to establish RWX for a page in the
>> guest page
>> table, then that is all he needs to do some damage.
> 
> I had a few random comments from watching the plumbers talk online:
> 
> Is there really a big difference between a page that is RWX, and a RW
> page that is about to become RX? I realize that there is an addition of
> timing, but when executable code is getting loaded it can be written to
> then and later executed. I think that gap could be addressed in two
> different ways, both pretty difficult:
>  1. Verifying the loaded code before it gets marked 
>     executable. This is difficult because the kernel does lots of 
>     tweaks on the code it is loading (alternatives, etc). It can't 
>     just check a signature.
>  2. Loading the code in a protected environment. In this model the 
>     (for example) module signature would be checked, then the code 
>     would be loaded in some sort of protected environment. This way 
>     integrity of the loaded code would be enforced. But extracting 
>     module loading into a separate domain would be difficult. 
>     Various scattered features all have their hands in the loading.
> 
> Secondly, I wonder if another way to look at the memory parts of HEKI
> could be that this is a way to protect certain page table bits from
> stay writes. The RWX bits in the EPT are not directly writable, so more
> steps are needed to change things than just a stray write (instead the
> helpers involved in the operations need to be called). If that is a
> fair way of looking at it, then I wonder how HEKI compares to a
> solution like this security-wise:
> https://lore.kernel.org/lkml/20210830235927.6443-1-rick.p.edgecombe@intel.com/
> 
> Functional-wise it had the benefit of working on bare metal and
> supporting the normal kernel features.

Thanks for the comments. I will think about what you have said and will respond
soon.

Madhavan
Peter Zijlstra Dec. 6, 2023, 6:51 p.m. UTC | #9
On Wed, Dec 06, 2023 at 10:37:33AM -0600, Madhavan T. Venkataraman wrote:
> 
> 
> On 11/30/23 05:33, Peter Zijlstra wrote:
> > On Wed, Nov 29, 2023 at 03:07:15PM -0600, Madhavan T. Venkataraman wrote:
> > 
> >> Kernel Lockdown
> >> ---------------
> >>
> >> But, we must provide at least some security in V2. Otherwise, it is useless.
> >>
> >> So, we have implemented what we call a kernel lockdown. At the end of kernel
> >> boot, Heki establishes permissions in the extended page table as mentioned
> >> before. Also, it adds an immutable attribute for kernel text and kernel RO data.
> >> Beyond that point, guest requests that attempt to modify permissions on any of
> >> the immutable pages will be denied.
> >>
> >> This means that features like FTrace and KProbes will not work on kernel text
> >> in V2. This is a temporary limitation. Once authentication is in place, the
> >> limitation will go away.
> > 
> > So either you're saying your patch 17 / text_poke is broken (so why
> > include it ?!?) or your statement above is incorrect. Pick one.
> > 
> 
> It has been included so that people can be aware of the changes.
> 
> I will remove the text_poke() changes from the patchset and send it later when
> I have some authentication in place. It will make sense then.

If you know its broken then fucking say so in the Changelog instead of
wasting everybody's time.. OMG.
Madhavan T. Venkataraman Dec. 8, 2023, 6:41 p.m. UTC | #10
On 12/6/23 12:51, Peter Zijlstra wrote:
> On Wed, Dec 06, 2023 at 10:37:33AM -0600, Madhavan T. Venkataraman wrote:
>>
>>
>> On 11/30/23 05:33, Peter Zijlstra wrote:
>>> On Wed, Nov 29, 2023 at 03:07:15PM -0600, Madhavan T. Venkataraman wrote:
>>>
>>>> Kernel Lockdown
>>>> ---------------
>>>>
>>>> But, we must provide at least some security in V2. Otherwise, it is useless.
>>>>
>>>> So, we have implemented what we call a kernel lockdown. At the end of kernel
>>>> boot, Heki establishes permissions in the extended page table as mentioned
>>>> before. Also, it adds an immutable attribute for kernel text and kernel RO data.
>>>> Beyond that point, guest requests that attempt to modify permissions on any of
>>>> the immutable pages will be denied.
>>>>
>>>> This means that features like FTrace and KProbes will not work on kernel text
>>>> in V2. This is a temporary limitation. Once authentication is in place, the
>>>> limitation will go away.
>>>
>>> So either you're saying your patch 17 / text_poke is broken (so why
>>> include it ?!?) or your statement above is incorrect. Pick one.
>>>
>>
>> It has been included so that people can be aware of the changes.
>>
>> I will remove the text_poke() changes from the patchset and send it later when
>> I have some authentication in place. It will make sense then.
> 
> If you know its broken then fucking say so in the Changelog instead of
> wasting everybody's time.. OMG.

It is not broken. It addresses one part of the problem. The other part is WIP.

I am preparing a detailed response to your comments. I ask you to be patient until then. In fact, I would appreciate your input/suggestions on some problems we are trying to solve in this context. I will mention them in my response.

Madhavan
diff mbox series

Patch

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 517ee01503be..64fd8757ba5c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -18,6 +18,7 @@ 
 #include <linux/mmu_context.h>
 #include <linux/bsearch.h>
 #include <linux/sync_core.h>
+#include <linux/heki.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -1801,6 +1802,7 @@  static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
 	 */
 	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
 
+	heki_text_poke_start(pages, cross_page_boundary ? 2 : 1, pgprot);
 	/*
 	 * The lock is not really needed, but this allows to avoid open-coding.
 	 */
@@ -1865,7 +1867,10 @@  static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
 	}
 
 	local_irq_restore(flags);
+
 	pte_unmap_unlock(ptep, ptl);
+	heki_text_poke_end(pages, cross_page_boundary ? 2 : 1, pgprot);
+
 	return addr;
 }
 
diff --git a/arch/x86/mm/heki.c b/arch/x86/mm/heki.c
index c0eace9e343f..e4c60d8b4f2d 100644
--- a/arch/x86/mm/heki.c
+++ b/arch/x86/mm/heki.c
@@ -5,8 +5,11 @@ 
  * Copyright © 2023 Microsoft Corporation
  */
 
+#include <asm/pgtable.h>
+#include <asm/text-patching.h>
 #include <linux/heki.h>
 #include <linux/kvm_mem_attr.h>
+#include <linux/mm.h>
 
 #ifdef pr_fmt
 #undef pr_fmt
@@ -63,3 +66,49 @@  void heki_pgprot_to_permissions(pgprot_t prot, unsigned long *set,
 	if (pgprot_val(prot) & _PAGE_NX)
 		*clear |= MEM_ATTR_EXEC;
 }
+
+static unsigned long heki_pgprot_to_flags(pgprot_t prot)
+{
+	unsigned long flags = 0;
+
+	if (pgprot_val(prot) & _PAGE_RW)
+		flags |= _PAGE_RW;
+	if (pgprot_val(prot) & _PAGE_NX)
+		flags |= _PAGE_NX;
+	return flags;
+}
+
+static void heki_text_poke_common(struct page **pages, int npages,
+				  pgprot_t prot, enum heki_cmd cmd)
+{
+	struct heki_args args = {
+		.cmd = cmd,
+	};
+	unsigned long va = poking_addr;
+	int i;
+
+	if (!heki.counters)
+		return;
+
+	mutex_lock(&heki_lock);
+
+	for (i = 0; i < npages; i++, va += PAGE_SIZE) {
+		args.va = va;
+		args.pa = page_to_pfn(pages[i]) << PAGE_SHIFT;
+		args.size = PAGE_SIZE;
+		args.flags = heki_pgprot_to_flags(prot);
+		heki_callback(&args);
+	}
+
+	mutex_unlock(&heki_lock);
+}
+
+void heki_text_poke_start(struct page **pages, int npages, pgprot_t prot)
+{
+	heki_text_poke_common(pages, npages, prot, HEKI_MAP);
+}
+
+void heki_text_poke_end(struct page **pages, int npages, pgprot_t prot)
+{
+	heki_text_poke_common(pages, npages, prot, HEKI_UNMAP);
+}
diff --git a/include/linux/heki.h b/include/linux/heki.h
index 079b34af07f0..6f2cfddc6dac 100644
--- a/include/linux/heki.h
+++ b/include/linux/heki.h
@@ -111,6 +111,7 @@  typedef void (*heki_func_t)(struct heki_args *args);
 
 extern struct heki heki;
 extern bool heki_enabled;
+extern struct mutex heki_lock;
 
 extern bool __read_mostly enable_mbec;
 
@@ -123,12 +124,15 @@  void heki_map(unsigned long va, unsigned long end);
 void heki_update(unsigned long va, unsigned long end, unsigned long set,
 		 unsigned long clear);
 void heki_unmap(unsigned long va, unsigned long end);
+void heki_callback(struct heki_args *args);
 
 /* Arch-specific functions. */
 void heki_arch_early_init(void);
 unsigned long heki_flags_to_permissions(unsigned long flags);
 void heki_pgprot_to_permissions(pgprot_t prot, unsigned long *set,
 				unsigned long *clear);
+void heki_text_poke_start(struct page **pages, int npages, pgprot_t prot);
+void heki_text_poke_end(struct page **pages, int npages, pgprot_t prot);
 
 #else /* !CONFIG_HEKI */
 
@@ -149,6 +153,16 @@  static inline void heki_unmap(unsigned long va, unsigned long end)
 {
 }
 
+/* Arch-specific functions. */
+static inline void heki_text_poke_start(struct page **pages, int npages,
+					pgprot_t prot)
+{
+}
+static inline void heki_text_poke_end(struct page **pages, int npages,
+				      pgprot_t prot)
+{
+}
+
 #endif /* CONFIG_HEKI */
 
 #endif /* __HEKI_H__ */