diff mbox

[RFC,v2,14/32] x86: mm: Provide support to use memblock when splitting large pages

Message ID 148846771545.2349.9373586041426414252.stgit@brijesh-build-machine (mailing list archive)
State New, archived
Headers show

Commit Message

Brijesh Singh March 2, 2017, 3:15 p.m. UTC
If kernel_map_pages_in_pgd() is called early in the boot process to change
memory attributes, it fails to allocate memory when splitting large
pages. This patch extends cpa_data to support using memblock_alloc()
when the slab allocator is not available.

The feature will be used in Secure Encrypted Virtualization (SEV) mode,
where we may need to change memory region attributes early in the boot
process.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/mm/pageattr.c |   51 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 9 deletions(-)

Comments

Borislav Petkov March 10, 2017, 11:06 a.m. UTC | #1
On Thu, Mar 02, 2017 at 10:15:15AM -0500, Brijesh Singh wrote:
> If kernel_maps_pages_in_pgd is called early in boot process to change the

kernel_map_pages_in_pgd()

> memory attributes then it fails to allocate memory when spliting large
> pages. The patch extends the cpa_data to provide the support to use
> memblock_alloc when slab allocator is not available.
> 
> The feature will be used in Secure Encrypted Virtualization (SEV) mode,
> where we may need to change the memory region attributes in early boot
> process.
> 
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> ---
>  arch/x86/mm/pageattr.c |   51 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 42 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index 46cc89d..9e4ab3b 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -14,6 +14,7 @@
>  #include <linux/gfp.h>
>  #include <linux/pci.h>
>  #include <linux/vmalloc.h>
> +#include <linux/memblock.h>
>  
>  #include <asm/e820/api.h>
>  #include <asm/processor.h>
> @@ -37,6 +38,7 @@ struct cpa_data {
>  	int		flags;
>  	unsigned long	pfn;
>  	unsigned	force_split : 1;
> +	unsigned	force_memblock :1;
>  	int		curpage;
>  	struct page	**pages;
>  };
> @@ -627,9 +629,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
>  
>  static int
>  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
> -		   struct page *base)
> +		  pte_t *pbase, unsigned long new_pfn)
>  {
> -	pte_t *pbase = (pte_t *)page_address(base);
>  	unsigned long ref_pfn, pfn, pfninc = 1;
>  	unsigned int i, level;
>  	pte_t *tmp;
> @@ -646,7 +647,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>  		return 1;
>  	}
>  
> -	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
> +	paravirt_alloc_pte(&init_mm, new_pfn);
>  
>  	switch (level) {
>  	case PG_LEVEL_2M:
> @@ -707,7 +708,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>  	 * pagetable protections, the actual ptes set above control the
>  	 * primary protection behavior:
>  	 */
> -	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
> +	__set_pmd_pte(kpte, address,
> +		native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));
>  
>  	/*
>  	 * Intel Atom errata AAH41 workaround.
> @@ -723,21 +725,50 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>  	return 0;
>  }
>  
> +static pte_t *try_alloc_pte(struct cpa_data *cpa, unsigned long *pfn)
> +{
> +	unsigned long phys;
> +	struct page *base;
> +
> +	if (cpa->force_memblock) {
> +		phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

Maybe there's a reason this fires:

WARNING: modpost: Found 2 section mismatch(es).
To see full details build your kernel with:
'make CONFIG_DEBUG_SECTION_MISMATCH=y'

WARNING: vmlinux.o(.text+0x48edc): Section mismatch in reference from the function __change_page_attr() to the function .init.text:memblock_alloc()
The function __change_page_attr() references
the function __init memblock_alloc().
This is often because __change_page_attr lacks a __init
annotation or the annotation of memblock_alloc is wrong.

WARNING: vmlinux.o(.text+0x491d1): Section mismatch in reference from the function __change_page_attr() to the function .meminit.text:memblock_free()
The function __change_page_attr() references
the function __meminit memblock_free().
This is often because __change_page_attr lacks a __meminit
annotation or the annotation of memblock_free is wrong.

Why do we need this whole early mapping? For the guest? I don't like
that memblock thing at all.

So I think the approach with the .data..percpu..hv_shared section is
fine and we should consider SEV-ES

http://support.amd.com/TechDocs/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf

and do this right from the get-go so that when SEV-ES comes along, we
should simply be ready and extend that mechanism to put the whole Guest
Hypervisor Communication Block in there.

But then the fact that you're mapping those decrypted in init_mm.pgd
makes me think you don't need that early mapping thing at all. Those are
the decrypted mappings of the hypervisor. And that you can do late.

Now, what would be better, IMHO (and I have no idea about virtualization
design so take with a grain of salt) is if the guest would allocate
enough memory for the GHCB and mark it decrypted from the very
beginning. It will be the communication vehicle with the hypervisor
anyway.

And we already do similar things in sme_map_bootdata() for the baremetal
kernel to map boot_data, initrd, EFI, ... and so on things decrypted.

And we should extend that mechanism to map the GHCB in the guest too and
then we can get rid of all that need for ->force_memblock which makes
the crazy mess in pageattr.c even crazier. And it would be lovely if we
can do it without it.

But maybe Paolo might have an even better idea...

Thanks.
Brijesh Singh March 10, 2017, 10:41 p.m. UTC | #2
Hi Boris,

On 03/10/2017 05:06 AM, Borislav Petkov wrote:
> On Thu, Mar 02, 2017 at 10:15:15AM -0500, Brijesh Singh wrote:
>> If kernel_maps_pages_in_pgd is called early in boot process to change the
>
> kernel_map_pages_in_pgd()
>
>> memory attributes then it fails to allocate memory when spliting large
>> pages. The patch extends the cpa_data to provide the support to use
>> memblock_alloc when slab allocator is not available.
>>
>> The feature will be used in Secure Encrypted Virtualization (SEV) mode,
>> where we may need to change the memory region attributes in early boot
>> process.
>>
>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
>> ---
>>  arch/x86/mm/pageattr.c |   51 ++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 42 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
>> index 46cc89d..9e4ab3b 100644
>> --- a/arch/x86/mm/pageattr.c
>> +++ b/arch/x86/mm/pageattr.c
>> @@ -14,6 +14,7 @@
>>  #include <linux/gfp.h>
>>  #include <linux/pci.h>
>>  #include <linux/vmalloc.h>
>> +#include <linux/memblock.h>
>>
>>  #include <asm/e820/api.h>
>>  #include <asm/processor.h>
>> @@ -37,6 +38,7 @@ struct cpa_data {
>>  	int		flags;
>>  	unsigned long	pfn;
>>  	unsigned	force_split : 1;
>> +	unsigned	force_memblock :1;
>>  	int		curpage;
>>  	struct page	**pages;
>>  };
>> @@ -627,9 +629,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
>>
>>  static int
>>  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>> -		   struct page *base)
>> +		  pte_t *pbase, unsigned long new_pfn)
>>  {
>> -	pte_t *pbase = (pte_t *)page_address(base);
>>  	unsigned long ref_pfn, pfn, pfninc = 1;
>>  	unsigned int i, level;
>>  	pte_t *tmp;
>> @@ -646,7 +647,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>>  		return 1;
>>  	}
>>
>> -	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>> +	paravirt_alloc_pte(&init_mm, new_pfn);
>>
>>  	switch (level) {
>>  	case PG_LEVEL_2M:
>> @@ -707,7 +708,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>>  	 * pagetable protections, the actual ptes set above control the
>>  	 * primary protection behavior:
>>  	 */
>> -	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
>> +	__set_pmd_pte(kpte, address,
>> +		native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));
>>
>>  	/*
>>  	 * Intel Atom errata AAH41 workaround.
>> @@ -723,21 +725,50 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>>  	return 0;
>>  }
>>
>> +static pte_t *try_alloc_pte(struct cpa_data *cpa, unsigned long *pfn)
>> +{
>> +	unsigned long phys;
>> +	struct page *base;
>> +
>> +	if (cpa->force_memblock) {
>> +		phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
>
> Maybe there's a reason this fires:
>
> WARNING: modpost: Found 2 section mismatch(es).
> To see full details build your kernel with:
> 'make CONFIG_DEBUG_SECTION_MISMATCH=y'
>
> WARNING: vmlinux.o(.text+0x48edc): Section mismatch in reference from the function __change_page_attr() to the function .init.text:memblock_alloc()
> The function __change_page_attr() references
> the function __init memblock_alloc().
> This is often because __change_page_attr lacks a __init
> annotation or the annotation of memblock_alloc is wrong.
>
> WARNING: vmlinux.o(.text+0x491d1): Section mismatch in reference from the function __change_page_attr() to the function .meminit.text:memblock_free()
> The function __change_page_attr() references
> the function __meminit memblock_free().
> This is often because __change_page_attr lacks a __meminit
> annotation or the annotation of memblock_free is wrong.
>

I can take a look at fixing those warnings. My initial attempt was to create
a new function to clear the encryption bit, but it ended up looking very similar to
__change_page_attr_set_clr(), hence I decided to extend the existing function to
use memblock_alloc().


> Why do we need this whole early mapping? For the guest? I don't like
> that memblock thing at all.

Early in the boot process, the guest kernel allocates some structures (either
statically or dynamically via memblock_alloc) and shares the physical
addresses of these structures with the hypervisor. Since the entire guest memory area is mapped
as encrypted, those structures are mapped as encrypted memory ranges too. We need
a method to clear the encryption bit. Sometimes these structures may be part of 2M pages,
which then need to be split into smaller pages.

>
> So I think the approach with the .data..percpu..hv_shared section is
> fine and we should consider SEV-ES
>
> http://support.amd.com/TechDocs/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf
>
> and do this right from the get-go so that when SEV-ES comes along, we
> should simply be ready and extend that mechanism to put the whole Guest
> Hypervisor Communication Block in there.
>

> But then the fact that you're mapping those decrypted in init_mm.pgd
> makes me think you don't need that early mapping thing at all. Those are
> the decrypted mappings of the hypervisor. And that you can do late.
>

In most cases, guest and hypervisor communication starts as soon as the guest provides
the physical address to the hypervisor. So we must map the pages as decrypted before
sharing the physical address with the hypervisor.

> Now, what would be better, IMHO (and I have no idea about virtualization
> design so take with a grain of salt) is if the guest would allocate
> enough memory for the GHCB and mark it decrypted from the very
> beginning. It will be the communication vehicle with the hypervisor
> anyway.
>
> And we already do similar things in sme_map_bootdata() for the baremetal
> kernel to map boot_data, initrd, EFI, ... and so on things decrypted.
>

I will take a look at sme_map_bootdata(), but I believe the main difference is that,
in the case of SME, those memory regions were allocated by the BIOS or bootloader as
decrypted and sme_map_bootdata() clears the encryption bit.

In the case of a guest, memory may be dynamically allocated at boot time and may not have the same
attributes as the early mapping.

> And we should extend that mechanism to map the GHCB in the guest too and
> then we can get rid of all that need for ->force_memblock which makes
> the crazy mess in pageattr.c even crazier. And it would be lovely if we
> can do it without it.
>
> But maybe Paolo might have an even better idea...
>

I am sure he will have better idea :)

-Brijesh
Paolo Bonzini March 16, 2017, 12:28 p.m. UTC | #3
On 02/03/2017 16:15, Brijesh Singh wrote:
> 
>  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
> -		   struct page *base)
> +		  pte_t *pbase, unsigned long new_pfn)
>  {
> -	pte_t *pbase = (pte_t *)page_address(base);

Just one comment and I'll reply to Boris, I think you can compute pbase 
with pfn_to_kaddr, and avoid adding a new argument.

>  	 */
> -	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
> +	__set_pmd_pte(kpte, address,
> +		native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));

And this probably is better written as:

	__set_pmd_pte(kpte, address, pfn_pte(new_pfn, __pgprot(_KERNPG_TABLE));

Paolo
Borislav Petkov March 16, 2017, 6:28 p.m. UTC | #4
On Fri, Mar 10, 2017 at 04:41:56PM -0600, Brijesh Singh wrote:
> I can take a look at fixing those warning. In my initial attempt was to create
> a new function to clear encryption bit but it ended up looking very similar to
> __change_page_attr_set_clr() hence decided to extend the exiting function to
> use memblock_alloc().

... except that having all that SEV-specific code in main code paths is
yucky and I'd like to avoid it, if possible.

> Early in boot process, guest kernel allocates some structure (its either
> statically allocated or dynamic allocated via memblock_alloc). And shares the physical
> address of these structure with hypervisor. Since entire guest memory area is mapped
> as encrypted hence those structure's are mapped as encrypted memory range. We need
> a method to clear the encryption bit. Sometime these structure maybe part of 2M pages
> and need to split into smaller pages.

So how hard would it be if the hypervisor allocated that memory for the
guest instead? It would allocate it decrypted and guest would need to
access it decrypted too. All in preparation for SEV-ES which will need a
block of unencrypted memory for the guest anyway...

> In most cases, guest and hypervisor communication starts as soon as guest provides
> the physical address to hypervisor. So we must map the pages as decrypted before
> sharing the physical address to hypervisor.

See above: so purely theoretically speaking, the hypervisor could prep
that decrypted range for the guest. I'd look in Paolo's direction,
though, for the feasibility of something like that.

Thanks.
Paolo Bonzini March 16, 2017, 10:25 p.m. UTC | #5
On 16/03/2017 19:28, Borislav Petkov wrote:
> So how hard would it be if the hypervisor allocated that memory for the
> guest instead? It would allocate it decrypted and guest would need to
> access it decrypted too. All in preparation for SEV-ES which will need a
> block of unencrypted memory for the guest anyway...

The kvmclock memory is initially zero so there is no need for the
hypervisor to allocate anything; the point of these patches is just to
access the data in a natural way from Linux source code.

I also don't really like the patch as is (plus it fails modpost), but
IMO reusing __change_page_attr and __split_large_page is the right thing
to do.

Paolo
Borislav Petkov March 17, 2017, 10:17 a.m. UTC | #6
On Thu, Mar 16, 2017 at 11:25:36PM +0100, Paolo Bonzini wrote:
> The kvmclock memory is initially zero so there is no need for the
> hypervisor to allocate anything; the point of these patches is just to
> access the data in a natural way from Linux source code.

I realize that.

> I also don't really like the patch as is (plus it fails modpost), but
> IMO reusing __change_page_attr and __split_large_page is the right thing
> to do.

Right, so teaching pageattr.c about memblock could theoretically come
around and bite us later when a page allocated with memblock gets freed
with free_page().

And looking at this more, we have all this kernel pagetable preparation
code down the init_mem_mapping() call and the pagetable setup in
arch/x86/mm/init_{32,64}.c

And that code even does some basic page splitting. Oh and it uses
alloc_low_pages() which knows whether to do memblock reservation or the
common __get_free_pages() when slabs are up.

So what would be much cleaner, IMHO, is if one would reuse that code to
change init_mm.pgd mappings early without copying pageattr.c.

init_mem_mapping() gets called before kvm_guest_init() in setup_arch()
so the guest would simply fixup its pagetable right there.
Paolo Bonzini March 17, 2017, 10:47 a.m. UTC | #7
On 17/03/2017 11:17, Borislav Petkov wrote:
> 
>> I also don't really like the patch as is (plus it fails modpost), but
>> IMO reusing __change_page_attr and __split_large_page is the right thing
>> to do.
> 
> Right, so teaching pageattr.c about memblock could theoretically come
> around and bite us later when a page allocated with memblock gets freed
> with free_page().

Theoretically or practically?

> And looking at this more, we have all this kernel pagetable preparation
> code down the init_mem_mapping() call and the pagetable setup in
> arch/x86/mm/init_{32,64}.c

It only looks at the E820 map, doesn't it?  Why does it have to do
anything with percpu memory areas?

Paolo

> And that code even does some basic page splitting. Oh and it uses
> alloc_low_pages() which knows whether to do memblock reservation or the
> common __get_free_pages() when slabs are up.
> 
> So what would be much cleaner, IMHO, is if one would reuse that code to
> change init_mm.pgd mappings early without copying pageattr.c.
> 
> init_mem_mapping() gets called before kvm_guest_init() in setup_arch()
> so the guest would simply fixup its pagetable right there.
Borislav Petkov March 17, 2017, 10:56 a.m. UTC | #8
On Fri, Mar 17, 2017 at 11:47:16AM +0100, Paolo Bonzini wrote:
> Theoretically or practically?

In the sense, it needs to be tried first to see how ugly it can get.

> It only looks at the E820 map, doesn't it?  Why does it have to do
> anything with percpu memory areas?

That's irrelevant. What we want to do is take what's in init_mm.pgd and
modify it. And use the facilities in arch/x86/mm/init_{32,64}.c because
they already know about early/late pagetable pages allocation and they
deal with the kernel pagetable anyway.

And *not* teach pageattr.c about memblock because that can be misused,
as tglx pointed out on IRC.
Paolo Bonzini March 17, 2017, 11:03 a.m. UTC | #9
On 17/03/2017 11:56, Borislav Petkov wrote:
>> Theoretically or practically?
> In the sense, it needs to be tried first to see how ugly it can get.
> 
>> It only looks at the E820 map, doesn't it?  Why does it have to do
>> anything with percpu memory areas?
> That's irrelevant. What we want to do is take what's in init_mm.pgd and
> modify it. And use the facilities in arch/x86/mm/init_{32,64}.c because
> they already know about early/late pagetable pages allocation and they
> deal with the kernel pagetable anyway.

If it is possible to do it in a fairly hypervisor-independent manner,
I'm all for it.  That is, only by looking at AMD-specified CPUID leaves
and at kernel ELF sections.

Paolo

> And *not* teach pageattr.c about memblock because that can be misused,
> as tglx pointed out on IRC.
Borislav Petkov March 17, 2017, 11:33 a.m. UTC | #10
On Fri, Mar 17, 2017 at 12:03:31PM +0100, Paolo Bonzini wrote:

> If it is possible to do it in a fairly hypervisor-independent manner,
> I'm all for it.  That is, only by looking at AMD-specified CPUID leaves
> and at kernel ELF sections.

Not even that.

What that needs to be able to do is:

	kvm_map_percpu_hv_shared(st, sizeof(*st)))

where st is the percpu steal time ptr:

	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

Underneath, what it does basically is it clears the encryption mask from
the pte, see patch 16/32.

And I keep talking about SEV-ES because this is going to expand on the
need of having a shared memory region which the hypervisor and the guest
needs to access, thus unencrypted. See

http://support.amd.com/TechDocs/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf

This is where you come in and say what would be the best approach there...
Paolo Bonzini March 17, 2017, 2:45 p.m. UTC | #11
On 17/03/2017 12:33, Borislav Petkov wrote:
> On Fri, Mar 17, 2017 at 12:03:31PM +0100, Paolo Bonzini wrote:
> 
>> If it is possible to do it in a fairly hypervisor-independent manner,
>> I'm all for it.  That is, only by looking at AMD-specified CPUID leaves
>> and at kernel ELF sections.
> 
> Not even that.
> 
> What that needs to be able to do is:
> 
> 	kvm_map_percpu_hv_shared(st, sizeof(*st)))
> 
> where st is the percpu steal time ptr:
> 
> 	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
> 
> Underneath, what it does basically is it clears the encryption mask from
> the pte, see patch 16/32.

Yes, and I'd like that to be done with a new data section rather than a
special KVM hook.

> And I keep talking about SEV-ES because this is going to expand on the
> need of having a shared memory region which the hypervisor and the guest
> needs to access, thus unencrypted. See
> 
> http://support.amd.com/TechDocs/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf
> 
> This is where you come in and say what would be the best approach there...

I have no idea.  SEV-ES seems to be very hard to set up at the beginning
of the kernel bootstrap.  There's all sorts of chicken and egg problems,
as well as complicated handshakes between the firmware and the guest,
and the way to do it also depends on the trust and threat models.

A much simpler way is to just boot under a trusted hypervisor, do
"modprobe sev-es" and save a snapshot of the guest.  Then you sign the
snapshot and pass it to your cloud provider.

Paolo
Borislav Petkov March 18, 2017, 4:37 p.m. UTC | #12
On Fri, Mar 17, 2017 at 03:45:26PM +0100, Paolo Bonzini wrote:
> Yes, and I'd like that to be done with a new data section rather than a
> special KVM hook.

Can you give more details about how pls? Or is there already an example for that
somewhere in the kvm code?

> I have no idea.  SEV-ES seems to be very hard to set up at the beginning
> of the kernel bootstrap.  There's all sorts of chicken and egg problems,
> as well as complicated handshakes between the firmware and the guest,
> and the way to do it also depends on the trust and threat models.
> 
> A much simpler way is to just boot under a trusted hypervisor, do
> "modprobe sev-es" and save a snapshot of the guest.  Then you sign the
> snapshot and pass it to your cloud provider.

Right, especially the early trapping could be a pain. I don't think this
is cast in stone yet, though...

We'll see.

Thanks.
diff mbox

Patch

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 46cc89d..9e4ab3b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -14,6 +14,7 @@ 
 #include <linux/gfp.h>
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
+#include <linux/memblock.h>
 
 #include <asm/e820/api.h>
 #include <asm/processor.h>
@@ -37,6 +38,7 @@  struct cpa_data {
 	int		flags;
 	unsigned long	pfn;
 	unsigned	force_split : 1;
+	unsigned	force_memblock :1;
 	int		curpage;
 	struct page	**pages;
 };
@@ -627,9 +629,8 @@  try_preserve_large_page(pte_t *kpte, unsigned long address,
 
 static int
 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
-		   struct page *base)
+		  pte_t *pbase, unsigned long new_pfn)
 {
-	pte_t *pbase = (pte_t *)page_address(base);
 	unsigned long ref_pfn, pfn, pfninc = 1;
 	unsigned int i, level;
 	pte_t *tmp;
@@ -646,7 +647,7 @@  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 		return 1;
 	}
 
-	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+	paravirt_alloc_pte(&init_mm, new_pfn);
 
 	switch (level) {
 	case PG_LEVEL_2M:
@@ -707,7 +708,8 @@  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	 * pagetable protections, the actual ptes set above control the
 	 * primary protection behavior:
 	 */
-	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+	__set_pmd_pte(kpte, address,
+		native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));
 
 	/*
 	 * Intel Atom errata AAH41 workaround.
@@ -723,21 +725,50 @@  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	return 0;
 }
 
+static pte_t *try_alloc_pte(struct cpa_data *cpa, unsigned long *pfn)
+{
+	unsigned long phys;
+	struct page *base;
+
+	if (cpa->force_memblock) {
+		phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!phys)
+			return NULL;
+		*pfn = phys >> PAGE_SHIFT;
+		return (pte_t *)__va(phys);
+	}
+
+	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	if (!base)
+		return NULL;
+	*pfn = page_to_pfn(base);
+	return (pte_t *)page_address(base);
+}
+
+static void try_free_pte(struct cpa_data *cpa, pte_t *pte)
+{
+	if (cpa->force_memblock)
+		memblock_free(__pa(pte), PAGE_SIZE);
+	else
+		__free_page((struct page *)pte);
+}
+
 static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 			    unsigned long address)
 {
-	struct page *base;
+	pte_t *new_pte;
+	unsigned long new_pfn;
 
 	if (!debug_pagealloc_enabled())
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	new_pte = try_alloc_pte(cpa, &new_pfn);
 	if (!debug_pagealloc_enabled())
 		spin_lock(&cpa_lock);
-	if (!base)
+	if (!new_pte)
 		return -ENOMEM;
 
-	if (__split_large_page(cpa, kpte, address, base))
-		__free_page(base);
+	if (__split_large_page(cpa, kpte, address, new_pte, new_pfn))
+		try_free_pte(cpa, new_pte);
 
 	return 0;
 }
@@ -2035,6 +2066,7 @@  int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
 			    unsigned numpages, unsigned long page_flags)
 {
 	int retval = -EINVAL;
+	int use_memblock = !slab_is_available();
 
 	struct cpa_data cpa = {
 		.vaddr = &address,
@@ -2044,6 +2076,7 @@  int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
 		.mask_set = __pgprot(0),
 		.mask_clr = __pgprot(0),
 		.flags = 0,
+		.force_memblock = use_memblock,
 	};
 
 	if (!(__supported_pte_mask & _PAGE_NX))