
[v3,13/21] x86/virt/tdx: Allocate and set up PAMTs for TDMRs

Message ID: ffc2eefdd212a31278978e8bfccd571355db69b0.1649219184.git.kai.huang@intel.com
State: New, archived
Series: TDX host kernel support

Commit Message

Huang, Kai April 6, 2022, 4:49 a.m. UTC
In order to provide crypto protection to guests, the TDX module uses
additional metadata to record things like which guest "owns" a given
page of memory.  This metadata, referred to as Physical Address Metadata
Table (PAMT), essentially serves as the 'struct page' for the TDX
module.  PAMTs are not reserved by hardware upfront.  They must be
allocated by the kernel and then given to the TDX module.

TDX supports 3 page sizes: 4K, 2M, and 1G.  Each "TD Memory Region"
(TDMR) has 3 PAMTs to track the 3 supported page sizes respectively.
Each PAMT must be a physically contiguous area from the Convertible
Memory Regions (CMR).  However, the PAMTs which track pages in one TDMR
do not need to reside within that TDMR but can be anywhere in CMRs.
If one PAMT overlaps with any TDMR, the overlapping part must be
reported as a reserved area in that particular TDMR.

Use alloc_contig_pages() since PAMT must be a physically contiguous area
and it may be potentially large (~1/256th of the size of the given TDMR).

The current version of TDX supports at most 16 reserved areas per TDMR
to cover both PAMTs and potential memory holes within the TDMR.  If many
PAMTs are allocated within a single TDMR, 16 reserved areas may not be
sufficient to cover all of them.

Adopt the following policies when allocating PAMTs for a given TDMR:

  - Allocate three PAMTs of the TDMR in one contiguous chunk to minimize
    the total number of reserved areas consumed for PAMTs.
  - Try to first allocate PAMT from the local node of the TDMR for better
    NUMA locality.

Signed-off-by: Kai Huang <kai.huang@intel.com>
---
 arch/x86/Kconfig            |   1 +
 arch/x86/virt/vmx/tdx/tdx.c | 165 ++++++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
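
As a rough illustration of the "~1/256th" figure (assuming a 16-byte
PAMT entry purely for illustration; the real entry size is reported by
the TDX module in tdx_sysinfo.pamt_entry_size), the arithmetic for a
64GB TDMR works out to:

	4K PAMT: (64GB / 4KB) entries * 16 bytes = 256MB
	2M PAMT: (64GB / 2MB) entries * 16 bytes = 512KB
	1G PAMT: (64GB / 1GB) entries * 16 bytes =   1KB

	total: ~256.5MB, i.e. ~1/256th of the TDMR, dominated by
	the 4K PAMT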

Comments

Dave Hansen April 28, 2022, 5:12 p.m. UTC | #1
On 4/5/22 21:49, Kai Huang wrote:
> In order to provide crypto protection to guests, the TDX module uses
> additional metadata to record things like which guest "owns" a given
> page of memory.  This metadata, referred to as Physical Address Metadata
> Table (PAMT), essentially serves as the 'struct page' for the TDX
> module.  PAMTs are not reserved by hardware upfront.  They must be
> allocated by the kernel and then given to the TDX module.
> 
> TDX supports 3 page sizes: 4K, 2M, and 1G.  Each "TD Memory Region"
> (TDMR) has 3 PAMTs to track the 3 supported page sizes respectively.

s/respectively//

> Each PAMT must be a physically contiguous area from the Convertible

							^ s/the/a/

> Memory Regions (CMR).  However, the PAMTs which track pages in one TDMR
> do not need to reside within that TDMR but can be anywhere in CMRs.
> If one PAMT overlaps with any TDMR, the overlapping part must be
> reported as a reserved area in that particular TDMR.
> 
> Use alloc_contig_pages() since PAMT must be a physically contiguous area
> and it may be potentially large (~1/256th of the size of the given TDMR).

This is also a good place to note the downsides of using
alloc_contig_pages().

> The current version of TDX supports at most 16 reserved areas per TDMR
> to cover both PAMTs and potential memory holes within the TDMR.  If many
> PAMTs are allocated within a single TDMR, 16 reserved areas may not be
> sufficient to cover all of them.
> 
> Adopt the following policies when allocating PAMTs for a given TDMR:
> 
>   - Allocate three PAMTs of the TDMR in one contiguous chunk to minimize
>     the total number of reserved areas consumed for PAMTs.
>   - Try to first allocate PAMT from the local node of the TDMR for better
>     NUMA locality.
> 
> Signed-off-by: Kai Huang <kai.huang@intel.com>
> ---
>  arch/x86/Kconfig            |   1 +
>  arch/x86/virt/vmx/tdx/tdx.c | 165 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 166 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 7414625b938f..ff68d0829bd7 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
>  	depends on CPU_SUP_INTEL
>  	depends on X86_64
>  	select NUMA_KEEP_MEMINFO if NUMA
> +	depends on CONTIG_ALLOC
>  	help
>  	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
>  	  host and certain physical attacks.  This option enables necessary TDX
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index 82534e70df96..1b807dcbc101 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -21,6 +21,7 @@
>  #include <asm/cpufeatures.h>
>  #include <asm/virtext.h>
>  #include <asm/e820/api.h>
> +#include <asm/pgtable.h>
>  #include <asm/tdx.h>
>  #include "tdx.h"
>  
> @@ -66,6 +67,16 @@
>  #define TDMR_START(_tdmr)	((_tdmr)->base)
>  #define TDMR_END(_tdmr)		((_tdmr)->base + (_tdmr)->size)
>  
> +/* Page sizes supported by TDX */
> +enum tdx_page_sz {
> +	TDX_PG_4K = 0,
> +	TDX_PG_2M,
> +	TDX_PG_1G,
> +	TDX_PG_MAX,
> +};

Is that =0 required?  I thought the first enum was defined to be 0.

> +#define TDX_HPAGE_SHIFT	9
> +
>  /*
>   * TDX module status during initialization
>   */
> @@ -959,6 +970,148 @@ static int create_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
>  	return ret;
>  }
>  
> +/* Calculate PAMT size given a TDMR and a page size */
> +static unsigned long __tdmr_get_pamt_sz(struct tdmr_info *tdmr,
> +					enum tdx_page_sz pgsz)
> +{
> +	unsigned long pamt_sz;
> +
> +	pamt_sz = (tdmr->size >> ((TDX_HPAGE_SHIFT * pgsz) + PAGE_SHIFT)) *
> +		tdx_sysinfo.pamt_entry_size;

That 'pgsz' thing is just hideous.  I'd *much* rather see something like
this:

static int tdx_page_size_shift(enum tdx_page_sz page_sz)
{
	switch (page_sz) {
	case TDX_PG_4K:
		return PAGE_SIZE;
	...
	}
}

That's easy to figure out what's going on.
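
A fleshed-out version of that sketch might look like this (note the
fragment above returns PAGE_SIZE from a *_shift() function; a variant
that returns the shift is assumed here, reusing TDX_HPAGE_SHIFT):

static int tdx_page_size_shift(enum tdx_page_sz page_sz)
{
	switch (page_sz) {
	case TDX_PG_4K:
		return PAGE_SHIFT;
	case TDX_PG_2M:
		return PAGE_SHIFT + TDX_HPAGE_SHIFT;
	case TDX_PG_1G:
		return PAGE_SHIFT + TDX_HPAGE_SHIFT * 2;
	default:
		WARN_ON_ONCE(1);
		return PAGE_SHIFT;
	}
}

__tdmr_get_pamt_sz() could then become:

	pamt_sz = (tdmr->size >> tdx_page_size_shift(pgsz)) *
		tdx_sysinfo.pamt_entry_size;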

> +	/* PAMT size must be 4K aligned */
> +	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> +
> +	return pamt_sz;
> +}
> +
> +/* Calculate the size of all PAMTs for a TDMR */
> +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr)
> +{
> +	enum tdx_page_sz pgsz;
> +	unsigned long pamt_sz;
> +
> +	pamt_sz = 0;
> +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++)
> +		pamt_sz += __tdmr_get_pamt_sz(tdmr, pgsz);
> +
> +	return pamt_sz;
> +}

But, there are 3 separate pointers pointing to 3 separate PAMTs.  Why do
they all have to be contiguously allocated?

> +/*
> + * Locate the NUMA node containing the start of the given TDMR's first
> + * RAM entry.  The given TDMR may also cover memory in other NUMA nodes.
> + */

Please add a sentence or two on the implications here of what this means
when it happens.  Also, the joining of e820 regions seems like it might
span NUMA nodes.  What prevents that code from just creating one large
e820 area that leads to one large TDMR and horrible NUMA affinity for
these structures?

> +static int tdmr_get_nid(struct tdmr_info *tdmr)
> +{
> +	u64 start, end;
> +	int i;
> +
> +	/* Find the first RAM entry covered by the TDMR */
> +	e820_for_each_mem(i, start, end)
> +		if (end > TDMR_START(tdmr))
> +			break;

Brackets around the big loop, please.
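
i.e., something like:

	e820_for_each_mem(i, start, end) {
		if (end > TDMR_START(tdmr))
			break;
	}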

> +	/*
> +	 * One TDMR must cover at least one (or partial) RAM entry,
> +	 * otherwise it is kernel bug.  WARN_ON() in this case.
> +	 */
> +	if (WARN_ON_ONCE((start >= end) || start >= TDMR_END(tdmr)))
> +		return 0;
> +
> +	/*
> +	 * The first RAM entry may be partially covered by the previous
> +	 * TDMR.  In this case, use TDMR's start to find the NUMA node.
> +	 */
> +	if (start < TDMR_START(tdmr))
> +		start = TDMR_START(tdmr);
> +
> +	return phys_to_target_node(start);
> +}
> +
> +static int tdmr_setup_pamt(struct tdmr_info *tdmr)
> +{
> +	unsigned long tdmr_pamt_base, pamt_base[TDX_PG_MAX];
> +	unsigned long pamt_sz[TDX_PG_MAX];
> +	unsigned long pamt_npages;
> +	struct page *pamt;
> +	enum tdx_page_sz pgsz;
> +	int nid;

Sooooooooooooooooooo close to reverse Christmas tree, but no cigar.
Please fix it.

> +	/*
> +	 * Allocate one chunk of physically contiguous memory for all
> +	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
> +	 * in overlapped TDMRs.
> +	 */

Ahh, this explains it.  Considering that tdmr_get_pamt_sz() is really
just two lines of code, I'd probably just drop the helper and open-code it
here.  Then you only have one place to comment on it.
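
Roughly (a sketch: the summing loop from tdmr_get_pamt_sz() moved into
tdmr_setup_pamt(), with the comment attached to it):

	unsigned long pamt_npages = 0;

	/*
	 * Allocate one chunk of physically contiguous memory for all
	 * PAMTs.  This helps minimize the PAMT's use of reserved
	 * areas in overlapped TDMRs.
	 */
	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++)
		pamt_npages += __tdmr_get_pamt_sz(tdmr, pgsz) >> PAGE_SHIFT;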

> +	nid = tdmr_get_nid(tdmr);
> +	pamt_npages = tdmr_get_pamt_sz(tdmr) >> PAGE_SHIFT;
> +	pamt = alloc_contig_pages(pamt_npages, GFP_KERNEL, nid,
> +			&node_online_map);
> +	if (!pamt)
> +		return -ENOMEM;
> +
> +	/* Calculate PAMT base and size for all supported page sizes. */
> +	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> +		unsigned long sz = __tdmr_get_pamt_sz(tdmr, pgsz);
> +
> +		pamt_base[pgsz] = tdmr_pamt_base;
> +		pamt_sz[pgsz] = sz;
> +
> +		tdmr_pamt_base += sz;
> +	}
> +
> +	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> +	tdmr->pamt_4k_size = pamt_sz[TDX_PG_4K];
> +	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> +	tdmr->pamt_2m_size = pamt_sz[TDX_PG_2M];
> +	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> +	tdmr->pamt_1g_size = pamt_sz[TDX_PG_1G];

This would all vertically align nicely if you renamed pamt_sz -> pamt_size.

> +	return 0;
> +}
> +
> +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> +{
> +	unsigned long pamt_pfn, pamt_sz;
> +
> +	pamt_pfn = tdmr->pamt_4k_base >> PAGE_SHIFT;

Comment, please:

	/*
	 * The PAMT was allocated in one contiguous unit.  The 4k PAMT
	 * should always point to the beginning of that allocation.
	 */

> +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> +
> +	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> +	if (!pamt_sz)
> +		return;
> +
> +	if (WARN_ON(!pamt_pfn))
> +		return;
> +
> +	free_contig_range(pamt_pfn, pamt_sz >> PAGE_SHIFT);
> +}
> +
> +static void tdmrs_free_pamt_all(struct tdmr_info **tdmr_array, int tdmr_num)
> +{
> +	int i;
> +
> +	for (i = 0; i < tdmr_num; i++)
> +		tdmr_free_pamt(tdmr_array[i]);
> +}
> +
> +/* Allocate and set up PAMTs for all TDMRs */
> +static int tdmrs_setup_pamt_all(struct tdmr_info **tdmr_array, int tdmr_num)

	"set_up", please, not "setup".

> +{
> +	int i, ret;
> +
> +	for (i = 0; i < tdmr_num; i++) {
> +		ret = tdmr_setup_pamt(tdmr_array[i]);
> +		if (ret)
> +			goto err;
> +	}
> +
> +	return 0;
> +err:
> +	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> +	return -ENOMEM;
> +}
> +
>  static int construct_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
>  {
>  	int ret;
> @@ -971,8 +1124,14 @@ static int construct_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
>  	if (ret)
>  		goto err;
>  
> +	ret = tdmrs_setup_pamt_all(tdmr_array, *tdmr_num);
> +	if (ret)
> +		goto err_free_tdmrs;
> +
>  	/* Return -EFAULT until constructing TDMRs is done */
>  	ret = -EFAULT;
> +	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
> +err_free_tdmrs:
>  	free_tdmrs(tdmr_array, *tdmr_num);
>  err:
>  	return ret;
> @@ -1022,6 +1181,12 @@ static int init_tdx_module(void)
>  	 * initialization are done.
>  	 */
>  	ret = -EFAULT;
> +	/*
> +	 * Free PAMTs allocated in construct_tdmrs() when TDX module
> +	 * initialization fails.
> +	 */
> +	if (ret)
> +		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
>  out_free_tdmrs:
>  	/*
>  	 * TDMRs are only used during initializing TDX module.  Always

In a follow-on patch, I'd like this to dump out (in a pr_debug() or
pr_info()) how much memory is consumed by PAMT allocations.
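
A minimal sketch of what that could look like (hypothetical helper and
message, not part of this patch):

	static unsigned long tdmrs_get_pamt_pages(struct tdmr_info **tdmr_array,
						  int tdmr_num)
	{
		unsigned long pamt_npages = 0;
		int i;

		for (i = 0; i < tdmr_num; i++) {
			struct tdmr_info *tdmr = tdmr_array[i];

			pamt_npages += (tdmr->pamt_4k_size + tdmr->pamt_2m_size +
					tdmr->pamt_1g_size) >> PAGE_SHIFT;
		}

		return pamt_npages;
	}

	pr_info("%lu pages allocated for PAMTs\n",
		tdmrs_get_pamt_pages(tdmr_array, tdmr_num));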
Huang, Kai April 29, 2022, 7:46 a.m. UTC | #2
On Thu, 2022-04-28 at 10:12 -0700, Dave Hansen wrote:
> On 4/5/22 21:49, Kai Huang wrote:
> > In order to provide crypto protection to guests, the TDX module uses
> > additional metadata to record things like which guest "owns" a given
> > page of memory.  This metadata, referred to as Physical Address Metadata
> > Table (PAMT), essentially serves as the 'struct page' for the TDX
> > module.  PAMTs are not reserved by hardware upfront.  They must be
> > allocated by the kernel and then given to the TDX module.
> > 
> > TDX supports 3 page sizes: 4K, 2M, and 1G.  Each "TD Memory Region"
> > (TDMR) has 3 PAMTs to track the 3 supported page sizes respectively.
> 
> s/respectively//
> 

Will remove.

> > Each PAMT must be a physically contiguous area from the Convertible
> 
> 							^ s/the/a/

OK.

> 
> > Memory Regions (CMR).  However, the PAMTs which track pages in one TDMR
> > do not need to reside within that TDMR but can be anywhere in CMRs.
> > If one PAMT overlaps with any TDMR, the overlapping part must be
> > reported as a reserved area in that particular TDMR.
> > 
> > Use alloc_contig_pages() since PAMT must be a physically contiguous area
> > and it may be potentially large (~1/256th of the size of the given TDMR).
> 
> This is also a good place to note the downsides of using
> alloc_contig_pages().

For instance:

	The allocation may fail when memory usage is under pressure.

?

> 
> > The current version of TDX supports at most 16 reserved areas per TDMR
> > to cover both PAMTs and potential memory holes within the TDMR.  If many
> > PAMTs are allocated within a single TDMR, 16 reserved areas may not be
> > sufficient to cover all of them.
> > 
> > Adopt the following policies when allocating PAMTs for a given TDMR:
> > 
> >   - Allocate three PAMTs of the TDMR in one contiguous chunk to minimize
> >     the total number of reserved areas consumed for PAMTs.
> >   - Try to first allocate PAMT from the local node of the TDMR for better
> >     NUMA locality.
> > 
> > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > ---
> >  arch/x86/Kconfig            |   1 +
> >  arch/x86/virt/vmx/tdx/tdx.c | 165 ++++++++++++++++++++++++++++++++++++
> >  2 files changed, 166 insertions(+)
> > 
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 7414625b938f..ff68d0829bd7 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
> >  	depends on CPU_SUP_INTEL
> >  	depends on X86_64
> >  	select NUMA_KEEP_MEMINFO if NUMA
> > +	depends on CONTIG_ALLOC
> >  	help
> >  	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
> >  	  host and certain physical attacks.  This option enables necessary TDX
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index 82534e70df96..1b807dcbc101 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -21,6 +21,7 @@
> >  #include <asm/cpufeatures.h>
> >  #include <asm/virtext.h>
> >  #include <asm/e820/api.h>
> > +#include <asm/pgtable.h>
> >  #include <asm/tdx.h>
> >  #include "tdx.h"
> >  
> > @@ -66,6 +67,16 @@
> >  #define TDMR_START(_tdmr)	((_tdmr)->base)
> >  #define TDMR_END(_tdmr)		((_tdmr)->base + (_tdmr)->size)
> >  
> > +/* Page sizes supported by TDX */
> > +enum tdx_page_sz {
> > +	TDX_PG_4K = 0,
> > +	TDX_PG_2M,
> > +	TDX_PG_1G,
> > +	TDX_PG_MAX,
> > +};
> 
> Is that =0 required?  I thought the first enum was defined to be 0.

No it's not required.  Will remove.

> 
> > +#define TDX_HPAGE_SHIFT	9
> > +
> >  /*
> >   * TDX module status during initialization
> >   */
> > @@ -959,6 +970,148 @@ static int create_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
> >  	return ret;
> >  }
> >  
> > +/* Calculate PAMT size given a TDMR and a page size */
> > +static unsigned long __tdmr_get_pamt_sz(struct tdmr_info *tdmr,
> > +					enum tdx_page_sz pgsz)
> > +{
> > +	unsigned long pamt_sz;
> > +
> > +	pamt_sz = (tdmr->size >> ((TDX_HPAGE_SHIFT * pgsz) + PAGE_SHIFT)) *
> > +		tdx_sysinfo.pamt_entry_size;
> 
> That 'pgsz' thing is just hideous.  I'd *much* rather see something like
> this:
> 
> static int tdx_page_size_shift(enum tdx_page_sz page_sz)
> {
> 	switch (page_sz) {
> 	case TDX_PG_4K:
> 		return PAGE_SIZE;
> 	...
> 	}
> }
> 
> That's easy to figure out what's going on.

OK. Will do.

> 
> > +	/* PAMT size must be 4K aligned */
> > +	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> > +
> > +	return pamt_sz;
> > +}
> > +
> > +/* Calculate the size of all PAMTs for a TDMR */
> > +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr)
> > +{
> > +	enum tdx_page_sz pgsz;
> > +	unsigned long pamt_sz;
> > +
> > +	pamt_sz = 0;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++)
> > +		pamt_sz += __tdmr_get_pamt_sz(tdmr, pgsz);
> > +
> > +	return pamt_sz;
> > +}
> 
> But, there are 3 separate pointers pointing to 3 separate PAMTs.  Why do
> they all have to be contiguously allocated?

It is also explained in the changelog (the last two paragraphs).

> 
> > +/*
> > + * Locate the NUMA node containing the start of the given TDMR's first
> > + * RAM entry.  The given TDMR may also cover memory in other NUMA nodes.
> > + */
> 
> Please add a sentence or two on the implications here of what this means
> when it happens.  Also, the joining of e820 regions seems like it might
> span NUMA nodes.  What prevents that code from just creating one large
> e820 area that leads to one large TDMR and horrible NUMA affinity for
> these structures?

How about adding:

	When a TDMR is created, it stops spanning at a NUMA boundary.

> 
> > +static int tdmr_get_nid(struct tdmr_info *tdmr)
> > +{
> > +	u64 start, end;
> > +	int i;
> > +
> > +	/* Find the first RAM entry covered by the TDMR */
> > +	e820_for_each_mem(i, start, end)
> > +		if (end > TDMR_START(tdmr))
> > +			break;
> 
> Brackets around the big loop, please.

OK.

> 
> > +	/*
> > +	 * One TDMR must cover at least one (or partial) RAM entry,
> > +	 * otherwise it is kernel bug.  WARN_ON() in this case.
> > +	 */
> > +	if (WARN_ON_ONCE((start >= end) || start >= TDMR_END(tdmr)))
> > +		return 0;
> > +
> > +	/*
> > +	 * The first RAM entry may be partially covered by the previous
> > +	 * TDMR.  In this case, use TDMR's start to find the NUMA node.
> > +	 */
> > +	if (start < TDMR_START(tdmr))
> > +		start = TDMR_START(tdmr);
> > +
> > +	return phys_to_target_node(start);
> > +}
> > +
> > +static int tdmr_setup_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long tdmr_pamt_base, pamt_base[TDX_PG_MAX];
> > +	unsigned long pamt_sz[TDX_PG_MAX];
> > +	unsigned long pamt_npages;
> > +	struct page *pamt;
> > +	enum tdx_page_sz pgsz;
> > +	int nid;
> 
> Sooooooooooooooooooo close to reverse Christmas tree, but no cigar.
> Please fix it.

Will fix.  Thanks.

> 
> > +	/*
> > +	 * Allocate one chunk of physically contiguous memory for all
> > +	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
> > +	 * in overlapped TDMRs.
> > +	 */
> 
> Ahh, this explains it.  Considering that tdmr_get_pamt_sz() is really
> just two lines of code, I'd probably just drop the helper and open-code it
> here.  Then you only have one place to comment on it.

It has a loop and internally calls __tdmr_get_pamt_sz().  It doesn't look
like it fits if we open-code it here.

How about moving this comment to tdmr_get_pamt_sz()?

> 
> > +	nid = tdmr_get_nid(tdmr);
> > +	pamt_npages = tdmr_get_pamt_sz(tdmr) >> PAGE_SHIFT;
> > +	pamt = alloc_contig_pages(pamt_npages, GFP_KERNEL, nid,
> > +			&node_online_map);
> > +	if (!pamt)
> > +		return -ENOMEM;
> > +
> > +	/* Calculate PAMT base and size for all supported page sizes. */
> > +	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > +		unsigned long sz = __tdmr_get_pamt_sz(tdmr, pgsz);
> > +
> > +		pamt_base[pgsz] = tdmr_pamt_base;
> > +		pamt_sz[pgsz] = sz;
> > +
> > +		tdmr_pamt_base += sz;
> > +	}
> > +
> > +	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> > +	tdmr->pamt_4k_size = pamt_sz[TDX_PG_4K];
> > +	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> > +	tdmr->pamt_2m_size = pamt_sz[TDX_PG_2M];
> > +	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> > +	tdmr->pamt_1g_size = pamt_sz[TDX_PG_1G];
> 
> This would all vertically align nicely if you renamed pamt_sz -> pamt_size.

OK.

> 
> > +	return 0;
> > +}
> > +
> > +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long pamt_pfn, pamt_sz;
> > +
> > +	pamt_pfn = tdmr->pamt_4k_base >> PAGE_SHIFT;
> 
> Comment, please:
> 
> 	/*
> 	 * The PAMT was allocated in one contiguous unit.  The 4k PAMT
> 	 * should always point to the beginning of that allocation.
> 	 */

Thanks will add.

> 
> > +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> > +
> > +	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> > +	if (!pamt_sz)
> > +		return;
> > +
> > +	if (WARN_ON(!pamt_pfn))
> > +		return;
> > +
> > +	free_contig_range(pamt_pfn, pamt_sz >> PAGE_SHIFT);
> > +}
> > +
> > +static void tdmrs_free_pamt_all(struct tdmr_info **tdmr_array, int tdmr_num)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < tdmr_num; i++)
> > +		tdmr_free_pamt(tdmr_array[i]);
> > +}
> > +
> > +/* Allocate and set up PAMTs for all TDMRs */
> > +static int tdmrs_setup_pamt_all(struct tdmr_info **tdmr_array, int tdmr_num)
> 
> 	"set_up", please, not "setup".

OK.

> 
> > +{
> > +	int i, ret;
> > +
> > +	for (i = 0; i < tdmr_num; i++) {
> > +		ret = tdmr_setup_pamt(tdmr_array[i]);
> > +		if (ret)
> > +			goto err;
> > +	}
> > +
> > +	return 0;
> > +err:
> > +	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > +	return -ENOMEM;
> > +}
> > +
> >  static int construct_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
> >  {
> >  	int ret;
> > @@ -971,8 +1124,14 @@ static int construct_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
> >  	if (ret)
> >  		goto err;
> >  
> > +	ret = tdmrs_setup_pamt_all(tdmr_array, *tdmr_num);
> > +	if (ret)
> > +		goto err_free_tdmrs;
> > +
> >  	/* Return -EFAULT until constructing TDMRs is done */
> >  	ret = -EFAULT;
> > +	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
> > +err_free_tdmrs:
> >  	free_tdmrs(tdmr_array, *tdmr_num);
> >  err:
> >  	return ret;
> > @@ -1022,6 +1181,12 @@ static int init_tdx_module(void)
> >  	 * initialization are done.
> >  	 */
> >  	ret = -EFAULT;
> > +	/*
> > +	 * Free PAMTs allocated in construct_tdmrs() when TDX module
> > +	 * initialization fails.
> > +	 */
> > +	if (ret)
> > +		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> >  out_free_tdmrs:
> >  	/*
> >  	 * TDMRs are only used during initializing TDX module.  Always
> 
> In a follow-on patch, I'd like this to dump out (in a pr_debug() or
> pr_info()) how much memory is consumed by PAMT allocations.

OK.
Dave Hansen April 29, 2022, 2:20 p.m. UTC | #3
On 4/29/22 00:46, Kai Huang wrote:
> On Thu, 2022-04-28 at 10:12 -0700, Dave Hansen wrote:
>> This is also a good place to note the downsides of using
>> alloc_contig_pages().
> 
> For instance:
> 
> 	The allocation may fail when memory usage is under pressure.

It's not really memory pressure, though.  The larger the allocation, the
more likely it is to fail.  The more likely it is that the kernel can't
free the memory, or that, if you need 1GB of contiguous memory,
999.996MB gets freed but there is one stubborn page left.

alloc_contig_pages() can and will fail.  The only mitigation which is
guaranteed to avoid this is doing the allocation at boot.  But, you're
not doing that to avoid wasting memory on every TDX system that doesn't
use TDX.

A *good* way (although not foolproof) is to launch a TDX VM early in
boot before memory gets fragmented or consumed.  You might even want to
recommend this in the documentation.

>>> +/*
>>> + * Locate the NUMA node containing the start of the given TDMR's first
>>> + * RAM entry.  The given TDMR may also cover memory in other NUMA nodes.
>>> + */
>>
>> Please add a sentence or two on the implications here of what this means
>> when it happens.  Also, the joining of e820 regions seems like it might
>> span NUMA nodes.  What prevents that code from just creating one large
>> e820 area that leads to one large TDMR and horrible NUMA affinity for
>> these structures?
> 
> How about adding:
> 
> 	When a TDMR is created, it stops spanning at a NUMA boundary.

I actually don't know what that means at all.  I was thinking of
something like this.

/*
 * Pick a NUMA node on which to allocate this TDMR's metadata.
 *
 * This is imprecise since TDMRs are 1GB aligned and NUMA nodes might
 * not be.  If the TDMR covers more than one node, just use the _first_
 * one.  This can lead to small areas of off-node metadata for some
 * memory.
 */

>>> +static int tdmr_get_nid(struct tdmr_info *tdmr)
>>> +{
>>> +	u64 start, end;
>>> +	int i;
>>> +
>>> +	/* Find the first RAM entry covered by the TDMR */

There's something else missing in here.  Why not just do:

	return phys_to_target_node(TDMR_START(tdmr));

This would explain it:

	/*
	 * The beginning of the TDMR might not point to RAM.
	 * Find its first RAM address, from which its node can
	 * be found.
	 */

>>> +	e820_for_each_mem(i, start, end)
>>> +		if (end > TDMR_START(tdmr))
>>> +			break;
>>
>> Brackets around the big loop, please.
> 
> OK.
> 
>>
>>> +	/*
>>> +	 * One TDMR must cover at least one (or partial) RAM entry,
>>> +	 * otherwise it is kernel bug.  WARN_ON() in this case.
>>> +	 */
>>> +	if (WARN_ON_ONCE((start >= end) || start >= TDMR_END(tdmr)))
>>> +		return 0;

This really means "no RAM found for this TDMR", right?  Can we say that,
please.


>>> +	/*
>>> +	 * Allocate one chunk of physically contiguous memory for all
>>> +	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
>>> +	 * in overlapped TDMRs.
>>> +	 */
>>
>> Ahh, this explains it.  Considering that tdmr_get_pamt_sz() is really
>> just two lines of code, I'd probably just drop the helper and open-code it
>> here.  Then you only have one place to comment on it.
> 
> It has a loop and internally calls __tdmr_get_pamt_sz().  It doesn't look
> like it fits if we open-code it here.
> 
> How about moving this comment to tdmr_get_pamt_sz()?

I thought about that.  But tdmr_get_pamt_sz() isn't itself doing any
allocation so it doesn't make a whole lot of logical sense.  This is a
place where a helper _can_ be removed.  Remove it, please.
Sean Christopherson April 29, 2022, 2:30 p.m. UTC | #4
On Fri, Apr 29, 2022, Dave Hansen wrote:
> On 4/29/22 00:46, Kai Huang wrote:
> > On Thu, 2022-04-28 at 10:12 -0700, Dave Hansen wrote:
> >> This is also a good place to note the downsides of using
> >> alloc_contig_pages().
> > 
> > For instance:
> > 
> > 	The allocation may fail when memory usage is under pressure.
> 
> It's not really memory pressure, though.  The larger the allocation, the
> more likely it is to fail.  The more likely it is that the kernel can't
> free the memory, or that, if you need 1GB of contiguous memory,
> 999.996MB gets freed but there is one stubborn page left.
> 
> alloc_contig_pages() can and will fail.  The only mitigation which is
> guaranteed to avoid this is doing the allocation at boot.  But, you're
> not doing that to avoid wasting memory on every TDX system that doesn't
> use TDX.
> 
> A *good* way (although not foolproof) is to launch a TDX VM early in
> boot before memory gets fragmented or consumed.  You might even want to
> recommend this in the documentation.

What about providing a kernel param to tell the kernel to do the allocation during
boot?  Or maybe a sysfs knob to reserve/free the memory, a la nr_overcommit_hugepages?

I suspect that most/all deployments that actually want to use TDX would much prefer
to eat the overhead if TDX VMs are never scheduled on the host, as opposed to having
to deal with a host in a TDX pool not actually being able to run TDX VMs.
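
(Purely illustrative, with made-up names, such a boot parameter could
look like:

	static unsigned long tdx_pamt_reserve_pages;

	static int __init setup_tdx_pamt_reserve(char *str)
	{
		return kstrtoul(str, 0, &tdx_pamt_reserve_pages);
	}
	early_param("tdx_pamt_reserve", setup_tdx_pamt_reserve);

with boot code reserving that many pages per node before memory gets
fragmented.)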
Dave Hansen April 29, 2022, 5:46 p.m. UTC | #5
On 4/29/22 07:30, Sean Christopherson wrote:
> On Fri, Apr 29, 2022, Dave Hansen wrote:
...
>> A *good* way (although not foolproof) is to launch a TDX VM early
>> in boot before memory gets fragmented or consumed.  You might even
>> want to recommend this in the documentation.
> 
> What about providing a kernel param to tell the kernel to do the
> allocation during boot?

I think that's where we'll end up eventually.  But, I also want to defer
that discussion until after we have something merged.

Right now, allocating the PAMTs precisely requires running the TDX
module.  Running the TDX module requires VMXON.  VMXON is only done by
KVM.  KVM isn't necessarily there during boot.  So, it's hard to do
precisely today without a bunch of mucking with VMX.

But, it would be really easy to do something less precise like:

	tdx_reserve_ratio=255

...

u8 *pamt_reserve[MAX_NR_NODES];

	for_each_online_node(n) {
		pamt_pages = node_spanned_pages(n) / tdx_reserve_ratio;
		pamt_reserve[n] = alloc_bootmem(pamt_pages);
	}

Then have the TDX code use pamt_reserve[] instead of allocating more
memory when it is needed later.
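
e.g., roughly (continuing the sketch above; error handling omitted):

	/* Prefer the boot-time reservation, fall back to runtime allocation */
	if (pamt_reserve[nid])
		pamt = virt_to_page(pamt_reserve[nid]);
	else
		pamt = alloc_contig_pages(pamt_npages, GFP_KERNEL, nid,
					  &node_online_map);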

That will work just fine as long as you know up front how much metadata
TDX needs.  If the metadata requirements change in an updated TDX
module, the command-line will need to be updated to regain the
guarantee.  But, it can still fall back to the best-effort code that is
in the series today.

In other words, I think we want what is in the series today no matter
what, and we'll want it forever.  That's why it's the *one* way of doing
things now.  I entirely agree that there will be TDX users that want a
stronger guarantee.

You can arm-wrestle the distro folks who hate adding command-line tweaks
when the time comes. ;)
Sean Christopherson April 29, 2022, 6:19 p.m. UTC | #6
On Fri, Apr 29, 2022, Dave Hansen wrote:
> On 4/29/22 07:30, Sean Christopherson wrote:
> > On Fri, Apr 29, 2022, Dave Hansen wrote:
> ...
> >> A *good* way (although not foolproof) is to launch a TDX VM early
> >> in boot before memory gets fragmented or consumed.  You might even
> >> want to recommend this in the documentation.
> > 
> > What about providing a kernel param to tell the kernel to do the
> > allocation during boot?
> 
> I think that's where we'll end up eventually.  But, I also want to defer
> that discussion until after we have something merged.
> 
> Right now, allocating the PAMTs precisely requires running the TDX
> module.  Running the TDX module requires VMXON.  VMXON is only done by
> KVM.  KVM isn't necessarily there during boot.  So, it's hard to do
> precisely today without a bunch of mucking with VMX.

Meh, it's hard only if we ignore the fact that the PAMT entry size isn't going
to change for a given TDX module, and is extremely unlikely to change in general.

Odds are good the kernel can hardcode a sane default and Just Work.  Or provide
the assumed size of a PAMT entry via module param.  If the size ends up being
wrong, log an error, free the reserved memory, and move on with TDX setup with
the correct size.

> You can arm-wrestle the distro folks who hate adding command-line tweaks
> when the time comes. ;)

Sure, you just find me the person that's going to run TDX guests with an
off-the-shelf distro kernel :-D
Dave Hansen April 29, 2022, 6:32 p.m. UTC | #7
On 4/29/22 11:19, Sean Christopherson wrote:
> On Fri, Apr 29, 2022, Dave Hansen wrote:
>> On 4/29/22 07:30, Sean Christopherson wrote:
>>> On Fri, Apr 29, 2022, Dave Hansen wrote:
>> ...
>>>> A *good* way (although not foolproof) is to launch a TDX VM early
>>>> in boot before memory gets fragmented or consumed.  You might even
>>>> want to recommend this in the documentation.
>>>
>>> What about providing a kernel param to tell the kernel to do the
>>> allocation during boot?
>>
>> I think that's where we'll end up eventually.  But, I also want to defer
>> that discussion until after we have something merged.
>>
>> Right now, allocating the PAMTs precisely requires running the TDX
>> module.  Running the TDX module requires VMXON.  VMXON is only done by
>> KVM.  KVM isn't necessarily there during boot.  So, it's hard to do
>> precisely today without a bunch of mucking with VMX.
> 
> Meh, it's hard only if we ignore the fact that the PAMT entry size isn't going
> to change for a given TDX module, and is extremely unlikely to change in general.
> 
> Odds are good the kernel can hardcode a sane default and Just Work.  Or provide
> the assumed size of a PAMT entry via module param.  If the size ends up being
> wrong, log an error, free the reserved memory, and move on with TDX setup with
> the correct size.

Sure.  The boot param could be:

	tdx_reserve_whatever=auto

and then it can be overridden if necessary.  I just don't want to have
kernel binaries that are only good as paperweights if Intel decides it
needs another byte of metadata.

>> You can arm-wrestle the distro folks who hate adding command-line tweaks
>> when the time comes. ;)
> 
> Sure, you just find me the person that's going to run TDX guests with an
> off-the-shelf distro kernel :-D

Well, everyone gets their kernel from upstream eventually and everyone
watches upstream.

But, in all seriousness, do you really expect TDX to remain solely in
the non-distro-kernel crowd forever?  I expect the fancy cloud
providers (with custom kernels) who care the most to deploy TDX first.
But, things will trickle down to the distro crowd over time.
Huang, Kai May 2, 2022, 5:59 a.m. UTC | #8
On Fri, 2022-04-29 at 07:20 -0700, Dave Hansen wrote:
> On 4/29/22 00:46, Kai Huang wrote:
> > On Thu, 2022-04-28 at 10:12 -0700, Dave Hansen wrote:
> > > This is also a good place to note the downsides of using
> > > alloc_contig_pages().
> > 
> > For instance:
> > 
> > 	The allocation may fail when memory usage is under pressure.
> 
> It's not really memory pressure, though.  The larger the allocation, the
> more likely it is to fail.  The more likely it is that the kernel can't
> free the memory, or that, if you need 1GB of contiguous memory,
> 999.996MB gets freed but there is one stubborn page left.
> 
> alloc_contig_pages() can and will fail.  The only mitigation which is
> guaranteed to avoid this is doing the allocation at boot.  But, you're
> not doing that to avoid wasting memory on every TDX system that doesn't
> use TDX.
> 
> A *good* way (although not foolproof) is to launch a TDX VM early in
> boot before memory gets fragmented or consumed.  You might even want to
> recommend this in the documentation.

"launch a TDX VM early in boot" I suppose you mean having some boot-time service
which launches a TDX VM before we get the login interface.  I'll put this in the
documentation.

How about adding the following in the changelog:

"
However using alloc_contig_pages() to allocate large physically contiguous
memory at runtime may fail.  The larger the allocation, the more likely it is to
fail.  Due to fragmentation, the kernel may need to move pages out of the
to-be-allocated contiguous memory range but it may fail to move even the last
stubborn page.  A good way (although not foolproof) is to launch a TD VM early
in boot to get PAMTs allocated before memory gets fragmented or consumed.
"

> 
> > > > +/*
> > > > + * Locate the NUMA node containing the start of the given TDMR's first
> > > > + * RAM entry.  The given TDMR may also cover memory in other NUMA nodes.
> > > > + */
> > > 
> > > Please add a sentence or two on the implications here of what this means
> > > when it happens.  Also, the joining of e820 regions seems like it might
> > > span NUMA nodes.  What prevents that code from just creating one large
> > > e820 area that leads to one large TDMR and horrible NUMA affinity for
> > > these structures?
> > 
> > How about adding:
> > 
> > 	When a TDMR is created, it stops spanning at a NUMA boundary.
> 
> I actually don't know what that means at all.  I was thinking of
> something like this.
> 
> /*
>  * Pick a NUMA node on which to allocate this TDMR's metadata.
>  *
>  * This is imprecise since TDMRs are 1GB aligned and NUMA nodes might
>  * not be.  If the TDMR covers more than one node, just use the _first_
>  * one.  This can lead to small areas of off-node metadata for some
>  * memory.
>  */

Thanks.

> 
> > > > +static int tdmr_get_nid(struct tdmr_info *tdmr)
> > > > +{
> > > > +	u64 start, end;
> > > > +	int i;
> > > > +
> > > > +	/* Find the first RAM entry covered by the TDMR */
> 
> There's something else missing in here.  Why not just do:
> 
> 	return phys_to_target_node(TDMR_START(tdmr));
> 
> This would explain it:
> 
> 	/*
> 	 * The beginning of the TDMR might not point to RAM.
> 	 * Find its first RAM address, from which its node can
> 	 * be found.
> 	 */

Will use this.  Thanks.

> 
> > > > +	e820_for_each_mem(i, start, end)
> > > > +		if (end > TDMR_START(tdmr))
> > > > +			break;
> > > 
> > > Brackets around the big loop, please.
> > 
> > OK.
> > 
> > > 
> > > > +	/*
> > > > +	 * One TDMR must cover at least one (or partial) RAM entry,
> > > > +	 * otherwise it is kernel bug.  WARN_ON() in this case.
> > > > +	 */
> > > > +	if (WARN_ON_ONCE((start >= end) || start >= TDMR_END(tdmr)))
> > > > +		return 0;
> 
> This really means "no RAM found for this TDMR", right?  Can we say that,
> please.

OK will add it.  How about:

	/*
	 * No RAM found for this TDMR.  WARN(), as this cannot
	 * happen unless there is a kernel bug.
	 */

> 
> 
> > > > +	/*
> > > > +	 * Allocate one chunk of physically contiguous memory for all
> > > > +	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
> > > > +	 * in overlapped TDMRs.
> > > > +	 */
> > > 
> > > Ahh, this explains it.  Considering that tdmr_get_pamt_sz() is really
> > > just two lines of code, I'd probably just drop the helper and open-code it
> > > here.  Then you only have one place to comment on it.
> > 
> > It has a loop and internally calls __tdmr_get_pamt_sz().  It doesn't look
> > like it fits if we open-code it here.
> > 
> > How about moving this comment to tdmr_get_pamt_sz()?
> 
> I thought about that.  But tdmr_get_pamt_sz() isn't itself doing any
> allocation so it doesn't make a whole lot of logical sense.  This is a
> place where a helper _can_ be removed.  Remove it, please.

OK.  Will remove the helper.  Thanks.
Dave Hansen May 2, 2022, 2:17 p.m. UTC | #9
On 5/1/22 22:59, Kai Huang wrote:
> On Fri, 2022-04-29 at 07:20 -0700, Dave Hansen wrote:
> How about adding the following in the changelog:
> 
> "
> However using alloc_contig_pages() to allocate large physically contiguous
> memory at runtime may fail.  The larger the allocation, the more likely it is to
> fail.  Due to fragmentation, the kernel may need to move pages out of the
> to-be-allocated contiguous memory range but it may fail to move even the last
> stubborn page.  A good way (although not foolproof) is to launch a TD VM early
> in boot to get PAMTs allocated before memory gets fragmented or consumed.
> "

Better, although it's getting a bit off topic for this changelog.

Just be short and sweet:

1. the allocation can fail
2. Launch a VM early to (badly) mitigate this
3. the only way to fix it is to add a boot option


>>>>> +	/*
>>>>> +	 * One TDMR must cover at least one (or partial) RAM entry,
>>>>> +	 * otherwise it is kernel bug.  WARN_ON() in this case.
>>>>> +	 */
>>>>> +	if (WARN_ON_ONCE((start >= end) || start >= TDMR_END(tdmr)))
>>>>> +		return 0;
>>
>> This really means "no RAM found for this TDMR", right?  Can we say that,
>> please.
> 
> OK will add it.  How about:
> 
> 	/*
> 	 * No RAM found for this TDMR.  WARN(), as this cannot
> 	 * happen unless there is a kernel bug.
> 	 */

The only useful information in that comment is the first sentence.  The
jibberish about WARN() is patently obvious from the next two lines of code.

*WHY* can't this happen?  How might it have actually happened?
Huang, Kai May 2, 2022, 9:55 p.m. UTC | #10
On Mon, 2022-05-02 at 07:17 -0700, Dave Hansen wrote:
> On 5/1/22 22:59, Kai Huang wrote:
> > On Fri, 2022-04-29 at 07:20 -0700, Dave Hansen wrote:
> > How about adding the following in the changelog:
> > 
> > "
> > However using alloc_contig_pages() to allocate large physically contiguous
> > memory at runtime may fail.  The larger the allocation, the more likely it is to
> > fail.  Due to fragmentation, the kernel may need to move pages out of the
> > to-be-allocated contiguous memory range but it may fail to move even the last
> > stubborn page.  A good way (although not foolproof) is to launch a TD VM early
> > in boot to get PAMTs allocated before memory gets fragmented or consumed.
> > "
> 
> Better, although it's getting a bit off topic for this changelog.
> 
> Just be short and sweet:
> 
> 1. the allocation can fail
> 2. Launch a VM early to (badly) mitigate this
> 3. the only way to fix it is to add a boot option
> 
OK. Thanks.

> 
> > > > > > +	/*
> > > > > > +	 * One TDMR must cover at least one (or partial) RAM entry,
> > > > > > +	 * otherwise it is kernel bug.  WARN_ON() in this case.
> > > > > > +	 */
> > > > > > +	if (WARN_ON_ONCE((start >= end) || start >= TDMR_END(tdmr)))
> > > > > > +		return 0;
> > > 
> > > This really means "no RAM found for this TDMR", right?  Can we say that,
> > > please.
> > 
> > OK will add it.  How about:
> > 
> > 	/*
> > 	 * No RAM found for this TDMR.  WARN(), as this cannot
> > 	 * happen unless there is a kernel bug.
> > 	 */
> 
> The only useful information in that comment is the first sentence.  The
> jibberish about WARN() is patently obvious from the next two lines of code.
> 
> *WHY* can't this happen?  How might it have actually happened?

When TDMRs are created, we have already made sure each TDMR covers at least
one (or part of one) RAM entry.

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7414625b938f..ff68d0829bd7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1973,6 +1973,7 @@  config INTEL_TDX_HOST
 	depends on CPU_SUP_INTEL
 	depends on X86_64
 	select NUMA_KEEP_MEMINFO if NUMA
+	depends on CONTIG_ALLOC
 	help
 	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
 	  host and certain physical attacks.  This option enables necessary TDX
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 82534e70df96..1b807dcbc101 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -21,6 +21,7 @@ 
 #include <asm/cpufeatures.h>
 #include <asm/virtext.h>
 #include <asm/e820/api.h>
+#include <asm/pgtable.h>
 #include <asm/tdx.h>
 #include "tdx.h"
 
@@ -66,6 +67,16 @@ 
 #define TDMR_START(_tdmr)	((_tdmr)->base)
 #define TDMR_END(_tdmr)		((_tdmr)->base + (_tdmr)->size)
 
+/* Page sizes supported by TDX */
+enum tdx_page_sz {
+	TDX_PG_4K = 0,
+	TDX_PG_2M,
+	TDX_PG_1G,
+	TDX_PG_MAX,
+};
+
+#define TDX_HPAGE_SHIFT	9
+
 /*
  * TDX module status during initialization
  */
@@ -959,6 +970,148 @@  static int create_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
 	return ret;
 }
 
+/* Calculate PAMT size given a TDMR and a page size */
+static unsigned long __tdmr_get_pamt_sz(struct tdmr_info *tdmr,
+					enum tdx_page_sz pgsz)
+{
+	unsigned long pamt_sz;
+
+	pamt_sz = (tdmr->size >> ((TDX_HPAGE_SHIFT * pgsz) + PAGE_SHIFT)) *
+		tdx_sysinfo.pamt_entry_size;
+	/* PAMT size must be 4K aligned */
+	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
+
+	return pamt_sz;
+}
+
+/* Calculate the size of all PAMTs for a TDMR */
+static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr)
+{
+	enum tdx_page_sz pgsz;
+	unsigned long pamt_sz;
+
+	pamt_sz = 0;
+	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++)
+		pamt_sz += __tdmr_get_pamt_sz(tdmr, pgsz);
+
+	return pamt_sz;
+}
+
+/*
+ * Locate the NUMA node containing the start of the given TDMR's first
+ * RAM entry.  The given TDMR may also cover memory in other NUMA nodes.
+ */
+static int tdmr_get_nid(struct tdmr_info *tdmr)
+{
+	u64 start, end;
+	int i;
+
+	/* Find the first RAM entry covered by the TDMR */
+	e820_for_each_mem(i, start, end)
+		if (end > TDMR_START(tdmr))
+			break;
+
+	/*
+	 * One TDMR must cover at least one (or partial) RAM entry,
+	 * otherwise it is kernel bug.  WARN_ON() in this case.
+	 */
+	if (WARN_ON_ONCE((start >= end) || start >= TDMR_END(tdmr)))
+		return 0;
+
+	/*
+	 * The first RAM entry may be partially covered by the previous
+	 * TDMR.  In this case, use TDMR's start to find the NUMA node.
+	 */
+	if (start < TDMR_START(tdmr))
+		start = TDMR_START(tdmr);
+
+	return phys_to_target_node(start);
+}
+
+static int tdmr_setup_pamt(struct tdmr_info *tdmr)
+{
+	unsigned long tdmr_pamt_base, pamt_base[TDX_PG_MAX];
+	unsigned long pamt_sz[TDX_PG_MAX];
+	unsigned long pamt_npages;
+	struct page *pamt;
+	enum tdx_page_sz pgsz;
+	int nid;
+
+	/*
+	 * Allocate one chunk of physically contiguous memory for all
+	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
+	 * in overlapped TDMRs.
+	 */
+	nid = tdmr_get_nid(tdmr);
+	pamt_npages = tdmr_get_pamt_sz(tdmr) >> PAGE_SHIFT;
+	pamt = alloc_contig_pages(pamt_npages, GFP_KERNEL, nid,
+			&node_online_map);
+	if (!pamt)
+		return -ENOMEM;
+
+	/* Calculate PAMT base and size for all supported page sizes. */
+	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
+	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
+		unsigned long sz = __tdmr_get_pamt_sz(tdmr, pgsz);
+
+		pamt_base[pgsz] = tdmr_pamt_base;
+		pamt_sz[pgsz] = sz;
+
+		tdmr_pamt_base += sz;
+	}
+
+	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
+	tdmr->pamt_4k_size = pamt_sz[TDX_PG_4K];
+	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
+	tdmr->pamt_2m_size = pamt_sz[TDX_PG_2M];
+	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
+	tdmr->pamt_1g_size = pamt_sz[TDX_PG_1G];
+
+	return 0;
+}
+
+static void tdmr_free_pamt(struct tdmr_info *tdmr)
+{
+	unsigned long pamt_pfn, pamt_sz;
+
+	pamt_pfn = tdmr->pamt_4k_base >> PAGE_SHIFT;
+	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+
+	/* Do nothing if PAMT hasn't been allocated for this TDMR */
+	if (!pamt_sz)
+		return;
+
+	if (WARN_ON(!pamt_pfn))
+		return;
+
+	free_contig_range(pamt_pfn, pamt_sz >> PAGE_SHIFT);
+}
+
+static void tdmrs_free_pamt_all(struct tdmr_info **tdmr_array, int tdmr_num)
+{
+	int i;
+
+	for (i = 0; i < tdmr_num; i++)
+		tdmr_free_pamt(tdmr_array[i]);
+}
+
+/* Allocate and set up PAMTs for all TDMRs */
+static int tdmrs_setup_pamt_all(struct tdmr_info **tdmr_array, int tdmr_num)
+{
+	int i, ret;
+
+	for (i = 0; i < tdmr_num; i++) {
+		ret = tdmr_setup_pamt(tdmr_array[i]);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+err:
+	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
+	return -ENOMEM;
+}
+
 static int construct_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
 {
 	int ret;
@@ -971,8 +1124,14 @@  static int construct_tdmrs(struct tdmr_info **tdmr_array, int *tdmr_num)
 	if (ret)
 		goto err;
 
+	ret = tdmrs_setup_pamt_all(tdmr_array, *tdmr_num);
+	if (ret)
+		goto err_free_tdmrs;
+
 	/* Return -EFAULT until constructing TDMRs is done */
 	ret = -EFAULT;
+	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
+err_free_tdmrs:
 	free_tdmrs(tdmr_array, *tdmr_num);
 err:
 	return ret;
@@ -1022,6 +1181,12 @@  static int init_tdx_module(void)
 	 * initialization are done.
 	 */
 	ret = -EFAULT;
+	/*
+	 * Free PAMTs allocated in construct_tdmrs() when TDX module
+	 * initialization fails.
+	 */
+	if (ret)
+		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
 out_free_tdmrs:
 	/*
 	 * TDMRs are only used during initializing TDX module.  Always