@@ -4,6 +4,7 @@ obj-y += entry/
obj-$(CONFIG_PERF_EVENTS) += events/
obj-$(CONFIG_KVM) += kvm/
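+# The TDX boot code must be built-in even when KVM itself is built as a
+# module; substituting m with y reuses the CONFIG_KVM tristate as a bool.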
+obj-$(subst m,y,$(CONFIG_KVM)) += kvm/boot/
# Xen paravirtualization support
obj-$(CONFIG_XEN) += xen/
new file mode 100644
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_KVM_BOOT_H
+#define _ASM_X86_KVM_BOOT_H
+
+#include <linux/cpumask.h>
+#include <linux/mutex.h>
+#include <linux/smp.h>
+#include <linux/types.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_KVM_INTEL_TDX
+int __init seam_load_module(void *module, unsigned long module_size,
+ void *sigstruct, unsigned long sigstruct_size,
+ void *seamldr, unsigned long seamldr_size);
+
+void __init tdx_seam_init(void);
+void tdx_init_cpu(struct cpuinfo_x86 *c);
+
+void tdx_seamcall_on_other_pkgs(smp_call_func_t fn, void *param,
+ struct mutex *lock);
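+/*
+ * Invoke @fn on the local CPU and, when there are multiple packages, on one
+ * CPU of each remote package, serialized by @lock.
+ */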
+#define tdx_seamcall_on_each_pkg(fn, param, lock) \
+do { \
+ fn(param); \
+ if (topology_max_packages() > 1) \
+ tdx_seamcall_on_other_pkgs(fn, param, lock); \
+} while (0)
+
+/*
+ * Return pointer to TDX system info (TDSYSINFO_STRUCT) if TDX has been
+ * successfully initialized, or NULL.
+ */
+struct tdsysinfo_struct;
+struct tdsysinfo_struct *tdx_get_sysinfo(void);
+
+/* TDX keyID allocation functions */
+extern int tdx_keyid_alloc(void);
+extern void tdx_keyid_free(int keyid);
+#else
+static inline void __init tdx_seam_init(void) {}
+static inline void tdx_init_cpu(struct cpuinfo_x86 *c) {}
+#endif
+
+#endif /* _ASM_X86_KVM_BOOT_H */
@@ -15,6 +15,7 @@
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
+#include <asm/kvm_boot.h>
#include <asm/intel-family.h>
#include <asm/microcode_intel.h>
#include <asm/hwcap2.h>
@@ -711,6 +712,9 @@ static void init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_TME))
detect_tme(c);
+ if (cpu_has(c, X86_FEATURE_TDX))
+ tdx_init_cpu(c);
+
init_intel_misc_features(c);
if (tsx_ctrl_state == TSX_CTRL_ENABLE)
@@ -38,6 +38,7 @@
#include <asm/io_apic.h>
#include <asm/kasan.h>
#include <asm/kaslr.h>
+#include <asm/kvm_boot.h>
#include <asm/mce.h>
#include <asm/mtrr.h>
#include <asm/realmode.h>
@@ -1200,6 +1201,8 @@ void __init setup_arch(char **cmdline_p)
prefill_possible_map();
+ tdx_seam_init();
+
init_cpu_to_node();
init_gi_nodes();
@@ -84,6 +84,14 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config KVM_INTEL_TDX
+ bool "Trusted Domain Extensions"
+ depends on KVM_INTEL && X86_64
+ select FW_LOADER
+	help
+	  Extends KVM on Intel processors to support Trust Domain Extensions.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
new file mode 100644
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -I$(srctree)/arch/x86/kvm
+
+obj-$(CONFIG_KVM_INTEL_TDX) += seam/seamldr.o seam/seamloader.o seam/tdx.o
new file mode 100644
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ASM helper to load Intel SEAM module.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Authors:
+ *     Kai Huang <kai.huang@intel.com>
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <uapi/asm/processor-flags.h>
+#include <asm/asm.h>
+#include <asm/errno.h>
+#include <asm/msr-index.h>
+#include <asm/segment.h>
+
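+/* Save/restore the 64-bit value of an MSR (the EDX:EAX pair) via the stack. */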
+.macro save_msr _msr
+ movl $(\_msr), %ecx
+ rdmsr
+ pushq %rax
+ pushq %rdx
+.endm
+
+.macro restore_msr _msr
+ popq %rdx
+ popq %rax
+ movl $(\_msr), %ecx
+ wrmsr
+.endm
+
+ .text
+ __INIT
+ .code64
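+/*
+ * C prototype: int launch_seamldr(unsigned long seamldr_pa,    - RDI
+ *                                 unsigned long seamldr_size,  - RSI
+ *                                 unsigned long params_pa)     - RDX
+ */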
+SYM_FUNC_START(launch_seamldr)
+
+ pushq %rbp
+ movq %rsp, %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+
+ /* Save DR7, SEAMLDR sets it to 0x400. */
+ movq %dr7, %rax
+ pushq %rax
+
+ /*
+ * SEAMLDR restores GDTR and CS before ExitAC, DS/ES/SS don't need to
+ * be manually preserved as this is 64-bit mode, and FS/GS and IDTR are
+ * not modified by EnterACCS or SEAMLDR.
+ */
+
+ /* EnterACCS and SEAMLDR modify CR0 and CR4. */
+ movq %cr0, %rax
+ pushq %rax
+ movq %cr4, %rax
+ pushq %rax
+
+ /* Enable CR4.SMXE for GETSEC */
+ orq $X86_CR4_SMXE, %rax
+ movq %rax, %cr4
+
+ /*
+ * Load R8-R11 immediately, they won't be clobbered, unlike RDX.
+ *
+ * - R8: SEAMLDR_PARAMS physical address
+	 * - R9:  GDT base to be set up by SEAMLDR when returning to the kernel
+ * - R10: RIP of resume point
+ * - R11: CR3 when returning to kernel
+ */
+ movq %rdx, %r8
+ sgdt kernel_gdt64(%rip)
+ movq kernel_gdt64_base(%rip), %r9
+ leaq .Lseamldr_resume(%rip), %r10
+ movq %cr3, %r11
+
+ /* Save MSRs that are modified by EnterACCS and/or SEAMLDR */
+ save_msr MSR_EFER
+ save_msr MSR_IA32_CR_PAT
+ save_msr MSR_IA32_MISC_ENABLE
+
+ /*
+ * MSRs that are clobbered by SEAMLDR but are not enabled during early
+ * boot and so don't need to be saved/restored.
+ *
+ * save_msr MSR_IA32_DEBUGCTLMSR
+ * save_msr MSR_CORE_PERF_GLOBAL_CTRL
+ * save_msr MSR_IA32_PEBS_ENABLE
+ * save_msr MSR_IA32_RTIT_CTL
+ * save_msr MSR_IA32_LBR_CTRL
+ */
+
+	/* As the last step, save RSP before invoking GETSEC[ENTERACCS]. */
+ movq %rsp, saved_rsp(%rip)
+
+ /*
+	 * Load the remaining params for EnterACCS.
+ *
+ * - EBX: SEAMLDR ACM physical address
+ * - ECX: SEAMLDR ACM size
+ * - EAX: 2
+ */
+ movl %edi, %ebx
+ movl %esi, %ecx
+
+ /* Invoke GETSEC[ENTERACCS] */
+ movl $2, %eax
+.Lseamldr_enteraccs:
+ getsec
+
+.Lseamldr_resume:
+ /*
+ * SEAMLDR restores CRs and GDT. Segment registers are flat, but
+ * don't hold kernel selectors. Reload the data segs now.
+ */
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+
+ /*
+ * Restore stack from RIP relative storage, and then restore everything
+ * else from the stack.
+ */
+ movq saved_rsp(%rip), %rsp
+
+ /*
+ * Restore CPU status, in reverse order of saving. Firstly, restore
+ * MSRs.
+ */
+ restore_msr MSR_IA32_MISC_ENABLE
+ restore_msr MSR_IA32_CR_PAT
+ restore_msr MSR_EFER
+
+ popq %rax
+ movq %rax, %cr4
+ popq %rax
+ movq %rax, %cr0
+
+ popq %rax
+ movq %rax, %dr7
+
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+
+ /* Far return to load the kernel's CS. */
+ popq %rax
+ pushq $__KERNEL_CS
+ pushq %rax
+
+ movq %r9, %rax
+ lretq
+
+.pushsection .fixup, "ax"
+ /*
+ * ENTERACCS faulted, return -EFAULT. Restore CR4 (to clear SMXE) and
+ * GPRs (to make objtool happy, only RBP/RSP are actually modified).
+ */
+1: movq 8 * 6(%rsp), %rax
+ movq %rax, %cr4
+	addq	$(8 * 9), %rsp
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+ movq $-EFAULT, %rax
+ ret
+.popsection
+ _ASM_EXTABLE(.Lseamldr_enteraccs, 1b)
+
+SYM_FUNC_END(launch_seamldr)
+
+ __INITDATA
+ .balign 8
+kernel_gdt64:
+ .word 0
+kernel_gdt64_base:
+ .quad 0
+saved_rsp:
+ .quad 0
new file mode 100644
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "seam: " fmt
+
+#include <linux/types.h>
+#include <linux/bits.h>
+#include <linux/memblock.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/delay.h>
+#include <asm/kvm_boot.h>
+#include <asm/msr-index.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+
+#define MTRRCAP_SEAMRR BIT(15)
+
+#define SEAMLDR_MAX_NR_MODULE_PAGES 496
+
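+/*
+ * Input to the SEAMLDR ACM. The layout adds up to exactly one 4K page:
+ * 4 + 4 + 8 + 104 + 8 + 496 * 8 = 4096 bytes.
+ */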
+struct seamldr_params {
+ u32 version;
+ u32 scenario;
+ u64 sigstruct_pa;
+ u8 reserved[104];
+ u64 module_pages;
+ u64 module_pa_list[SEAMLDR_MAX_NR_MODULE_PAGES];
+} __packed __aligned(PAGE_SIZE);
+
+/* The ACM and input params need to be below 4G. */
+static phys_addr_t __init seam_alloc_lowmem(phys_addr_t size)
+{
+ return memblock_phys_alloc_range(size, PAGE_SIZE, 0, BIT_ULL(32));
+}
+
+static bool __init is_seamrr_enabled(void)
+{
+ u64 mtrrcap, seamrr_base, seamrr_mask;
+
+ if (!boot_cpu_has(X86_FEATURE_MTRR) ||
+ rdmsrl_safe(MSR_MTRRcap, &mtrrcap) || !(mtrrcap & MTRRCAP_SEAMRR))
+		return false;
+
+ if (rdmsrl_safe(MSR_IA32_SEAMRR_PHYS_BASE, &seamrr_base) ||
+ !(seamrr_base & MSR_IA32_SEAMRR_PHYS_BASE_CONFIGURED)) {
+ pr_info("SEAMRR base is not configured by BIOS\n");
+		return false;
+ }
+
+ if (rdmsrl_safe(MSR_IA32_SEAMRR_PHYS_MASK, &seamrr_mask) ||
+ !(seamrr_mask & MSR_IA32_SEAMRR_PHYS_MASK_ENABLED)) {
+ pr_info("SEAMRR is not enabled by BIOS\n");
+		return false;
+ }
+
+	return true;
+}
+
+extern int __init launch_seamldr(unsigned long seamldr_pa,
+ unsigned long seamldr_size,
+ unsigned long params_pa);
+
+int __init seam_load_module(void *module, unsigned long module_size,
+ void *sigstruct, unsigned long sigstruct_size,
+ void *seamldr, unsigned long seamldr_size)
+{
+ phys_addr_t module_pa, seamldr_pa, params_pa;
+ struct seamldr_params *params;
+ int enteraccs_attempts = 10;
+ u32 icr_busy;
+ int ret;
+ u64 i;
+
+ if (!is_seamrr_enabled())
+ return -ENOTSUPP;
+
+ /* SEAM module must be 4K aligned, and less than 496 pages. */
+ if (!module_size || !IS_ALIGNED(module_size, PAGE_SIZE) ||
+ module_size > SEAMLDR_MAX_NR_MODULE_PAGES * PAGE_SIZE) {
+ pr_err("Invalid SEAM module size 0x%lx\n", module_size);
+ return -EINVAL;
+ }
+ /* SEAM signature structure must be 0x200 DWORDS, which is 2048 bytes */
+ if (sigstruct_size != 2048) {
+ pr_err("Invalid SEAM signature structure size 0x%lx\n",
+ sigstruct_size);
+ return -EINVAL;
+ }
+ if (!seamldr_size) {
+ pr_err("Invalid SEAMLDR ACM size\n");
+ return -EINVAL;
+ }
+
+ ret = -ENOMEM;
+ /* SEAMLDR requires the SEAM module to be 4k aligned. */
+ module_pa = __pa(module);
+ if (!IS_ALIGNED(module_pa, 4096)) {
+ module_pa = memblock_phys_alloc(module_size, PAGE_SIZE);
+ if (!module_pa) {
+ pr_err("Unable to allocate memory to copy SEAM module\n");
+ goto out;
+ }
+ memcpy(__va(module_pa), module, module_size);
+ }
+
+ /* GETSEC[EnterACCS] requires the ACM to be 4k aligned and below 4G. */
+ seamldr_pa = __pa(seamldr);
+ if (seamldr_pa >= BIT_ULL(32) || !IS_ALIGNED(seamldr_pa, 4096)) {
+ seamldr_pa = seam_alloc_lowmem(seamldr_size);
+ if (!seamldr_pa)
+ goto free_seam_module;
+ memcpy(__va(seamldr_pa), seamldr, seamldr_size);
+ }
+
+ /*
+ * Allocate and initialize the SEAMLDR params. Pages are passed in as
+ * a list of physical addresses.
+ */
+ params_pa = seam_alloc_lowmem(PAGE_SIZE);
+ if (!params_pa) {
+ pr_err("Unable to allocate memory for SEAMLDR_PARAMS\n");
+ goto free_seamldr;
+ }
+
+ ret = -EIO;
+ /* Ensure APs are in WFS. */
+ apic_icr_write(APIC_DEST_ALLBUT | APIC_INT_LEVELTRIG | APIC_INT_ASSERT |
+ APIC_DM_INIT, 0);
+ icr_busy = safe_apic_wait_icr_idle();
+ if (WARN_ON(icr_busy))
+ goto free_seamldr;
+
+ apic_icr_write(APIC_DEST_ALLBUT | APIC_INT_LEVELTRIG | APIC_DM_INIT, 0);
+ icr_busy = safe_apic_wait_icr_idle();
+ if (WARN_ON(icr_busy))
+ goto free_seamldr;
+ mb();
+
+ params = __va(params_pa);
+ memset(params, 0, PAGE_SIZE);
+ params->sigstruct_pa = __pa(sigstruct);
+ params->module_pages = PFN_UP(module_size);
+ for (i = 0; i < params->module_pages; i++)
+ params->module_pa_list[i] = module_pa + i * PAGE_SIZE;
+
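+	/*
+	 * launch_seamldr() returns -EFAULT if ENTERACCS faulted; retry a few
+	 * times with a small delay, on the assumption that the fault may be
+	 * due to a transient condition.
+	 */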
+retry_enteraccs:
+ ret = launch_seamldr(seamldr_pa, seamldr_size, params_pa);
+ if (ret == -EFAULT && !WARN_ON(!enteraccs_attempts--)) {
+ udelay(1 * USEC_PER_MSEC);
+ goto retry_enteraccs;
+ }
+ pr_info("Launch SEAMLDR returned %d\n", ret);
+
+ memblock_free_early(params_pa, PAGE_SIZE);
+free_seamldr:
+ if (seamldr_pa != __pa(seamldr))
+ memblock_free_early(seamldr_pa, seamldr_size);
+free_seam_module:
+ if (module_pa != __pa(module))
+ memblock_free_early(module_pa, module_size);
+out:
+ return ret;
+}
new file mode 100644
@@ -0,0 +1,1131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/earlycpio.h>
+#include <linux/fs.h>
+#include <linux/initrd.h>
+#include <linux/percpu.h>
+#include <linux/memblock.h>
+#include <linux/idr.h>
+#include <linux/sort.h>
+
+#include <asm/cpu.h>
+#include <asm/kvm_boot.h>
+#include <asm/virtext.h>
+#include <asm/tlbflush.h>
+#include <asm/e820/api.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tdx: " fmt
+
+/* Instruct tdx_ops.h to do boot-time friendly SEAMCALL exception handling. */
+#define INTEL_TDX_BOOT_TIME_SEAMCALL 1
+
+#include "vmx/tdx_arch.h"
+#include "vmx/tdx_ops.h"
+#include "vmx/tdx_errno.h"
+
+#include "vmx/vmcs.h"
+
+static DEFINE_PER_CPU(unsigned long, tdx_vmxon_vmcs);
+static atomic_t tdx_init_cpu_errors;
+
+/*
+ * TODO: better to have a kernel boot parameter to let the admin control
+ * whether to enable TDX with sysprof or not.
+ *
+ * How else should tdx_sysprof be decided?
+ */
+static bool tdx_sysprof;
+
+/* KeyID range reserved to TDX by BIOS */
+static u32 tdx_keyids_start;
+static u32 tdx_nr_keyids;
+
+/* TDX keyID pool */
+static DEFINE_IDA(tdx_keyid_pool);
+
+static int *tdx_package_masters __ro_after_init;
+
+/*
+ * TDX system information returned by TDSYSINFO.
+ */
+static struct tdsysinfo_struct tdx_tdsysinfo;
+
+/*
+ * CMR info array returned by TDSYSINFO.
+ *
+ * TDSYSINFO doesn't return a specific error code indicating that the CMR
+ * info array passed in was too short, so just reserve enough space for
+ * the maximum number of CMRs.
+ */
+static struct cmr_info tdx_cmrs[TDX1_MAX_NR_CMRS] __aligned(512);
+static int tdx_nr_cmrs;
+
+/*
+ * TDMR info array used as input for TDSYSCONFIG.
+ */
+static struct tdmr_info tdx_tdmrs[TDX1_MAX_NR_TDMRS] __initdata;
+static int tdx_nr_tdmrs __initdata;
+static atomic_t tdx_next_tdmr_index;
+static atomic_t tdx_nr_initialized_tdmrs;
+
+/* TDMRs must be 1G aligned. */
+#define TDMR_ALIGNMENT BIT_ULL(30)
+#define TDMR_PFN_ALIGNMENT (TDMR_ALIGNMENT >> PAGE_SHIFT)
+
+/*
+ * TDSYSCONFIG takes an array of pointers to TDMR infos. It's just big enough
+ * that allocating it on the stack is undesirable.
+ */
+static u64 tdx_tdmr_addrs[TDX1_MAX_NR_TDMRS] __aligned(512) __initdata;
+
+struct pamt_info {
+ u64 pamt_base;
+ u64 pamt_size;
+};
+
+/*
+ * PAMT info for each TDMR, used to free the PAMTs if TDX is disabled for
+ * whatever reason.
+ */
+static struct pamt_info tdx_pamts[TDX1_MAX_NR_TDMRS] __initdata;
+
+static int __init set_tdmr_reserved_area(struct tdmr_info *tdmr, int *p_idx,
+ u64 offset, u64 size)
+{
+ int idx = *p_idx;
+
+ if (idx >= tdx_tdsysinfo.max_reserved_per_tdmr)
+ return -EINVAL;
+
+ /* offset & size must be 4K aligned */
+ if (offset & ~PAGE_MASK || size & ~PAGE_MASK)
+ return -EINVAL;
+
+ tdmr->reserved_areas[idx].offset = offset;
+ tdmr->reserved_areas[idx].size = size;
+
+ *p_idx = idx + 1;
+ return 0;
+}
+
+/*
+ * Construct TDMR reserved areas.
+ *
+ * Two types of address range are put into reserved areas: 1) the PAMT
+ * range, since PAMT cannot overlap with a TDMR's non-reserved range; 2) any
+ * CMR hole within the TDMR's range, since a TDMR's non-reserved range must
+ * be in a CMR.
+ *
+ * Note: memory holes created by the kernel (ranges that are reserved by the
+ * kernel and never freed to the page allocator, i.e. holes from the page
+ * allocator's view but not CMR holes) are not put into reserved areas, for
+ * simplicity of implementation. Additionally, a TDX1 TDMR can have at most
+ * 16 reserved areas, so with many such holes there wouldn't be enough
+ * reserved areas to hold them. This is OK, since the kernel page allocator
+ * will never allocate pages from those areas (they are invalid); the PAMT
+ * may internally mark them as 'normal' pages, which is harmless.
+ *
+ * Returns -EINVAL if the number of reserved areas exceeds the TDX1 limit.
+ */
+static int __init __construct_tdmr_reserved_areas(struct tdmr_info *tdmr,
+ u64 pamt_base, u64 pamt_size)
+{
+ u64 tdmr_start, tdmr_end, offset, size;
+ struct cmr_info *cmr, *next_cmr;
+ bool pamt_done = false;
+ int i, idx, ret;
+
+ memset(tdmr->reserved_areas, 0, sizeof(tdmr->reserved_areas));
+
+ /* Save some typing later */
+ tdmr_start = tdmr->base;
+ tdmr_end = tdmr->base + tdmr->size;
+
+ if (WARN_ON(!tdx_nr_cmrs))
+ return -EINVAL;
+ /*
+	 * Find the first CMR whose end is greater than tdmr_start.
+ */
+ cmr = &tdx_cmrs[0];
+ for (i = 0; i < tdx_nr_cmrs; i++) {
+ cmr = &tdx_cmrs[i];
+ if ((cmr->base + cmr->size) > tdmr_start)
+ break;
+ }
+
+	/* No such CMR found? Something is wrong. */
+ if (i == tdx_nr_cmrs)
+ return -EINVAL;
+
+ /*
+ * If CMR base is within TDMR range, [tdmr_start, cmr->base) needs to be
+ * in reserved area.
+ */
+ idx = 0;
+ if (cmr->base > tdmr_start) {
+ offset = 0;
+ size = cmr->base - tdmr_start;
+
+ ret = set_tdmr_reserved_area(tdmr, &idx, offset, size);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Check whether there's any hole between CMRs within TDMR range.
+ * If there is any, it needs to be in reserved area.
+ */
+ for (++i; i < tdx_nr_cmrs; i++) {
+ next_cmr = &tdx_cmrs[i];
+
+ /*
+ * If next CMR is beyond TDMR range, there's no CMR hole within
+ * TDMR range, and we only need to insert PAMT into reserved
+ * area, thus we are done here.
+ */
+ if (next_cmr->base >= tdmr_end)
+ break;
+
+ /* Otherwise need to have CMR hole in reserved area */
+ if (cmr->base + cmr->size < next_cmr->base) {
+ offset = cmr->base + cmr->size - tdmr_start;
+ size = next_cmr->base - (cmr->base + cmr->size);
+
+ /*
+			 * Reserved areas need to be in ascending physical
+			 * address order, so handle the PAMT range before
+			 * putting any CMR hole into a reserved area.
+ */
+ if (pamt_base < tdmr_start + offset) {
+ /*
+ * PAMT won't overlap with any CMR hole
+ * otherwise there's bug -- see comments below.
+ */
+ if (WARN_ON((pamt_base + pamt_size) >
+ (tdmr_start + offset)))
+ return -EINVAL;
+
+ ret = set_tdmr_reserved_area(tdmr, &idx,
+ pamt_base - tdmr_start,
+ pamt_size);
+ if (ret)
+ return ret;
+
+ pamt_done = true;
+ }
+
+ /* Insert CMR hole into reserved area */
+ ret = set_tdmr_reserved_area(tdmr, &idx, offset, size);
+ if (ret)
+ return ret;
+ }
+
+ cmr = next_cmr;
+ }
+
+ if (!pamt_done) {
+ /*
+ * PAMT won't overlap with CMR range, otherwise there's bug
+ * -- we have guaranteed this by checking all CMRs have
+ * covered all memory in e820.
+ */
+ if (WARN_ON((pamt_base + pamt_size) > (cmr->base + cmr->size)))
+ return -EINVAL;
+
+ ret = set_tdmr_reserved_area(tdmr, &idx,
+ pamt_base - tdmr_start, pamt_size);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If CMR end is in TDMR range, [cmr->end, tdmr_end) needs to be in
+ * reserved area.
+ */
+ if (cmr->base + cmr->size < tdmr_end) {
+ offset = cmr->base + cmr->size - tdmr_start;
+ size = tdmr_end - (cmr->base + cmr->size);
+
+ ret = set_tdmr_reserved_area(tdmr, &idx, offset, size);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __init __construct_tdmr_node(int tdmr_idx,
+ unsigned long tdmr_start_pfn,
+ unsigned long tdmr_end_pfn)
+{
+ u64 tdmr_size, pamt_1g_size, pamt_2m_size, pamt_4k_size, pamt_size;
+ struct pamt_info *pamt = &tdx_pamts[tdmr_idx];
+ struct tdmr_info *tdmr = &tdx_tdmrs[tdmr_idx];
+ u64 pamt_phys;
+ int ret;
+
+ tdmr_size = (tdmr_end_pfn - tdmr_start_pfn) << PAGE_SHIFT;
+
+ /* sanity check */
+ if (!tdmr_size || !IS_ALIGNED(tdmr_size, TDMR_ALIGNMENT))
+ return -EINVAL;
+
+ /* 1 entry to cover 1G */
+ pamt_1g_size = (tdmr_size >> 30) * tdx_tdsysinfo.pamt_entry_size;
+ /* 1 entry to cover 2M */
+ pamt_2m_size = (tdmr_size >> 21) * tdx_tdsysinfo.pamt_entry_size;
+ /* 1 entry to cover 4K */
+ pamt_4k_size = (tdmr_size >> 12) * tdx_tdsysinfo.pamt_entry_size;
+
+ pamt_size = ALIGN(pamt_1g_size, PAGE_SIZE) +
+ ALIGN(pamt_2m_size, PAGE_SIZE) +
+ ALIGN(pamt_4k_size, PAGE_SIZE);
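+	/*
+	 * E.g., assuming a 16-byte PAMT entry: a 64GB TDMR needs
+	 * 64 * 16B = 1KB of PAMT at 1G granularity, 32768 * 16B = 512KB at
+	 * 2M granularity and 16M * 16B = 256MB at 4K granularity, i.e. the
+	 * PAMT overhead is roughly 0.4% of the TDMR's size.
+	 */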
+
+ pamt_phys = memblock_phys_alloc_range(pamt_size, PAGE_SIZE,
+ tdmr_start_pfn << PAGE_SHIFT,
+ tdmr_end_pfn << PAGE_SHIFT);
+ if (!pamt_phys)
+ return -ENOMEM;
+
+ tdmr->base = tdmr_start_pfn << PAGE_SHIFT;
+ tdmr->size = tdmr_size;
+
+ /* PAMT for 1G at first */
+ tdmr->pamt_1g_base = pamt_phys;
+ tdmr->pamt_1g_size = ALIGN(pamt_1g_size, PAGE_SIZE);
+ /* PAMT for 2M right after PAMT for 1G */
+ tdmr->pamt_2m_base = tdmr->pamt_1g_base + tdmr->pamt_1g_size;
+ tdmr->pamt_2m_size = ALIGN(pamt_2m_size, PAGE_SIZE);
+ /* PAMT for 4K comes after PAMT for 2M */
+ tdmr->pamt_4k_base = tdmr->pamt_2m_base + tdmr->pamt_2m_size;
+ tdmr->pamt_4k_size = ALIGN(pamt_4k_size, PAGE_SIZE);
+
+ /* Construct TDMR's reserved areas */
+ ret = __construct_tdmr_reserved_areas(tdmr, tdmr->pamt_1g_base,
+ pamt_size);
+ if (ret) {
+ memblock_free(pamt_phys, pamt_size);
+ return ret;
+ }
+
+ /* Record PAMT info for this TDMR */
+ pamt->pamt_base = pamt_phys;
+ pamt->pamt_size = pamt_size;
+
+ return 0;
+}
+
+/*
+ * Convert a node's memory into as few TDMRs as possible.
+ *
+ * @tdmr_start_pfn and @tdmr_end_pfn are not the node's real memory region,
+ * but have already been 1G aligned by the caller.
+ */
+static int __init construct_tdmr_node(int *p_tdmr_idx,
+ unsigned long tdmr_start_pfn,
+ unsigned long tdmr_end_pfn)
+{
+ u64 start_pfn, end_pfn, mid_pfn;
+ int ret = 0, idx = *p_tdmr_idx;
+
+ start_pfn = tdmr_start_pfn;
+ end_pfn = tdmr_end_pfn;
+
+ while (start_pfn < tdmr_end_pfn) {
+ /* Cast to u32, else compiler will sign extend and complain. */
+ if (idx >= (u32)tdx_tdsysinfo.max_tdmrs) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = __construct_tdmr_node(idx, start_pfn, end_pfn);
+
+ /*
+ * Try again with smaller TDMR if the failure was due to unable
+ * to allocate PAMT.
+ */
+ if (ret == -ENOMEM) {
+ mid_pfn = start_pfn + (end_pfn - start_pfn) / 2;
+ mid_pfn = ALIGN_DOWN(mid_pfn, TDMR_PFN_ALIGNMENT);
+ mid_pfn = max(mid_pfn, start_pfn + TDMR_PFN_ALIGNMENT);
+ if (mid_pfn == end_pfn)
+ break;
+ end_pfn = mid_pfn;
+ continue;
+ } else if (ret) {
+ break;
+ }
+
+		/* Done with one TDMR; continue if there's memory remaining. */
+ start_pfn = end_pfn;
+ end_pfn = tdmr_end_pfn;
+ idx++;
+ }
+
+	/* Report the next TDMR entry to work on back to the caller. */
+ *p_tdmr_idx = idx;
+ return ret;
+}
+
+/*
+ * Construct TDMRs based on system memory info and CMR info. To avoid
+ * teaching the core-mm page allocator TDMR-specific allocation logic,
+ * simply convert all memory to TDMRs. This wastes some memory on PAMTs,
+ * but TDX is primarily a virtualization feature, so the vast majority of
+ * memory is expected to be used as TD guest memory anyway, and the PAMT
+ * overhead is acceptable.
+ *
+ * There are some restrictions of TDMR/PAMT/CMR:
+ *
+ * - A TDMR's base and size must be 1G aligned.
+ * - A TDMR's size must be a multiple of 1G.
+ * - TDMRs cannot overlap with each other.
+ * - PAMTs cannot overlap with each other.
+ * - Each TDMR can have reserved areas (up to 16 in TDX1).
+ * - TDMR reserved areas must be in physical address ascending order.
+ * - TDMR non-reserved area must be in CMR.
+ * - TDMR reserved area doesn't have to be in CMR.
+ * - TDMR non-reserved area cannot overlap with PAMT.
+ * - PAMT may reside within TDMR reserved area.
+ * - PAMT must be in CMR.
+ */
+static int __init __construct_tdmrs(void)
+{
+ u64 tdmr_start_pfn, tdmr_end_pfn, tdmr_start_pfn_next, inc_pfn;
+ unsigned long start_pfn, end_pfn;
+ int last_nid, nid, i, idx, ret;
+
+ /* Sanity check on tdx_tdsysinfo... */
+ if (!tdx_tdsysinfo.max_tdmrs || !tdx_tdsysinfo.max_reserved_per_tdmr ||
+ !tdx_tdsysinfo.pamt_entry_size) {
+ pr_err("Invalid TDSYSINFO_STRUCT reported by TDSYSINFO.\n");
+ return -ENOTSUPP;
+ }
+
+ idx = 0;
+ tdmr_start_pfn = 0;
+ tdmr_end_pfn = 0;
+ last_nid = MAX_NUMNODES;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ if (last_nid == MAX_NUMNODES) {
+ /* First memory range */
+ last_nid = nid;
+ tdmr_start_pfn = ALIGN_DOWN(start_pfn, TDMR_PFN_ALIGNMENT);
+ WARN_ON(tdmr_start_pfn != 0);
+ } else if (nid == last_nid) {
+ /*
+ * This memory range is in the same node as previous
+ * one, update tdmr_end_pfn.
+ */
+ tdmr_end_pfn = ALIGN(end_pfn, TDMR_PFN_ALIGNMENT);
+ } else if (ALIGN_DOWN(start_pfn, TDMR_PFN_ALIGNMENT) >= tdmr_end_pfn) {
+			/*
+			 * This memory range is in the next node. If the new
+			 * TDMR start pfn is at or above the previous TDMR's
+			 * end pfn, the previous node's memory is ready to be
+			 * converted to TDMRs.
+			 */
+ ret = construct_tdmr_node(&idx, tdmr_start_pfn,
+ tdmr_end_pfn);
+ if (ret)
+ return ret;
+ tdmr_start_pfn = ALIGN(start_pfn, TDMR_PFN_ALIGNMENT);
+ tdmr_end_pfn = ALIGN(end_pfn, TDMR_PFN_ALIGNMENT);
+ last_nid = nid;
+ } else {
+ /*
+			 * This memory range is in the next node, and the
+			 * boundary between the nodes falls inside a 1G chunk.
+			 * In this case, put the beginning of the second node
+			 * into the TDMR that covers the previous node. Not
+			 * ideal, but this case is very unlikely, so it should
+			 * be OK for now.
+ */
+ tdmr_end_pfn = ALIGN(start_pfn, TDMR_PFN_ALIGNMENT);
+
+ ret = construct_tdmr_node(&idx, tdmr_start_pfn,
+ tdmr_end_pfn);
+ if (ret)
+ return ret;
+
+ tdmr_start_pfn = tdmr_end_pfn;
+ last_nid = nid;
+ }
+ }
+
+	/*
+	 * Spread the remaining memory across multiple TDMRs. Smaller TDMRs
+	 * need smaller PAMTs, which are more likely to be allocatable from
+	 * within the TDMR's own range.
+	 */
+ inc_pfn = (tdmr_end_pfn - tdmr_start_pfn) /
+ (tdx_tdsysinfo.max_tdmrs - idx);
+ inc_pfn = ALIGN(inc_pfn, TDMR_PFN_ALIGNMENT);
+
+ tdmr_start_pfn_next = tdmr_end_pfn;
+ while (tdmr_start_pfn < tdmr_start_pfn_next) {
+ if (idx == tdx_tdsysinfo.max_tdmrs - 1)
+ tdmr_end_pfn = tdmr_start_pfn_next;
+ else
+ tdmr_end_pfn = tdmr_start_pfn + inc_pfn;
+retry:
+ tdmr_end_pfn = min(tdmr_end_pfn, tdmr_start_pfn_next);
+
+ ret = construct_tdmr_node(&idx, tdmr_start_pfn, tdmr_end_pfn);
+ if (ret == -ENOMEM) {
+ if (tdmr_end_pfn == tdmr_start_pfn_next)
+ return -ENOMEM;
+ tdmr_end_pfn += inc_pfn;
+ goto retry;
+ }
+ if (ret)
+ return ret;
+ tdmr_start_pfn = tdmr_end_pfn;
+ }
+
+ tdx_nr_tdmrs = idx;
+
+ return 0;
+}
+
+static bool __init e820_type_cmr_ram(enum e820_type type)
+{
+ /*
+	 * CMRs need to at least cover the e820 memory regions that will later
+	 * be freed to the kernel memory allocator, otherwise the kernel may
+	 * allocate non-TDMR pages, e.g. when KVM allocates memory.
+	 *
+	 * Note memblock also treats E820_TYPE_RESERVED_KERN as memory, so it
+	 * needs to be covered as well.
+ *
+ * FIXME:
+ *
+ * Need to cover other types which are actually RAM, i.e:
+ *
+ * E820_TYPE_ACPI,
+ * E820_TYPE_NVS
+ */
+ return (type == E820_TYPE_RAM || type == E820_TYPE_RESERVED_KERN);
+}
+
+static bool __init in_cmr_range(u64 addr, u64 size)
+{
+ struct cmr_info *cmr;
+ u64 cmr_end, end;
+ int i;
+
+ end = addr + size;
+
+	/* Treat an overflowing (bad) range as covered, i.e. ignore it. */
+	if (end < addr)
+		return true;
+
+ for (i = 0; i < tdx_nr_cmrs; i++) {
+ cmr = &tdx_cmrs[i];
+ cmr_end = cmr->base + cmr->size;
+
+ /* Found one CMR which covers the range [addr, addr + size) */
+ if (cmr->base <= addr && cmr_end >= end)
+			return true;
+ }
+
+	return false;
+}
+
+static int __init sanitize_cmrs(void)
+{
+ struct e820_entry *entry;
+ bool observed_empty;
+ int i, j;
+
+ if (!tdx_nr_cmrs)
+ return -EIO;
+
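+	/*
+	 * Compact the CMR array: drop empty entries and merge physically
+	 * contiguous CMRs into a single entry.
+	 */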
+ for (i = 0, j = -1, observed_empty = false; i < tdx_nr_cmrs; i++) {
+ if (!tdx_cmrs[i].size) {
+ observed_empty = true;
+ continue;
+ }
+ /* Valid entry after empty entry isn't allowed, per SEAM. */
+ if (observed_empty)
+ return -EIO;
+
+ /* The previous CMR must reside fully below this CMR. */
+ if (j >= 0 &&
+ (tdx_cmrs[j].base + tdx_cmrs[j].size) > tdx_cmrs[i].base)
+ return -EIO;
+
+ if (j < 0 ||
+ (tdx_cmrs[j].base + tdx_cmrs[j].size) != tdx_cmrs[i].base) {
+ j++;
+ if (i != j) {
+ tdx_cmrs[j].base = tdx_cmrs[i].base;
+ tdx_cmrs[j].size = tdx_cmrs[i].size;
+ }
+ } else {
+ tdx_cmrs[j].size += tdx_cmrs[i].size;
+ }
+ }
+ tdx_nr_cmrs = j + 1;
+ if (!tdx_nr_cmrs)
+ return -EINVAL;
+
+ /*
+	 * Sanity check that the CMRs cover all memory in e820. CMRs must
+	 * cover all memory that will be freed to the page allocator,
+	 * otherwise alloc_pages() may return non-TDMR pages, e.g. when KVM
+	 * allocates memory for a VM. That cannot be allowed, so disable TDX
+	 * if the CMRs don't cover everything.
+ *
+ * FIXME:
+ *
+ * Alternatively we could just check against memblocks? Only memblocks
+	 * are freed to the page allocator, so checking that CMRs cover all
+	 * memblocks appears to be sufficient. But CMRs are generated by the
+	 * BIOS and thus should cover e820 anyway.
+ */
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ entry = &e820_table->entries[i];
+
+ if (!e820_type_cmr_ram(entry->type))
+ continue;
+
+ if (!in_cmr_range(entry->addr, entry->size))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init construct_tdmrs(void)
+{
+ struct pamt_info *pamt;
+ int ret, i;
+
+ ret = sanitize_cmrs();
+ if (ret)
+ return ret;
+
+ ret = __construct_tdmrs();
+ if (ret)
+ goto free_pamts;
+ return 0;
+
+free_pamts:
+ for (i = 0; i < ARRAY_SIZE(tdx_pamts); i++) {
+ pamt = &tdx_pamts[i];
+ if (pamt->pamt_base && pamt->pamt_size) {
+ if (WARN_ON(!IS_ALIGNED(pamt->pamt_base, PAGE_SIZE) ||
+ !IS_ALIGNED(pamt->pamt_size, PAGE_SIZE)))
+ continue;
+
+ memblock_free(pamt->pamt_base, pamt->pamt_size);
+ }
+ }
+
+ memset(tdx_pamts, 0, sizeof(tdx_pamts));
+ memset(tdx_tdmrs, 0, sizeof(tdx_tdmrs));
+ tdx_nr_tdmrs = 0;
+ return ret;
+}
+
+/*
+ * TODO: a better approach would be to move cpu_vmxon() into asm/virtext.h
+ * and split kvm_cpu_vmxon() into cpu_vmxon() plus intel_pt_handle_vmx(),
+ * so that there is only one cpu_vmxon() implementation, in asm/virtext.h.
+ */
+static inline void cpu_vmxon(u64 vmxon_region)
+{
+ cr4_set_bits(X86_CR4_VMXE);
+ asm volatile ("vmxon %0" : : "m"(vmxon_region));
+}
+
+static inline int tdx_init_vmxon_vmcs(struct vmcs *vmcs)
+{
+ u64 msr;
+
+ /*
+ * Can't enable TDX if VMX is unsupported or disabled by BIOS.
+ * cpu_has(X86_FEATURE_VMX) can't be relied on as the BSP calls this
+ * before the kernel has configured feat_ctl().
+ */
+ if (!cpu_has_vmx())
+ return -ENOTSUPP;
+
+ if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ||
+ !(msr & FEAT_CTL_LOCKED) ||
+ !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))
+ return -ENOTSUPP;
+
+ if (rdmsrl_safe(MSR_IA32_VMX_BASIC, &msr))
+ return -ENOTSUPP;
+
+ memset(vmcs, 0, PAGE_SIZE);
+ vmcs->hdr.revision_id = (u32)msr;
+
+ return 0;
+}
+
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+static inline void tdx_get_keyids(u32 *keyids_start, u32 *nr_keyids)
+{
+ u32 nr_mktme_ids;
+
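+	/*
+	 * The low 32 bits of MSR_IA32_MKTME_KEYID_PART hold the number of
+	 * MKTME-only KeyIDs, the high 32 bits the number of TDX-usable
+	 * KeyIDs.
+	 */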
+ rdmsr(MSR_IA32_MKTME_KEYID_PART, nr_mktme_ids, *nr_keyids);
+
+ /* KeyID 0 is reserved, i.e. KeyIDs are 1-based. */
+ *keyids_start = nr_mktme_ids + 1;
+}
+
+static int tdx_init_ap(unsigned long vmcs)
+{
+ u32 keyids_start, nr_keyids;
+ struct tdx_ex_ret ex_ret;
+ u64 err;
+
+ /*
+	 * MSR_IA32_MKTME_KEYID_PART is core-scoped; disable TDX if this CPU's
+ * partitioning doesn't match the BSP's partitioning.
+ */
+ tdx_get_keyids(&keyids_start, &nr_keyids);
+ if (keyids_start != tdx_keyids_start || nr_keyids != tdx_nr_keyids) {
+ pr_err("MKTME KeyID partioning inconsistent on CPU %u\n",
+ smp_processor_id());
+ return -ENOTSUPP;
+ }
+
+ cpu_vmxon(__pa(vmcs));
+ err = tdsysinitlp(&ex_ret);
+ cpu_vmxoff();
+
+ if (TDX_ERR(err, TDSYSINITLP))
+ return -EIO;
+
+ return 0;
+}
+
+void tdx_init_cpu(struct cpuinfo_x86 *c)
+{
+ unsigned long vmcs;
+
+ /* Allocate VMCS for VMXON. */
+ vmcs = __get_free_page(GFP_KERNEL);
+ if (!vmcs)
+ goto err;
+
+ /* VMCS configuration shouldn't fail at this point. */
+ if (WARN_ON_ONCE(tdx_init_vmxon_vmcs((void *)vmcs)))
+ goto err_vmcs;
+
+ /* BSP does TDSYSINITLP as part of tdx_seam_init(). */
+ if (c != &boot_cpu_data && tdx_init_ap(vmcs))
+ goto err_vmcs;
+
+ this_cpu_write(tdx_vmxon_vmcs, vmcs);
+ return;
+
+err_vmcs:
+ free_page(vmcs);
+err:
+ clear_cpu_cap(c, X86_FEATURE_TDX);
+ atomic_inc(&tdx_init_cpu_errors);
+}
+
+static __init int tdx_init_bsp(void)
+{
+ struct tdx_ex_ret ex_ret;
+ void *vmcs;
+ u64 err;
+ int ret;
+
+ /*
+	 * Detect the KeyID range reserved for TDX. TDX provides a core-scoped
+	 * MSR from which the TDX start keyID and the number of keyIDs can
+	 * simply be read out.
+ */
+ tdx_get_keyids(&tdx_keyids_start, &tdx_nr_keyids);
+ if (!tdx_nr_keyids)
+ return -ENOTSUPP;
+
+ /*
+ * Allocate a temporary VMCS for early BSP init, the VMCS for late(ish)
+ * init will be allocated after the page allocator is up and running.
+ */
+ vmcs = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ if (!vmcs)
+ return -ENOMEM;
+
+ ret = tdx_init_vmxon_vmcs(vmcs);
+ if (ret)
+ goto out;
+
+ cpu_vmxon(__pa(vmcs));
+
+ err = tdsysinit(tdx_sysprof ? BIT(0) : 0, &ex_ret);
+ if (TDX_ERR(err, TDSYSINIT)) {
+ ret = -EIO;
+ goto out_vmxoff;
+ }
+
+ err = tdsysinitlp(&ex_ret);
+ if (TDX_ERR(err, TDSYSINITLP)) {
+ ret = -EIO;
+ goto out_vmxoff;
+ }
+
+ /*
+ * Do TDSYSINFO to collect the information needed to construct TDMRs,
+ * which needs to be done before kernel page allocator is up as the
+ * page allocator can't provide the large chunk (>4MB) of memory needed
+ * for the PAMTs.
+ */
+ err = tdsysinfo(__pa(&tdx_tdsysinfo), sizeof(tdx_tdsysinfo),
+ __pa(tdx_cmrs), TDX1_MAX_NR_CMRS, &ex_ret);
+ if (TDX_ERR(err, TDSYSINFO)) {
+ ret = -EIO;
+ goto out_vmxoff;
+ }
+
+ tdx_nr_cmrs = ex_ret.nr_cmr_entries;
+ ret = 0;
+
+out_vmxoff:
+ cpu_vmxoff();
+out:
+ memblock_free(__pa(vmcs), PAGE_SIZE);
+ return ret;
+}
+
+static bool __init tdx_all_cpus_available(void)
+{
+ /*
+ * CPUs detected in ACPI can be marked as disabled due to:
+ * 1) disabled in ACPI MADT table
+ * 2) disabled by 'disable_cpu_apicid' kernel parameter, which
+ * disables CPU with particular APIC id.
+ * 3) limited by 'nr_cpus' kernel parameter.
+ */
+ if (disabled_cpus) {
+ pr_info("Disabled CPUs detected");
+ goto err;
+ }
+
+ if (num_possible_cpus() < num_processors) {
+ pr_info("Number of CPUs limited by 'possible_cpus' kernel param");
+ goto err;
+ }
+
+ if (setup_max_cpus < num_processors) {
+ pr_info("Boot-time CPUs limited by 'maxcpus' kernel param");
+ goto err;
+ }
+
+ return true;
+
+err:
+ pr_cont(", skipping TDX-SEAM load/config.\n");
+ return false;
+}
+
+static bool __init tdx_get_firmware(struct cpio_data *blob, const char *name)
+{
+ char path[64];
+
+ if (get_builtin_firmware(blob, name))
+ return true;
+
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !initrd_start)
+ return false;
+
+ snprintf(path, sizeof(path), "lib/firmware/%s", name);
+ *blob = find_cpio_data(path, (void *)initrd_start,
+ initrd_end - initrd_start, NULL);
+
+ return !!blob->data;
+}
+
+void __init tdx_seam_init(void)
+{
+ const char *sigstruct_name = "intel-seam/libtdx.so.sigstruct";
+ const char *seamldr_name = "intel-seam/seamldr.acm";
+ const char *module_name = "intel-seam/libtdx.so";
+ struct cpio_data module, sigstruct, seamldr;
+
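+	/*
+	 * This runs well before the rootfs is mounted, so the SEAM module,
+	 * sigstruct and SEAMLDR ACM can only come from built-in firmware or
+	 * from the initrd.
+	 */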
+ /*
+ * Don't load/configure SEAM if not all CPUs can be brought up during
+	 * smp_init(); TDX must execute TDSYSINITLP on all logical processors.
+ */
+ if (!tdx_all_cpus_available())
+ return;
+
+ if (!tdx_get_firmware(&module, module_name))
+ return;
+
+ if (!tdx_get_firmware(&sigstruct, sigstruct_name))
+ return;
+
+ if (!tdx_get_firmware(&seamldr, seamldr_name))
+ return;
+
+ if (seam_load_module(module.data, module.size, sigstruct.data,
+ sigstruct.size, seamldr.data, seamldr.size))
+ return;
+
+ if (tdx_init_bsp() || construct_tdmrs())
+ return;
+
+ setup_force_cpu_cap(X86_FEATURE_TDX);
+}
+
+/*
+ * Set up a one-cpu-per-package array to do package-scoped SEAMCALLs. The array is
+ * only necessary if there are multiple packages.
+ */
+static int __init init_package_masters(void)
+{
+ int cpu, pkg, nr_filled, nr_pkgs;
+
+ nr_pkgs = topology_max_packages();
+ if (nr_pkgs == 1)
+ return 0;
+
+ tdx_package_masters = kcalloc(nr_pkgs, sizeof(int), GFP_KERNEL);
+ if (!tdx_package_masters)
+ return -ENOMEM;
+
+ memset(tdx_package_masters, -1, nr_pkgs * sizeof(int));
+
+ nr_filled = 0;
+ for_each_online_cpu(cpu) {
+ pkg = topology_physical_package_id(cpu);
+ if (tdx_package_masters[pkg] >= 0)
+ continue;
+
+ tdx_package_masters[pkg] = cpu;
+ if (++nr_filled == topology_max_packages())
+ break;
+ }
+
+ if (WARN_ON(nr_filled != topology_max_packages())) {
+ kfree(tdx_package_masters);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void __tdx_seamcall_on_other_pkgs(smp_call_func_t fn, void *param)
+{
+ int i, cpu, cur_package;
+
+ cpu = raw_smp_processor_id();
+ cur_package = topology_physical_package_id(cpu);
+
+ for (i = 0; i < topology_max_packages(); i++) {
+ if (i == cur_package)
+ continue;
+
+ smp_call_function_single(tdx_package_masters[i], fn, param, 1);
+ }
+}
+
+void tdx_seamcall_on_other_pkgs(smp_call_func_t fn, void *param,
+ struct mutex *lock)
+{
+ if (WARN_ON_ONCE(!tdx_package_masters))
+ return;
+
+ mutex_lock(lock);
+ preempt_disable();
+
+ __tdx_seamcall_on_other_pkgs(fn, param);
+
+ preempt_enable();
+ mutex_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(tdx_seamcall_on_other_pkgs);
+
+static void __init tdx_vmxon(void *ret)
+{
+ cpu_vmxon(__pa(this_cpu_read(tdx_vmxon_vmcs)));
+}
+
+static void __init tdx_vmxoff(void *ign)
+{
+ cpu_vmxoff();
+}
+
+static void __init tdx_free_vmxon_vmcs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ free_page(per_cpu(tdx_vmxon_vmcs, cpu));
+ per_cpu(tdx_vmxon_vmcs, cpu) = 0;
+ }
+}
+
+static void __init do_tdsysconfigkey(void *failed)
+{
+ u64 err;
+
+ if (*(int *)failed)
+ return;
+
+ do {
+ err = tdsysconfigkey();
+ } while (err == TDX_KEY_GENERATION_FAILED);
+ TDX_ERR(err, TDSYSCONFIGKEY);
+
+ if (err)
+ *(int *)failed = -EIO;
+}
+
+static void __init __tdx_init_tdmrs(void *failed)
+{
+ struct tdx_ex_ret ex_ret;
+ u64 base, size;
+ u64 err;
+ int i;
+
+ for (i = atomic_fetch_add(1, &tdx_next_tdmr_index);
+ i < tdx_nr_tdmrs;
+ i = atomic_fetch_add(1, &tdx_next_tdmr_index)) {
+ base = tdx_tdmrs[i].base;
+ size = tdx_tdmrs[i].size;
+
+ do {
+ /* Abort if a different CPU failed. */
+ if (atomic_read(failed))
+ return;
+
+ err = tdsysinittdmr(base, &ex_ret);
+ if (TDX_ERR(err, TDSYSINITTDMR)) {
+ atomic_inc(failed);
+ return;
+ }
+ /*
+ * Note, "next" is simply an indicator, base is passed to
+			 * TDSYSINITTDMR on every iteration.
+ */
+ } while (ex_ret.next < (base + size));
+
+ atomic_inc(&tdx_nr_initialized_tdmrs);
+ }
+}
+
+static int __init tdx_init_tdmrs(void)
+{
+ atomic_t failed = ATOMIC_INIT(0);
+
+ /*
+	 * Flush the cache to guarantee no MODIFIED cache lines exist for
+ * PAMTs before TDSYSINITTDMR, which will initialize PAMT memory using
+ * TDX-SEAM's reserved/system HKID.
+ */
+ wbinvd_on_all_cpus();
+
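+	/*
+	 * wait=0: all CPUs, including this one, chew through the TDMR list
+	 * concurrently; completion is tracked via tdx_nr_initialized_tdmrs
+	 * in the loop below.
+	 */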
+ on_each_cpu(__tdx_init_tdmrs, &failed, 0);
+
+ while (atomic_read(&tdx_nr_initialized_tdmrs) < tdx_nr_tdmrs) {
+ if (atomic_read(&failed))
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int __init tdx_init(void)
+{
+ int ret, i;
+ u64 err;
+
+ if (!boot_cpu_has(X86_FEATURE_TDX))
+ return -ENOTSUPP;
+
+ /* Disable TDX if any CPU(s) failed to boot. */
+ if (!cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (atomic_read(&tdx_init_cpu_errors)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = init_package_masters();
+ if (ret)
+ goto err;
+
+ on_each_cpu(tdx_vmxon, NULL, 1);
+
+ for (i = 0; i < tdx_nr_tdmrs; i++)
+ tdx_tdmr_addrs[i] = __pa(&tdx_tdmrs[i]);
+
+ /* Use the first keyID as TDX-SEAM's global key. */
+ err = tdsysconfig(__pa(tdx_tdmr_addrs), tdx_nr_tdmrs, tdx_keyids_start);
+ if (TDX_ERR(err, TDSYSCONFIG)) {
+ ret = -EIO;
+ goto err_vmxoff;
+ }
+
+ do_tdsysconfigkey(&ret);
+ if (!ret && topology_max_packages() > 1)
+ __tdx_seamcall_on_other_pkgs(do_tdsysconfigkey, &ret);
+ if (ret)
+ goto err_vmxoff;
+
+ ret = tdx_init_tdmrs();
+ if (ret)
+ goto err_vmxoff;
+
+ on_each_cpu(tdx_vmxoff, NULL, 1);
+ tdx_free_vmxon_vmcs();
+
+ pr_info("TDX initialized.\n");
+ return 0;
+
+err_vmxoff:
+ on_each_cpu(tdx_vmxoff, NULL, 1);
+err:
+ tdx_free_vmxon_vmcs();
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TDX);
+ return ret;
+}
+arch_initcall(tdx_init);
+
+struct tdsysinfo_struct *tdx_get_sysinfo(void)
+{
+ if (boot_cpu_has(X86_FEATURE_TDX))
+ return &tdx_tdsysinfo;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
+
+int tdx_keyid_alloc(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TDX))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!tdx_keyids_start || !tdx_nr_keyids))
+ return -EINVAL;
+
+ /* The first keyID is reserved for the global key. */
+ return ida_alloc_range(&tdx_keyid_pool, tdx_keyids_start + 1,
+ tdx_keyids_start + tdx_nr_keyids - 2,
+ GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(tdx_keyid_alloc);
+
+void tdx_keyid_free(int keyid)
+{
+	if (keyid <= 0)
+ return;
+
+ ida_free(&tdx_keyid_pool, keyid);
+}
+EXPORT_SYMBOL_GPL(tdx_keyid_free);