@@ -114,11 +114,13 @@ bool platform_tdx_enabled(void);
int tdx_cpu_enable(void);
int tdx_enable(void);
void tdx_reset_memory(void);
+bool tdx_is_private_mem(unsigned long phys);
#else
static inline bool platform_tdx_enabled(void) { return false; }
static inline int tdx_cpu_enable(void) { return -ENODEV; }
static inline int tdx_enable(void) { return -ENODEV; }
static inline void tdx_reset_memory(void) { }
+static inline bool tdx_is_private_mem(unsigned long phys) { return false; }
#endif /* CONFIG_INTEL_TDX_HOST */
#endif /* !__ASSEMBLY__ */
@@ -52,6 +52,7 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
+#include <asm/tdx.h>
#include "internal.h"
@@ -228,11 +229,34 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
+static const char *mce_memory_info(struct mce *m)
+{
+	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
+		return NULL;
+
+	/*
+	 * Certain initial generations of TDX-capable CPUs have an
+	 * erratum.  A kernel non-temporal partial write to TDX private
+	 * memory poisons that memory, and a subsequent read of that
+	 * memory triggers #MC.
+	 *
+	 * However, such a #MC caused by software cannot be distinguished
+	 * from a real hardware #MC.  Just print an additional message
+	 * to note that such a #MC may be a result of the CPU erratum.
+	 */
+	if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+		return NULL;
+
+	return !tdx_is_private_mem(m->addr) ? NULL :
+		"TDX private memory error. Possible kernel bug.";
+}
+
static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
+ const char *memmsg;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -283,6 +307,15 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
}
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
+	/*
+	 * On confidential computing platforms such as TDX, an MCE
+	 * may occur due to incorrect access to confidential
+	 * memory.  Print additional information for such an error.
+	 */
+ memmsg = mce_memory_info(final);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
@@ -1348,6 +1348,109 @@ void tdx_reset_memory(void)
tdmrs_reset_pamt_all(&tdx_tdmr_list);
}
+static bool is_pamt_page(unsigned long phys)
+{
+	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
+	int i;
+
+	/*
+	 * This function is called from the #MC handler, and in theory
+	 * it could run in parallel with the TDX module initialization
+	 * on other logical cpus.  But it's not OK to hold a mutex here
+	 * so just blindly check the module status to make sure the
+	 * PAMTs/TDMRs are stable to access.
+	 *
+	 * This may return an inaccurate result in rare cases, e.g. when
+	 * a #MC happens on a PAMT page during module initialization, but
+	 * this is fine as the #MC handler doesn't need a 100% accurate
+	 * result.
+	 */
+	if (tdx_module_status != TDX_MODULE_INITIALIZED)
+		return false;
+
+	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+		unsigned long base, size;
+
+		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
+
+		if (phys >= base && phys < (base + size))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return whether the memory page at the given physical address is TDX
+ * private memory or not.  Called from the #MC handler do_machine_check().
+ *
+ * Note this function may not return an accurate result in rare cases.
+ * This is fine as the #MC handler doesn't need a 100% accurate result,
+ * because it cannot distinguish a #MC between a software bug and a real
+ * hardware error anyway.
+ */
+bool tdx_is_private_mem(unsigned long phys)
+{
+	struct tdx_module_args args = {
+		.rcx = phys & PAGE_MASK,
+	};
+	u64 sret;
+
+	if (!platform_tdx_enabled())
+		return false;
+
+	/* Get page type from the TDX module */
+	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
+	/*
+	 * Handle the case that the CPU isn't in VMX operation.
+	 *
+	 * KVM guarantees no VM is running (thus no TDX guest)
+	 * when any online CPU isn't in VMX operation.
+	 * This means there will be no TDX guest private memory
+	 * and Secure-EPT pages.  However the TDX module may have
+	 * been initialized and the memory page could be a PAMT page.
+	 */
+	if (sret == TDX_SEAMCALL_UD)
+		return is_pamt_page(phys);
+
+	/*
+	 * Any other failure means:
+	 *
+	 * 1) TDX module not loaded; or
+	 * 2) Memory page isn't managed by the TDX module.
+	 *
+	 * In either case, the memory page cannot be a TDX
+	 * private page.
+	 */
+	if (sret)
+		return false;
+
+	/*
+	 * SEAMCALL was successful -- read page type (via RCX):
+	 *
+	 *  - PT_NDA:	Page is not used by the TDX module
+	 *  - PT_RSVD:	Reserved for Non-TDX use
+	 *  - Others:	Page is used by the TDX module
+	 *
+	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
+	 * private memory.
+	 *
+	 * Note: Even if the page type is PT_NDA, the memory page could
+	 * still be associated with a TDX private KeyID if the kernel
+	 * hasn't explicitly used MOVDIR64B to clear the page.  Assume
+	 * KVM always does that after reclaiming any private page from
+	 * TDX guests.
+	 */
+	switch (args.rcx) {
+	case PT_NDA:
+		return false;
+	case PT_RSVD:
+		return is_pamt_page(phys);
+	default:
+		return true;
+	}
+}
+
static int __init record_keyid_partitioning(u32 *tdx_keyid_start,
u32 *nr_tdx_keyids)
{
@@ -16,6 +16,7 @@
/*
* TDX module SEAMCALL leaf functions
*/
+#define TDH_PHYMEM_PAGE_RDMD 24
#define TDH_SYS_KEY_CONFIG 31
#define TDH_SYS_INFO 32
#define TDH_SYS_INIT 33
@@ -23,6 +24,10 @@
#define TDH_SYS_TDMR_INIT 36
#define TDH_SYS_CONFIG 45
+/* TDX page types */
+#define PT_NDA 0x0
+#define PT_RSVD 0x1
+
struct cmr_info {
u64 base;
u64 size;