Message ID | 20231016132819.1002933-12-michael.roth@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support | expand |
On Mon, Oct 16, 2023 at 08:27:40AM -0500, Michael Roth wrote: > From: Brijesh Singh <brijesh.singh@amd.com> > > The RMPUPDATE instruction writes a new RMP entry in the RMP Table. The > hypervisor will use the instruction to add pages to the RMP table. See > APM3 for details on the instruction operations. > > The PSMASH instruction expands a 2MB RMP entry into a corresponding set > of contiguous 4KB-Page RMP entries. The hypervisor will use this s/-Page// > instruction to adjust the RMP entry without invalidating the previous > RMP entry. "... without invalidating it." This below is useless text in the commit message - that should be all visible from the patch itself. > Add the following external interface API functions: > > psmash(): Used to smash a 2MB aligned page into 4K pages while > preserving the Validated bit in the RMP. > > rmp_make_private(): Used to assign a page to guest using the RMPUPDATE > instruction. > > rmp_make_shared(): Used to transition a page to hypervisor/shared > state using the RMPUPDATE instruction. > > Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Since Brijesh is the author, first comes his SOB, then Ashish's and then yours. > [mdr: add RMPUPDATE retry logic for transient FAIL_OVERLAP errors] > Signed-off-by: Michael Roth <michael.roth@amd.com> > --- > arch/x86/include/asm/sev-common.h | 14 +++++ > arch/x86/include/asm/sev-host.h | 10 ++++ > arch/x86/virt/svm/sev.c | 89 +++++++++++++++++++++++++++++++ > 3 files changed, 113 insertions(+) > > diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h > index 1e6fb93d8ab0..93ec8c12c91d 100644 > --- a/arch/x86/include/asm/sev-common.h > +++ b/arch/x86/include/asm/sev-common.h > @@ -173,8 +173,22 @@ struct snp_psc_desc { > #define GHCB_ERR_INVALID_INPUT 5 > #define GHCB_ERR_INVALID_EVENT 6 > > +/* RMUPDATE detected 4K page and 2MB page overlap. */ > +#define RMPUPDATE_FAIL_OVERLAP 4 > + > /* RMP page size */ > #define RMP_PG_SIZE_4K 0 > +#define RMP_PG_SIZE_2M 1 RMP_PG_LEVEL_2M > #define RMP_TO_X86_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M) > +#define X86_TO_RMP_PG_LEVEL(level) (((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M) > + > +struct rmp_state { > + u64 gpa; > + u8 assigned; > + u8 pagesize; > + u8 immutable; > + u8 rsvd; > + u32 asid; > +} __packed; > > #endif > diff --git a/arch/x86/include/asm/sev-host.h b/arch/x86/include/asm/sev-host.h > index bb06c57f2909..1df989411334 100644 > --- a/arch/x86/include/asm/sev-host.h > +++ b/arch/x86/include/asm/sev-host.h > @@ -16,9 +16,19 @@ > #ifdef CONFIG_KVM_AMD_SEV > int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); > void sev_dump_hva_rmpentry(unsigned long address); > +int psmash(u64 pfn); > +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable); > +int rmp_make_shared(u64 pfn, enum pg_level level); > #else > static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENXIO; } > static inline void sev_dump_hva_rmpentry(unsigned long address) {} > +static inline int psmash(u64 pfn) { return -ENXIO; } > +static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, > + bool immutable) > +{ > + return -ENXIO; > +} > +static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENXIO; } > #endif > > #endif > diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c > index cac3e311c38f..24a695af13a5 100644 > --- a/arch/x86/virt/svm/sev.c > +++ b/arch/x86/virt/svm/sev.c > @@ -367,3 +367,92 @@ void sev_dump_hva_rmpentry(unsigned long hva) > sev_dump_rmpentry(pte_pfn(*pte)); > } > EXPORT_SYMBOL_GPL(sev_dump_hva_rmpentry); > + > +/* > + * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the > + * Validated bit. > + */ > +int psmash(u64 pfn) > +{ > + unsigned long paddr = pfn << PAGE_SHIFT; > + int ret; > + > + pr_debug("%s: PFN: 0x%llx\n", __func__, pfn); Left over? > + > + if (!pfn_valid(pfn)) > + return -EINVAL; > + > + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) > + return -ENXIO; That needs to happen first in the function. > + > + /* Binutils version 2.36 supports the PSMASH mnemonic. */ > + asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" > + : "=a"(ret) > + : "a"(paddr) Add an empty space between the " and the ( > + : "memory", "cc"); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(psmash); > + > +static int rmpupdate(u64 pfn, struct rmp_state *val) rmp_state *state so that it is clear what this is. > +{ > + unsigned long paddr = pfn << PAGE_SHIFT; > + int ret, level, npages; > + int attempts = 0; > + > + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) > + return -ENXIO; > + > + do { > + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ > + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" > + : "=a"(ret) > + : "a"(paddr), "c"((unsigned long)val) Add an empty space between the " and the ( > + : "memory", "cc"); > + > + attempts++; > + } while (ret == RMPUPDATE_FAIL_OVERLAP); What's the logic here? Loop as long as it says "overlap"? How "transient" is that overlapping condition? What's the upper limit of that loop? This loop should check a generously chosen upper limit of attempts and then break if that limit is reached. > + if (ret) { > + pr_err("RMPUPDATE failed after %d attempts, ret: %d, pfn: %llx, npages: %d, level: %d\n", > + attempts, ret, pfn, npages, level); You're dumping here uninitialized stack variables npages and level. Looks like leftover from some prior version of this function. > + sev_dump_rmpentry(pfn); > + dump_stack(); This is going to become real noisy on a huge machine with a lot of SNP guests. > + return -EFAULT; > + } > + > + return 0; > +} > + > +/* > + * Assign a page to guest using the RMPUPDATE instruction. > + */ One-line comment works too. /* Assign ... */ > +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable) > +{ > + struct rmp_state val; > + > + memset(&val, 0, sizeof(val)); > + val.assigned = 1; > + val.asid = asid; > + val.immutable = immutable; > + val.gpa = gpa; > + val.pagesize = X86_TO_RMP_PG_LEVEL(level); > + > + return rmpupdate(pfn, &val); > +} > +EXPORT_SYMBOL_GPL(rmp_make_private); > + > +/* > + * Transition a page to hypervisor/shared state using the RMPUPDATE instruction. > + */ Ditto. > +int rmp_make_shared(u64 pfn, enum pg_level level) > +{ > + struct rmp_state val; > + > + memset(&val, 0, sizeof(val)); > + val.pagesize = X86_TO_RMP_PG_LEVEL(level); > + > + return rmpupdate(pfn, &val); > +} > +EXPORT_SYMBOL_GPL(rmp_make_shared); > -- Thx.
On Tue, Nov 21, 2023 at 05:21:49PM +0100, Borislav Petkov wrote: > On Mon, Oct 16, 2023 at 08:27:40AM -0500, Michael Roth wrote: > > +static int rmpupdate(u64 pfn, struct rmp_state *val) > > rmp_state *state > > so that it is clear what this is. > > > +{ > > + unsigned long paddr = pfn << PAGE_SHIFT; > > + int ret, level, npages; > > + int attempts = 0; > > + > > + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) > > + return -ENXIO; > > + > > + do { > > + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ > > + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" > > + : "=a"(ret) > > + : "a"(paddr), "c"((unsigned long)val) > > Add an empty space between the " and the ( > > > + : "memory", "cc"); > > + > > + attempts++; > > + } while (ret == RMPUPDATE_FAIL_OVERLAP); > > What's the logic here? Loop as long as it says "overlap"? > > How "transient" is that overlapping condition? > > What's the upper limit of that loop? > > This loop should check a generously chosen upper limit of attempts and > then break if that limit is reached. We've raised similar questions to David Kaplan and discussed this to a fair degree. The transient condition here is due to firmware locking the 2MB-aligned RMP entry for the range to handle atomic updates. There is no upper bound on retries or the amount of time spent, but it is always transient since multiple hypervisor implementations now depend on this and any deviation from this assurance would constitute a firmware regression. A good torture test for this path is lots of 4K-only guests doing concurrent boot/shutdowns in a tight loop. With week-long runs the longest delay seen was on the order of 100ns, but there's no real correlation between time spent and number of retries, sometimes 100ns delays only involve 1 retry, sometimes much smaller time delays involve hundreds of retries, and it all depends on what firmware is doing, so there's no way to infer a safe retry limit based on that data. All that said, there are unfortunately other conditions that can trigger non-transient RMPUPDATE_FAIL_OVERLAP failures, and these will result in an infinite loop. Those are the result of host misbehavior however, like trying to set up 2MB private RMP entries when there are already private 4K entries in the range. Ideally these would be separate error codes, but even if that were changed in firmware we'd still need code to support older firmwares that don't disambiguate so not sure this situation can be improved much. > > > + if (ret) { > > + pr_err("RMPUPDATE failed after %d attempts, ret: %d, pfn: %llx, npages: %d, level: %d\n", > > + attempts, ret, pfn, npages, level); > > You're dumping here uninitialized stack variables npages and level. > Looks like leftover from some prior version of this function. Yah, I'll clean this up. I think logging the attempts probably doesn't have much use anymore either. > > > + sev_dump_rmpentry(pfn); > > + dump_stack(); > > This is going to become real noisy on a huge machine with a lot of SNP > guests. Since the transient case will eventually resolve to ret==0, we will only get here on a kernel oops sort of condition where a stack dump seems appropriate. rmpupdate() shouldn't error during normal operation, and if it ever does it will likely be a fatal situation where those stack dumps will be useful. Thanks, Mike > > Thx. > > -- > Regards/Gruss, > Boris. > > https://people.kernel.org/tglx/notes-about-netiquette >
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1e6fb93d8ab0..93ec8c12c91d 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -173,8 +173,22 @@ struct snp_psc_desc { #define GHCB_ERR_INVALID_INPUT 5 #define GHCB_ERR_INVALID_EVENT 6 +/* RMUPDATE detected 4K page and 2MB page overlap. */ +#define RMPUPDATE_FAIL_OVERLAP 4 + /* RMP page size */ #define RMP_PG_SIZE_4K 0 +#define RMP_PG_SIZE_2M 1 #define RMP_TO_X86_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M) +#define X86_TO_RMP_PG_LEVEL(level) (((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M) + +struct rmp_state { + u64 gpa; + u8 assigned; + u8 pagesize; + u8 immutable; + u8 rsvd; + u32 asid; +} __packed; #endif diff --git a/arch/x86/include/asm/sev-host.h b/arch/x86/include/asm/sev-host.h index bb06c57f2909..1df989411334 100644 --- a/arch/x86/include/asm/sev-host.h +++ b/arch/x86/include/asm/sev-host.h @@ -16,9 +16,19 @@ #ifdef CONFIG_KVM_AMD_SEV int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); void sev_dump_hva_rmpentry(unsigned long address); +int psmash(u64 pfn); +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable); +int rmp_make_shared(u64 pfn, enum pg_level level); #else static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENXIO; } static inline void sev_dump_hva_rmpentry(unsigned long address) {} +static inline int psmash(u64 pfn) { return -ENXIO; } +static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, + bool immutable) +{ + return -ENXIO; +} +static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENXIO; } #endif #endif diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index cac3e311c38f..24a695af13a5 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -367,3 +367,92 @@ void sev_dump_hva_rmpentry(unsigned long hva) sev_dump_rmpentry(pte_pfn(*pte)); } EXPORT_SYMBOL_GPL(sev_dump_hva_rmpentry); + +/* + * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the + * Validated bit. + */ +int psmash(u64 pfn) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret; + + pr_debug("%s: PFN: 0x%llx\n", __func__, pfn); + + if (!pfn_valid(pfn)) + return -EINVAL; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return -ENXIO; + + /* Binutils version 2.36 supports the PSMASH mnemonic. */ + asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" + : "=a"(ret) + : "a"(paddr) + : "memory", "cc"); + + return ret; +} +EXPORT_SYMBOL_GPL(psmash); + +static int rmpupdate(u64 pfn, struct rmp_state *val) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret, level, npages; + int attempts = 0; + + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return -ENXIO; + + do { + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" + : "=a"(ret) + : "a"(paddr), "c"((unsigned long)val) + : "memory", "cc"); + + attempts++; + } while (ret == RMPUPDATE_FAIL_OVERLAP); + + if (ret) { + pr_err("RMPUPDATE failed after %d attempts, ret: %d, pfn: %llx, npages: %d, level: %d\n", + attempts, ret, pfn, npages, level); + sev_dump_rmpentry(pfn); + dump_stack(); + return -EFAULT; + } + + return 0; +} + +/* + * Assign a page to guest using the RMPUPDATE instruction. + */ +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable) +{ + struct rmp_state val; + + memset(&val, 0, sizeof(val)); + val.assigned = 1; + val.asid = asid; + val.immutable = immutable; + val.gpa = gpa; + val.pagesize = X86_TO_RMP_PG_LEVEL(level); + + return rmpupdate(pfn, &val); +} +EXPORT_SYMBOL_GPL(rmp_make_private); + +/* + * Transition a page to hypervisor/shared state using the RMPUPDATE instruction. + */ +int rmp_make_shared(u64 pfn, enum pg_level level) +{ + struct rmp_state val; + + memset(&val, 0, sizeof(val)); + val.pagesize = X86_TO_RMP_PG_LEVEL(level); + + return rmpupdate(pfn, &val); +} +EXPORT_SYMBOL_GPL(rmp_make_shared);