Message ID | 20240710062920.73063-1-pbonzini@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
I have only skimmed the diffs. Your knowledge of the deep semantics, gained by close differential reading of intel and amd docs, is truly amazing. Many thanks for pushing this through! I have 2 nits, perhaps stylistic only. For code like "sp -= 2" or "sp += 2" followed or preceded by a write to the stack pointer of a uint16_t variable 'x', would it be better/more robust to rewrite as: "sp -= sizeof(x)" ? There are a lot of masks constructed using -1. I think it would be clearer to use 0xffffffff (for 32-bit masks) as that reminds the reader that this is a bit mask. But it seems that using -1 is how the original code was written. On Tue, Jul 9, 2024 at 11:29 PM Paolo Bonzini <pbonzini@redhat.com> wrote: > This includes bugfixes: > - allowing IRET from user mode to user mode with SMAP (do not use implicit > kernel accesses, which break if the stack is in userspace) > > - use DPL-level accesses for interrupts and call gates > > - various fixes for task switching > > And two related cleanups: computing MMU index once for far calls and > returns > (including task switches), and using X86Access for TSS access. > > Tested with a really ugly patch to kvm-unit-tests, included after > signature. > > Paolo Bonzini (7): > target/i386/tcg: Allow IRET from user mode to user mode with SMAP > target/i386/tcg: use PUSHL/PUSHW for error code > target/i386/tcg: Compute MMU index once > target/i386/tcg: Use DPL-level accesses for interrupts and call gates > target/i386/tcg: check for correct busy state before switching to a > new task > target/i386/tcg: use X86Access for TSS access > target/i386/tcg: save current task state before loading new one > > Richard Henderson (3): > target/i386/tcg: Remove SEG_ADDL > target/i386/tcg: Reorg push/pop within seg_helper.c > target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl > > target/i386/cpu.h | 11 +- > target/i386/cpu.c | 27 +- > target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++---------------- > 3 files changed, 354 insertions(+), 290 deletions(-) > > -- > 2.45.2 > > diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c > index c3ec0ad7..0bf40c6d 100644 > --- a/lib/x86/usermode.c > +++ b/lib/x86/usermode.c > @@ -5,13 +5,15 @@ > #include "x86/desc.h" > #include "x86/isr.h" > #include "alloc.h" > +#include "alloc_page.h" > #include "setjmp.h" > #include "usermode.h" > > #include "libcflat.h" > #include <stdint.h> > > -#define USERMODE_STACK_SIZE 0x2000 > +#define USERMODE_STACK_ORDER 1 /* 8k */ > +#define USERMODE_STACK_SIZE (1 << (12 + USERMODE_STACK_ORDER)) > #define RET_TO_KERNEL_IRQ 0x20 > > static jmp_buf jmpbuf; > @@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > { > extern char ret_to_kernel; > volatile uint64_t rax = 0; > - static unsigned char user_stack[USERMODE_STACK_SIZE]; > + static unsigned char *user_stack; > handler old_ex; > > + if (!user_stack) { > + user_stack = alloc_pages(USERMODE_STACK_ORDER); > + printf("%p\n", user_stack); > + } > + > *raised_vector = 0; > set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3); > old_ex = handle_exception(fault_vector, > @@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > return 0; > } > > + memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8); > + > asm volatile ( > /* Prepare kernel SP for exception handlers */ > "mov %%rsp, %[rsp0]\n\t" > @@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > "pushq %[user_stack_top]\n\t" > "pushfq\n\t" > "pushq %[user_cs]\n\t" > - "lea user_mode(%%rip), %%rax\n\t" > + "lea user_mode+0x800000(%%rip), %%rax\n\t" // > smap.flat places usermode addresses at 8MB-16MB > "pushq %%rax\n\t" > "iretq\n" > > "user_mode:\n\t" > /* Back up volatile registers before invoking func > */ > + "pop %%rax\n\t" > "push %%rcx\n\t" > "push %%rdx\n\t" > "push %%rdi\n\t" > @@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int > fault_vector, > "push %%r10\n\t" > "push %%r11\n\t" > /* Call user mode function */ > + "add $0x800000,%%rbp\n\t" > "mov %[arg1], %%rdi\n\t" > "mov %[arg2], %%rsi\n\t" > "mov %[arg3], %%rdx\n\t" > "mov %[arg4], %%rcx\n\t" > - "call *%[func]\n\t" > + "call *%%rax\n\t" > /* Restore registers */ > "pop %%r11\n\t" > "pop %%r10\n\t" > @@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned > int fault_vector, > [arg2]"m"(arg2), > [arg3]"m"(arg3), > [arg4]"m"(arg4), > - [func]"m"(func), > [user_ds]"i"(USER_DS), > [user_cs]"i"(USER_CS), > [kernel_ds]"rm"(KERNEL_DS), > [user_stack_top]"r"(user_stack + > - sizeof(user_stack)), > + USERMODE_STACK_SIZE - 8), > [kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ)); > > handle_exception(fault_vector, old_ex); > diff --git a/x86/smap.c b/x86/smap.c > index 9a823a55..65119442 100644 > --- a/x86/smap.c > +++ b/x86/smap.c > @@ -2,6 +2,7 @@ > #include <alloc_page.h> > #include "x86/desc.h" > #include "x86/processor.h" > +#include "x86/usermode.h" > #include "x86/vm.h" > > volatile int pf_count = 0; > @@ -89,6 +90,31 @@ static void check_smap_nowp(void) > write_cr3(read_cr3()); > } > > +#ifdef __x86_64__ > +static void iret(void) > +{ > + asm volatile( > + "mov %%rsp, %%rcx;" > + "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;" > + "pushf;" > + "movl %%cs, %%ebx; pushq %%rbx; " > + "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:" > + > + : : : "ebx", "ecx", "cc"); /* RPL=0 */ > +} > + > +static void test_user_iret(void) > +{ > + bool raised_vector; > + uintptr_t user_iret = (uintptr_t)iret + USER_BASE; > + > + run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0, > + &raised_vector); > + > + report(!raised_vector, "No #PF on CPL=3 DPL=3 iret"); > +} > +#endif > + > int main(int ac, char **av) > { > unsigned long i; > @@ -196,7 +222,9 @@ int main(int ac, char **av) > > check_smap_nowp(); > > - // TODO: implicit kernel access from ring 3 (e.g. int) > +#ifdef __x86_64__ > + test_user_iret(); > +#endif > > return report_summary(); > } > > > >
Il mer 10 lug 2024, 23:01 Robert Henry <rrh.henry@gmail.com> ha scritto: > I have only skimmed the diffs. Your knowledge of the deep semantics, > gained by close differential reading of intel and amd docs, is truly > amazing. Many thanks for pushing this through! > Thanks for bringing this to our attention too, apart from the practical bug hopefully it will help future readers to have a more precise implementation. I tried to acknowledge your contribution in the commit messages. I have 2 nits, perhaps stylistic only. > > For code like "sp -= 2" or "sp += 2" followed or preceded by a write to > the stack pointer of a uint16_t variable 'x', would it be better/more > robust to rewrite as: "sp -= sizeof(x)" ? > I think that's intentional because the value subtracted is related to the "stw" or "stl" in the store (likewise for incrementing after a load) more than to the size of x. There are a lot of masks constructed using -1. I think it would be clearer > to use 0xffffffff (for 32-bit masks) as that reminds the reader that this > is a bit mask. But it seems that using -1 is how the original code was > written. > -1 is used for 64-bit masks only. They get unwieldy quickly. :) Paolo > On Tue, Jul 9, 2024 at 11:29 PM Paolo Bonzini <pbonzini@redhat.com> wrote: > >> This includes bugfixes: >> - allowing IRET from user mode to user mode with SMAP (do not use implicit >> kernel accesses, which break if the stack is in userspace) >> >> - use DPL-level accesses for interrupts and call gates >> >> - various fixes for task switching >> >> And two related cleanups: computing MMU index once for far calls and >> returns >> (including task switches), and using X86Access for TSS access. >> >> Tested with a really ugly patch to kvm-unit-tests, included after >> signature. >> >> Paolo Bonzini (7): >> target/i386/tcg: Allow IRET from user mode to user mode with SMAP >> target/i386/tcg: use PUSHL/PUSHW for error code >> target/i386/tcg: Compute MMU index once >> target/i386/tcg: Use DPL-level accesses for interrupts and call gates >> target/i386/tcg: check for correct busy state before switching to a >> new task >> target/i386/tcg: use X86Access for TSS access >> target/i386/tcg: save current task state before loading new one >> >> Richard Henderson (3): >> target/i386/tcg: Remove SEG_ADDL >> target/i386/tcg: Reorg push/pop within seg_helper.c >> target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl >> >> target/i386/cpu.h | 11 +- >> target/i386/cpu.c | 27 +- >> target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++---------------- >> 3 files changed, 354 insertions(+), 290 deletions(-) >> >> -- >> 2.45.2 >> >> diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c >> index c3ec0ad7..0bf40c6d 100644 >> --- a/lib/x86/usermode.c >> +++ b/lib/x86/usermode.c >> @@ -5,13 +5,15 @@ >> #include "x86/desc.h" >> #include "x86/isr.h" >> #include "alloc.h" >> +#include "alloc_page.h" >> #include "setjmp.h" >> #include "usermode.h" >> >> #include "libcflat.h" >> #include <stdint.h> >> >> -#define USERMODE_STACK_SIZE 0x2000 >> +#define USERMODE_STACK_ORDER 1 /* 8k */ >> +#define USERMODE_STACK_SIZE (1 << (12 + USERMODE_STACK_ORDER)) >> #define RET_TO_KERNEL_IRQ 0x20 >> >> static jmp_buf jmpbuf; >> @@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> { >> extern char ret_to_kernel; >> volatile uint64_t rax = 0; >> - static unsigned char user_stack[USERMODE_STACK_SIZE]; >> + static unsigned char *user_stack; >> handler old_ex; >> >> + if (!user_stack) { >> + user_stack = alloc_pages(USERMODE_STACK_ORDER); >> + printf("%p\n", user_stack); >> + } >> + >> *raised_vector = 0; >> set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3); >> old_ex = handle_exception(fault_vector, >> @@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> return 0; >> } >> >> + memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8); >> + >> asm volatile ( >> /* Prepare kernel SP for exception handlers */ >> "mov %%rsp, %[rsp0]\n\t" >> @@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> "pushq %[user_stack_top]\n\t" >> "pushfq\n\t" >> "pushq %[user_cs]\n\t" >> - "lea user_mode(%%rip), %%rax\n\t" >> + "lea user_mode+0x800000(%%rip), %%rax\n\t" // >> smap.flat places usermode addresses at 8MB-16MB >> "pushq %%rax\n\t" >> "iretq\n" >> >> "user_mode:\n\t" >> /* Back up volatile registers before invoking >> func */ >> + "pop %%rax\n\t" >> "push %%rcx\n\t" >> "push %%rdx\n\t" >> "push %%rdi\n\t" >> @@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int >> fault_vector, >> "push %%r10\n\t" >> "push %%r11\n\t" >> /* Call user mode function */ >> + "add $0x800000,%%rbp\n\t" >> "mov %[arg1], %%rdi\n\t" >> "mov %[arg2], %%rsi\n\t" >> "mov %[arg3], %%rdx\n\t" >> "mov %[arg4], %%rcx\n\t" >> - "call *%[func]\n\t" >> + "call *%%rax\n\t" >> /* Restore registers */ >> "pop %%r11\n\t" >> "pop %%r10\n\t" >> @@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned >> int fault_vector, >> [arg2]"m"(arg2), >> [arg3]"m"(arg3), >> [arg4]"m"(arg4), >> - [func]"m"(func), >> [user_ds]"i"(USER_DS), >> [user_cs]"i"(USER_CS), >> [kernel_ds]"rm"(KERNEL_DS), >> [user_stack_top]"r"(user_stack + >> - sizeof(user_stack)), >> + USERMODE_STACK_SIZE - 8), >> [kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ)); >> >> handle_exception(fault_vector, old_ex); >> diff --git a/x86/smap.c b/x86/smap.c >> index 9a823a55..65119442 100644 >> --- a/x86/smap.c >> +++ b/x86/smap.c >> @@ -2,6 +2,7 @@ >> #include <alloc_page.h> >> #include "x86/desc.h" >> #include "x86/processor.h" >> +#include "x86/usermode.h" >> #include "x86/vm.h" >> >> volatile int pf_count = 0; >> @@ -89,6 +90,31 @@ static void check_smap_nowp(void) >> write_cr3(read_cr3()); >> } >> >> +#ifdef __x86_64__ >> +static void iret(void) >> +{ >> + asm volatile( >> + "mov %%rsp, %%rcx;" >> + "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;" >> + "pushf;" >> + "movl %%cs, %%ebx; pushq %%rbx; " >> + "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:" >> + >> + : : : "ebx", "ecx", "cc"); /* RPL=0 */ >> +} >> + >> +static void test_user_iret(void) >> +{ >> + bool raised_vector; >> + uintptr_t user_iret = (uintptr_t)iret + USER_BASE; >> + >> + run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0, >> + &raised_vector); >> + >> + report(!raised_vector, "No #PF on CPL=3 DPL=3 iret"); >> +} >> +#endif >> + >> int main(int ac, char **av) >> { >> unsigned long i; >> @@ -196,7 +222,9 @@ int main(int ac, char **av) >> >> check_smap_nowp(); >> >> - // TODO: implicit kernel access from ring 3 (e.g. int) >> +#ifdef __x86_64__ >> + test_user_iret(); >> +#endif >> >> return report_summary(); >> } >> >> >> >>
diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c index c3ec0ad7..0bf40c6d 100644 --- a/lib/x86/usermode.c +++ b/lib/x86/usermode.c @@ -5,13 +5,15 @@ #include "x86/desc.h" #include "x86/isr.h" #include "alloc.h" +#include "alloc_page.h" #include "setjmp.h" #include "usermode.h" #include "libcflat.h" #include <stdint.h> -#define USERMODE_STACK_SIZE 0x2000 +#define USERMODE_STACK_ORDER 1 /* 8k */ +#define USERMODE_STACK_SIZE (1 << (12 + USERMODE_STACK_ORDER)) #define RET_TO_KERNEL_IRQ 0x20 static jmp_buf jmpbuf; @@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector, { extern char ret_to_kernel; volatile uint64_t rax = 0; - static unsigned char user_stack[USERMODE_STACK_SIZE]; + static unsigned char *user_stack; handler old_ex; + if (!user_stack) { + user_stack = alloc_pages(USERMODE_STACK_ORDER); + printf("%p\n", user_stack); + } + *raised_vector = 0; set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3); old_ex = handle_exception(fault_vector, @@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector, return 0; } + memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8); + asm volatile ( /* Prepare kernel SP for exception handlers */ "mov %%rsp, %[rsp0]\n\t" @@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector, "pushq %[user_stack_top]\n\t" "pushfq\n\t" "pushq %[user_cs]\n\t" - "lea user_mode(%%rip), %%rax\n\t" + "lea user_mode+0x800000(%%rip), %%rax\n\t" // smap.flat places usermode addresses at 8MB-16MB "pushq %%rax\n\t" "iretq\n" "user_mode:\n\t" /* Back up volatile registers before invoking func */ + "pop %%rax\n\t" "push %%rcx\n\t" "push %%rdx\n\t" "push %%rdi\n\t" @@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector, "push %%r10\n\t" "push %%r11\n\t" /* Call user mode function */ + "add $0x800000,%%rbp\n\t" "mov %[arg1], %%rdi\n\t" "mov %[arg2], %%rsi\n\t" "mov %[arg3], %%rdx\n\t" "mov %[arg4], %%rcx\n\t" - "call *%[func]\n\t" + "call *%%rax\n\t" /* Restore registers */ "pop %%r11\n\t" "pop %%r10\n\t" @@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned int fault_vector, [arg2]"m"(arg2), [arg3]"m"(arg3), [arg4]"m"(arg4), - [func]"m"(func), [user_ds]"i"(USER_DS), [user_cs]"i"(USER_CS), [kernel_ds]"rm"(KERNEL_DS), [user_stack_top]"r"(user_stack + - sizeof(user_stack)), + USERMODE_STACK_SIZE - 8), [kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ)); handle_exception(fault_vector, old_ex); diff --git a/x86/smap.c b/x86/smap.c index 9a823a55..65119442 100644 --- a/x86/smap.c +++ b/x86/smap.c @@ -2,6 +2,7 @@ #include <alloc_page.h> #include "x86/desc.h" #include "x86/processor.h" +#include "x86/usermode.h" #include "x86/vm.h" volatile int pf_count = 0; @@ -89,6 +90,31 @@ static void check_smap_nowp(void) write_cr3(read_cr3()); } +#ifdef __x86_64__ +static void iret(void) +{ + asm volatile( + "mov %%rsp, %%rcx;" + "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;" + "pushf;" + "movl %%cs, %%ebx; pushq %%rbx; " + "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:" + + : : : "ebx", "ecx", "cc"); /* RPL=0 */ +} + +static void test_user_iret(void) +{ + bool raised_vector; + uintptr_t user_iret = (uintptr_t)iret + USER_BASE; + + run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0, + &raised_vector); + + report(!raised_vector, "No #PF on CPL=3 DPL=3 iret"); +} +#endif + int main(int ac, char **av) { unsigned long i; @@ -196,7 +222,9 @@ int main(int ac, char **av) check_smap_nowp(); - // TODO: implicit kernel access from ring 3 (e.g. int) +#ifdef __x86_64__ + test_user_iret(); +#endif return report_summary(); }