Message ID | 1531308586-29340-11-git-send-email-joro@8bytes.org (mailing list archive) |
---|---|
State | New, archived |
On Wed, Jul 11, 2018 at 4:29 AM, Joerg Roedel <joro@8bytes.org> wrote:
> From: Joerg Roedel <jroedel@suse.de>
>
> It can happen that we enter the kernel from kernel-mode and
> on the entry-stack. The most common way this happens is when
> we get an exception while loading the user-space segment
> registers on the kernel-to-userspace exit path.
>
> The segment loading needs to be done after the entry-stack
> switch, because the stack-switch needs kernel %fs for
> per_cpu access.
>
> When this happens, we need to make sure that we leave the
> kernel with the entry-stack again, so that the interrupted
> code-path runs on the right stack when switching to the
> user-cr3.
>
> We do this by detecting this condition on kernel-entry by
> checking CS.RPL and %esp, and if it happens, we copy over
> the complete content of the entry stack to the task-stack.
> This needs to be done because once we enter the exception
> handlers we might be scheduled out or even migrated to a
> different CPU, so that we can't rely on the entry-stack
> contents. We also leave a marker in the stack-frame to
> detect this condition on the exit path.
>
> On the exit path the copy is reversed, we copy all of the
> remaining task-stack back to the entry-stack and switch
> to it.
>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
>  arch/x86/entry/entry_32.S | 116 +++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 115 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 3d1a114..b3af76e 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -299,6 +299,9 @@
>   * copied there. So allocate the stack-frame on the task-stack and
>   * switch to it before we do any copying.
>   */
> +
> +#define CS_FROM_ENTRY_STACK	(1 << 31)
> +
>  .macro SWITCH_TO_KERNEL_STACK
>
>  	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
> @@ -320,6 +323,16 @@
>  	/* Load top of task-stack into %edi */
>  	movl	TSS_entry_stack(%edi), %edi
>
> +	/*
> +	 * Clear upper bits of the CS slot in pt_regs in case hardware
> +	 * didn't clear it for us
> +	 */
> +	andl	$(0x0000ffff), PT_CS(%esp)

The comment is highly confusing, given that the upper bits aren't part
of the slot any more:

commit 385eca8f277c4c34f361a4c3a088fd876d29ae21
Author: Andy Lutomirski <luto@kernel.org>
Date:   Fri Jul 28 06:00:30 2017 -0700

    x86/asm/32: Make pt_regs's segment registers be 16 bits

What you're really doing is keeping it available for an extra flag.
Please update the comment as such. But see below.

> +
> +	/* Special case - entry from kernel mode via entry stack */
> +	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
> +	jz	.Lentry_from_kernel_\@
> +
>  	/* Bytes to copy */
>  	movl	$PTREGS_SIZE, %ecx
>
> @@ -333,8 +346,8 @@
>  	 */
>  	addl	$(4 * 4), %ecx
>
> -.Lcopy_pt_regs_\@:
>  #endif
> +.Lcopy_pt_regs_\@:
>
>  	/* Allocate frame on task-stack */
>  	subl	%ecx, %edi
> @@ -350,6 +363,56 @@
>  	cld
>  	rep movsl
>
> +	jmp	.Lend_\@
> +
> +.Lentry_from_kernel_\@:
> +
> +	/*
> +	 * This handles the case when we enter the kernel from
> +	 * kernel-mode and %esp points to the entry-stack. When this
> +	 * happens we need to switch to the task-stack to run C code,
> +	 * but switch back to the entry-stack again when we approach
> +	 * iret and return to the interrupted code-path. This usually
> +	 * happens when we hit an exception while restoring user-space
> +	 * segment registers on the way back to user-space.
> +	 *
> +	 * When we switch to the task-stack here, we can't trust the
> +	 * contents of the entry-stack anymore, as the exception handler
> +	 * might be scheduled out or moved to another CPU. Therefore we
> +	 * copy the complete entry-stack to the task-stack and set a
> +	 * marker in the iret-frame (bit 31 of the CS dword) to detect
> +	 * what we've done on the iret path.
> +	 *
> +	 * On the iret path we copy everything back and switch to the
> +	 * entry-stack, so that the interrupted kernel code-path
> +	 * continues on the same stack it was interrupted with.
> +	 *
> +	 * Be aware that an NMI can happen anytime in this code.
> +	 *
> +	 * %esi: Entry-Stack pointer (same as %esp)
> +	 * %edi: Top of the task stack
> +	 */
> +
> +	/* Calculate number of bytes on the entry stack in %ecx */
> +	movl	%esi, %ecx
> +
> +	/* %ecx to the top of entry-stack */
> +	andl	$(MASK_entry_stack), %ecx
> +	addl	$(SIZEOF_entry_stack), %ecx
> +
> +	/* Number of bytes on the entry stack to %ecx */
> +	sub	%esi, %ecx
> +
> +	/* Mark stackframe as coming from entry stack */
> +	orl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
> +
> +	/*
> +	 * %esi and %edi are unchanged, %ecx contains the number of
> +	 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
> +	 * the stack-frame on task-stack and copy everything over
> +	 */
> +	jmp .Lcopy_pt_regs_\@
> +
>  .Lend_\@:
>  .endm
>
> @@ -408,6 +471,56 @@
>  .endm
>
>  /*
> + * This macro handles the case when we return to kernel-mode on the iret
> + * path and have to switch back to the entry stack.
> + *
> + * See the comments below the .Lentry_from_kernel_\@ label in the
> + * SWITCH_TO_KERNEL_STACK macro for more details.
> + */
> +.macro PARANOID_EXIT_TO_KERNEL_MODE
> +
> +	/*
> +	 * Test if we entered the kernel with the entry-stack. Most
> +	 * likely we did not, because this code only runs on the
> +	 * return-to-kernel path.
> +	 */
> +	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
> +	jz	.Lend_\@
> +
> +	/* Unlikely slow-path */
> +
> +	/* Clear marker from stack-frame */
> +	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
> +
> +	/* Copy the remaining task-stack contents to entry-stack */
> +	movl	%esp, %esi
> +	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi

I'm confused. Why do we need any special handling here at all? How
could we end up with the contents of the stack frame we interrupted in
a corrupt state?

I guess I don't understand why this patch is needed.
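As background for the flag being discussed here: since commit 385eca8f277c the segment fields in pt_regs are only 16 bits wide, so the upper half of the 32-bit CS slot that the CPU pushes on the stack carries no segment information and can hold a software flag such as CS_FROM_ENTRY_STACK. Below is a minimal userspace C sketch of that convention; the struct and helper names are invented for illustration and are not part of the patch.

```c
#include <stdint.h>
#include <stdio.h>

/* Software flag stored in the otherwise-unused upper half of the CS dword. */
#define CS_FROM_ENTRY_STACK (1u << 31)

/* Simplified stand-in for the CS slot of an iret frame (hypothetical). */
struct fake_iret_frame {
	uint32_t cs;	/* only the low 16 bits hold a real segment selector */
};

static void mark_from_entry_stack(struct fake_iret_frame *frame)
{
	frame->cs &= 0x0000ffffu;		/* make the upper bits well-defined */
	frame->cs |= CS_FROM_ENTRY_STACK;	/* remember how we entered */
}

static int came_from_entry_stack(struct fake_iret_frame *frame)
{
	if (frame->cs & CS_FROM_ENTRY_STACK) {
		frame->cs &= ~CS_FROM_ENTRY_STACK;	/* clear the marker before iret */
		return 1;
	}
	return 0;
}

int main(void)
{
	struct fake_iret_frame frame = { .cs = 0x0060 };	/* hypothetical kernel CS value */

	mark_from_entry_stack(&frame);
	printf("from entry stack: %d, cs now 0x%04x\n",
	       came_from_entry_stack(&frame), (unsigned)(frame.cs & 0xffffu));
	return 0;
}
```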
On Fri, Jul 13, 2018 at 04:31:02PM -0700, Andy Lutomirski wrote:
> What you're really doing is keeping it available for an extra flag.
> Please update the comment as such. But see below.

Thanks, will do.

> > +.macro PARANOID_EXIT_TO_KERNEL_MODE
> > +
> > +	/*
> > +	 * Test if we entered the kernel with the entry-stack. Most
> > +	 * likely we did not, because this code only runs on the
> > +	 * return-to-kernel path.
> > +	 */
> > +	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
> > +	jz	.Lend_\@
> > +
> > +	/* Unlikely slow-path */
> > +
> > +	/* Clear marker from stack-frame */
> > +	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
> > +
> > +	/* Copy the remaining task-stack contents to entry-stack */
> > +	movl	%esp, %esi
> > +	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
>
> I'm confused. Why do we need any special handling here at all? How
> could we end up with the contents of the stack frame we interrupted in
> a corrupt state?
>
> I guess I don't understand why this patch is needed.

The patch is needed because we can get exceptions in kernel-mode while
we are already on user-cr3 and entry-stack. In this case we need to
return with user-cr3 and entry-stack to the kernel too, otherwise we
would go to user-space with kernel-cr3.

So based on that, I did the above because the entry-stack is a per-cpu
data structure and I am not sure that we always return from the exception
on the same CPU where we got it. Therefore the path is called
PARANOID_... :)

Regards,

	Joerg
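The situation Joerg describes boils down to two checks on the saved state, as the commit message puts it: look at CS.RPL to see whether the exception interrupted kernel-mode code, and look at %esp to see whether that code was already running on the per-CPU entry stack. A rough, userspace-only C sketch of those checks follows; the function names, the range check, and the numeric values are illustrative assumptions, not taken from the patch.

```c
#include <stdint.h>
#include <stdio.h>

#define SEGMENT_RPL_MASK 0x3u	/* low two bits of a selector: requested privilege level */

/* Saved CS with RPL == 0 means the exception interrupted kernel-mode code. */
static int entered_from_kernel_mode(uint32_t saved_cs)
{
	return (saved_cs & SEGMENT_RPL_MASK) == 0;
}

/* Was the interrupted %esp already inside this CPU's entry stack? (illustrative) */
static int on_entry_stack(uint32_t esp, uint32_t stack_base, uint32_t stack_size)
{
	return esp >= stack_base && esp < stack_base + stack_size;
}

int main(void)
{
	/* Hypothetical values: kernel CS selector 0x60, entry stack at 0xffffa000. */
	printf("from kernel mode: %d\n", entered_from_kernel_mode(0x60));
	printf("on entry stack:   %d\n", on_entry_stack(0xffffafc0u, 0xffffa000u, 0x1000u));
	return 0;
}
```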
> On Jul 13, 2018, at 10:21 PM, Joerg Roedel <jroedel@suse.de> wrote:
>
>> On Fri, Jul 13, 2018 at 04:31:02PM -0700, Andy Lutomirski wrote:
>> What you're really doing is keeping it available for an extra flag.
>> Please update the comment as such. But see below.
>
> Thanks, will do.
>
>>> +.macro PARANOID_EXIT_TO_KERNEL_MODE
>>> +
>>> +	/*
>>> +	 * Test if we entered the kernel with the entry-stack. Most
>>> +	 * likely we did not, because this code only runs on the
>>> +	 * return-to-kernel path.
>>> +	 */
>>> +	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
>>> +	jz	.Lend_\@
>>> +
>>> +	/* Unlikely slow-path */
>>> +
>>> +	/* Clear marker from stack-frame */
>>> +	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
>>> +
>>> +	/* Copy the remaining task-stack contents to entry-stack */
>>> +	movl	%esp, %esi
>>> +	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
>>
>> I'm confused. Why do we need any special handling here at all? How
>> could we end up with the contents of the stack frame we interrupted in
>> a corrupt state?
>>
>> I guess I don't understand why this patch is needed.
>
> The patch is needed because we can get exceptions in kernel-mode while
> we are already on user-cr3 and entry-stack. In this case we need to
> return with user-cr3 and entry-stack to the kernel too, otherwise we
> would go to user-space with kernel-cr3.
>
> So based on that, I did the above because the entry-stack is a per-cpu
> data structure and I am not sure that we always return from the exception
> on the same CPU where we got it. Therefore the path is called
> PARANOID_... :)

But we should just be able to IRET and end up right back on the entry
stack where we were when we got interrupted.

On x86_64, we *definitely* can’t schedule in NMI, MCE, or #DB because
we’re on a percpu stack. Are you *sure* we need this patch?

>
>
> Regards,
>
> Joerg
>
On Fri, Jul 13, 2018 at 11:26:54PM -0700, Andy Lutomirski wrote:
> > So based on that, I did the above because the entry-stack is a per-cpu
> > data structure and I am not sure that we always return from the exception
> > on the same CPU where we got it. Therefore the path is called
> > PARANOID_... :)
>
> But we should just be able to IRET and end up right back on the entry
> stack where we were when we got interrupted.

Yeah, but using another CPU's entry-stack is a bad idea, no? Especially
since the owning CPU might have overwritten our content there already.

> On x86_64, we *definitely* can’t schedule in NMI, MCE, or #DB because
> we’re on a percpu stack. Are you *sure* we need this patch?

I am sure we need this patch, but not 100% sure that we really can
change CPUs in this path. We are not only talking about NMI, #MC and
#DB, but also about #GP and every other exception that can happen while
writing segment registers or on iret. With this implementation we are
on the safe side for this unlikely slow-path.

Regards,

	Joerg
> On Jul 14, 2018, at 1:01 AM, Joerg Roedel <jroedel@suse.de> wrote:
>
> On Fri, Jul 13, 2018 at 11:26:54PM -0700, Andy Lutomirski wrote:
>>> So based on that, I did the above because the entry-stack is a per-cpu
>>> data structure and I am not sure that we always return from the exception
>>> on the same CPU where we got it. Therefore the path is called
>>> PARANOID_... :)
>>
>> But we should just be able to IRET and end up right back on the entry
>> stack where we were when we got interrupted.
>
> Yeah, but using another CPU's entry-stack is a bad idea, no? Especially
> since the owning CPU might have overwritten our content there already.
>
>> On x86_64, we *definitely* can’t schedule in NMI, MCE, or #DB because
>> we’re on a percpu stack. Are you *sure* we need this patch?
>
> I am sure we need this patch, but not 100% sure that we really can
> change CPUs in this path. We are not only talking about NMI, #MC and
> #DB, but also about #GP and every other exception that can happen while
> writing segment registers or on iret. With this implementation we are
> on the safe side for this unlikely slow-path.

Oh, right, exceptions while writing segment regs. IRET is special, though.

But I’m still unconvinced. If any code executed with IRQs enabled on
the entry stack, then that code is terminally buggy. If you’re
executing with IRQs off, you’re not going to get migrated. 64-bit
kernels run on percpu stacks all the time, and it’s not a problem.

IRET errors are genuinely special and, if they’re causing a problem
for you, we should fix them the same way we deal with them on x86_64.

>
> Regards,
>
> Joerg
>
On Sat, Jul 14, 2018 at 07:36:47AM -0700, Andy Lutomirski wrote:
> But I’m still unconvinced. If any code executed with IRQs enabled on
> the entry stack, then that code is terminally buggy. If you’re
> executing with IRQs off, you’re not going to get migrated. 64-bit
> kernels run on percpu stacks all the time, and it’s not a problem.

The code switches to the kernel-stack and kernel-cr3 and just remembers
where it came from (to handle the entry-from-kernel with entry-stack
and/or user-cr3 case). IRQs are disabled in the entry-code path. But
ultimately it calls into C code to handle the exception. And there IRQs
might get enabled again.

> IRET errors are genuinely special and, if they’re causing a problem
> for you, we should fix them the same way we deal with them on x86_64.

Right, IRET is handled differently and doesn't need this patch. But the
segment-writing exceptions do.

If you insist on it I can try to implement the assumption that we don't
get preempted in this code-path. That will save us some cycles for
copying stack contents in this unlikely slow-path. But we definitely need
to handle the entry-from-kernel with entry-stack and/or user-cr3 case
correctly and make a switch to kernel-stack/cr3 because we are going to
call into C-code.

Regards,

	Joerg
On Tue, Jul 17, 2018 at 12:15 AM, Joerg Roedel <jroedel@suse.de> wrote:
> On Sat, Jul 14, 2018 at 07:36:47AM -0700, Andy Lutomirski wrote:
>> But I’m still unconvinced. If any code executed with IRQs enabled on
>> the entry stack, then that code is terminally buggy. If you’re
>> executing with IRQs off, you’re not going to get migrated. 64-bit
>> kernels run on percpu stacks all the time, and it’s not a problem.
>
> The code switches to the kernel-stack and kernel-cr3 and just remembers
> where it came from (to handle the entry-from-kernel with entry-stack
> and/or user-cr3 case). IRQs are disabled in the entry-code path. But
> ultimately it calls into C code to handle the exception. And there IRQs
> might get enabled again.
>
>> IRET errors are genuinely special and, if they’re causing a problem
>> for you, we should fix them the same way we deal with them on x86_64.
>
> Right, IRET is handled differently and doesn't need this patch. But the
> segment-writing exceptions do.
>
> If you insist on it I can try to implement the assumption that we don't
> get preempted in this code-path. That will save us some cycles for
> copying stack contents in this unlikely slow-path. But we definitely need
> to handle the entry-from-kernel with entry-stack and/or user-cr3 case
> correctly and make a switch to kernel-stack/cr3 because we are going to
> call into C-code.
>

Yes, we obviously need to restore the correct cr3. But I really don't
like the code that rewrites the stack frame that we're about to IRET
to, especially when it doesn't seem to serve a purpose. I'd much
rather the code just get its CR3 right and do the IRET and trust that
the frame it's returning to is still there.
On Tue, Jul 17, 2018 at 01:06:11PM -0700, Andy Lutomirski wrote:
> Yes, we obviously need to restore the correct cr3. But I really don't
> like the code that rewrites the stack frame that we're about to IRET
> to, especially when it doesn't seem to serve a purpose. I'd much
> rather the code just get its CR3 right and do the IRET and trust that
> the frame it's returning to is still there.

Okay, I'll give it a try and if it works without the copying we can put
that on-top of this patch-set. This also has the benefit that we can
revert it later if it causes problems down the road.

Regards,

	Joerg
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3d1a114..b3af76e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -299,6 +299,9 @@
  * copied there. So allocate the stack-frame on the task-stack and
  * switch to it before we do any copying.
  */
+
+#define CS_FROM_ENTRY_STACK	(1 << 31)
+
 .macro SWITCH_TO_KERNEL_STACK
 
 	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
@@ -320,6 +323,16 @@
 	/* Load top of task-stack into %edi */
 	movl	TSS_entry_stack(%edi), %edi
 
+	/*
+	 * Clear upper bits of the CS slot in pt_regs in case hardware
+	 * didn't clear it for us
+	 */
+	andl	$(0x0000ffff), PT_CS(%esp)
+
+	/* Special case - entry from kernel mode via entry stack */
+	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
+	jz	.Lentry_from_kernel_\@
+
 	/* Bytes to copy */
 	movl	$PTREGS_SIZE, %ecx
 
@@ -333,8 +346,8 @@
 	 */
 	addl	$(4 * 4), %ecx
 
-.Lcopy_pt_regs_\@:
 #endif
+.Lcopy_pt_regs_\@:
 
 	/* Allocate frame on task-stack */
 	subl	%ecx, %edi
@@ -350,6 +363,56 @@
 	cld
 	rep movsl
 
+	jmp	.Lend_\@
+
+.Lentry_from_kernel_\@:
+
+	/*
+	 * This handles the case when we enter the kernel from
+	 * kernel-mode and %esp points to the entry-stack. When this
+	 * happens we need to switch to the task-stack to run C code,
+	 * but switch back to the entry-stack again when we approach
+	 * iret and return to the interrupted code-path. This usually
+	 * happens when we hit an exception while restoring user-space
+	 * segment registers on the way back to user-space.
+	 *
+	 * When we switch to the task-stack here, we can't trust the
+	 * contents of the entry-stack anymore, as the exception handler
+	 * might be scheduled out or moved to another CPU. Therefore we
+	 * copy the complete entry-stack to the task-stack and set a
+	 * marker in the iret-frame (bit 31 of the CS dword) to detect
+	 * what we've done on the iret path.
+	 *
+	 * On the iret path we copy everything back and switch to the
+	 * entry-stack, so that the interrupted kernel code-path
+	 * continues on the same stack it was interrupted with.
+	 *
+	 * Be aware that an NMI can happen anytime in this code.
+	 *
+	 * %esi: Entry-Stack pointer (same as %esp)
+	 * %edi: Top of the task stack
+	 */
+
+	/* Calculate number of bytes on the entry stack in %ecx */
+	movl	%esi, %ecx
+
+	/* %ecx to the top of entry-stack */
+	andl	$(MASK_entry_stack), %ecx
+	addl	$(SIZEOF_entry_stack), %ecx
+
+	/* Number of bytes on the entry stack to %ecx */
+	sub	%esi, %ecx
+
+	/* Mark stackframe as coming from entry stack */
+	orl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
+
+	/*
+	 * %esi and %edi are unchanged, %ecx contains the number of
+	 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
+	 * the stack-frame on task-stack and copy everything over
+	 */
+	jmp .Lcopy_pt_regs_\@
+
 .Lend_\@:
 .endm
 
@@ -408,6 +471,56 @@
 .endm
 
 /*
+ * This macro handles the case when we return to kernel-mode on the iret
+ * path and have to switch back to the entry stack.
+ *
+ * See the comments below the .Lentry_from_kernel_\@ label in the
+ * SWITCH_TO_KERNEL_STACK macro for more details.
+ */
+.macro PARANOID_EXIT_TO_KERNEL_MODE
+
+	/*
+	 * Test if we entered the kernel with the entry-stack. Most
+	 * likely we did not, because this code only runs on the
+	 * return-to-kernel path.
+	 */
+	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
+	jz	.Lend_\@
+
+	/* Unlikely slow-path */
+
+	/* Clear marker from stack-frame */
+	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
+
+	/* Copy the remaining task-stack contents to entry-stack */
+	movl	%esp, %esi
+	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+
+	/* Bytes on the task-stack to ecx */
+	movl	PER_CPU_VAR(cpu_current_top_of_stack), %ecx
+	subl	%esi, %ecx
+
+	/* Allocate stack-frame on entry-stack */
+	subl	%ecx, %edi
+
+	/*
+	 * Save future stack-pointer, we must not switch until the
+	 * copy is done, otherwise the NMI handler could destroy the
+	 * contents of the task-stack we are about to copy.
+	 */
+	movl	%edi, %ebx
+
+	/* Do the copy */
+	shrl	$2, %ecx
+	cld
+	rep movsl
+
+	/* Safe to switch to entry-stack now */
+	movl	%ebx, %esp
+
+.Lend_\@:
+.endm
+/*
  * %eax: prev task
  * %edx: next task
  */
@@ -769,6 +882,7 @@ restore_all:
 restore_all_kernel:
 	TRACE_IRQS_IRET
+	PARANOID_EXIT_TO_KERNEL_MODE
 	RESTORE_REGS 4
 	jmp	.Lirq_return
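The byte counts for the two copies come from simple pointer arithmetic: on entry the macro masks %esp down to the base of the (size-aligned) entry stack, adds SIZEOF_entry_stack to get the top, and subtracts %esp; on the exit path it subtracts %esp from cpu_current_top_of_stack. Here is a small, self-contained C model of that arithmetic, assuming a size-aligned entry stack and using invented constants in place of SIZEOF_entry_stack/MASK_entry_stack.

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins: the entry stack is assumed to be a fixed-size,
 * size-aligned per-CPU buffer (the real sizes live in the kernel). */
#define ENTRY_STACK_SIZE	0x1000u
#define ENTRY_STACK_MASK	(~(ENTRY_STACK_SIZE - 1))

/*
 * Entry side: the stack grows downwards, so the bytes in use on the entry
 * stack are the distance from %esp up to the top of the entry stack.
 * Mirrors the andl/addl/sub sequence in SWITCH_TO_KERNEL_STACK.
 */
static uint32_t entry_stack_bytes_used(uint32_t esp)
{
	uint32_t top = (esp & ENTRY_STACK_MASK) + ENTRY_STACK_SIZE;

	return top - esp;
}

/*
 * Exit side: bytes still live on the task stack, i.e. the distance from
 * %esp up to the top of the task stack (cpu_current_top_of_stack).
 */
static uint32_t task_stack_bytes_used(uint32_t esp, uint32_t top_of_stack)
{
	return top_of_stack - esp;
}

int main(void)
{
	/* Hypothetical addresses, just to show the arithmetic. */
	uint32_t entry_esp = 0xffffa000u + ENTRY_STACK_SIZE - 0x40u;

	printf("entry stack bytes in use: %u\n",
	       (unsigned)entry_stack_bytes_used(entry_esp));
	printf("task stack bytes in use:  %u\n",
	       (unsigned)task_stack_bytes_used(0xc0001f80u, 0xc0002000u));
	return 0;
}
```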