diff mbox series

[v2] x86/traps: Re-enable interrupts after reading cr2 in the #PF handler

Message ID 20240918130554.97345-1-alejandro.vallejo@cloud.com (mailing list archive)
State Superseded
Headers show
Series [v2] x86/traps: Re-enable interrupts after reading cr2 in the #PF handler | expand

Commit Message

Alejandro Vallejo Sept. 18, 2024, 1:05 p.m. UTC
Moves sti directly after the cr2 read and immediately after the #PF
handler.

While in the area, remove redundant q suffix to a movq in entry.S

Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com>
---
Got lost alongside other patches. Here's the promised v2.

pipeline: https://gitlab.com/xen-project/people/agvallejo/xen/-/pipelines/1458699639
v1: https://lore.kernel.org/xen-devel/20240911145823.12066-1-alejandro.vallejo@cloud.com/

v2:
  * (cosmetic), add whitespace after comma
  * Added ASSERT(local_irq_is_enabled()) to do_page_fault()
  * Only re-enable interrupts if they were enabled in the interrupted
    context.
---
 xen/arch/x86/traps.c        |  8 ++++++++
 xen/arch/x86/x86_64/entry.S | 20 ++++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

Comments

Roger Pau Monné Sept. 20, 2024, 2:12 p.m. UTC | #1
On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
> Moves sti directly after the cr2 read and immediately after the #PF
> handler.

I think you need to add some context about why this is needed, iow:
avoid corrupting %cr2 if a nested 3PF happens.

> While in the area, remove redundant q suffix to a movq in entry.S
> 
> Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com>

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

One nit below.

> ---
> Got lost alongside other patches. Here's the promised v2.
> 
> pipeline: https://gitlab.com/xen-project/people/agvallejo/xen/-/pipelines/1458699639
> v1: https://lore.kernel.org/xen-devel/20240911145823.12066-1-alejandro.vallejo@cloud.com/
> 
> v2:
>   * (cosmetic), add whitespace after comma
>   * Added ASSERT(local_irq_is_enabled()) to do_page_fault()
>   * Only re-enable interrupts if they were enabled in the interrupted
>     context.
> ---
>  xen/arch/x86/traps.c        |  8 ++++++++
>  xen/arch/x86/x86_64/entry.S | 20 ++++++++++++++++----
>  2 files changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
> index 708136f62558..a9c2c607eb08 100644
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -1600,6 +1600,14 @@ void asmlinkage do_page_fault(struct cpu_user_regs *regs)
>  
>      addr = read_cr2();
>  
> +    /*
> +     * Don't re-enable interrupts if we were running an IRQ-off region when
> +     * we hit the page fault, or we'll break that code.
> +     */
> +    ASSERT(!local_irq_is_enabled());
> +    if ( regs->flags & X86_EFLAGS_IF )
> +        local_irq_enable();
> +
>      /* fixup_page_fault() might change regs->error_code, so cache it here. */
>      error_code = regs->error_code;
>  
> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> index b8482de8ee5b..218e5ea85efb 100644
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -844,9 +844,9 @@ handle_exception_saved:
>  #elif !defined(CONFIG_PV)
>          ASSERT_CONTEXT_IS_XEN
>  #endif /* CONFIG_PV */
> -        sti
> -1:      movq  %rsp,%rdi
> -        movzbl UREGS_entry_vector(%rsp),%eax
> +.Ldispatch_handlers:

Maybe 'dispatch_exception', since it's only exceptions that are
handled here? dispatch_handlers seems a bit too generic, but no strong
opinion.

Thanks, Roger.
Alejandro Vallejo Sept. 23, 2024, 10:14 a.m. UTC | #2
On Fri Sep 20, 2024 at 3:12 PM BST, Roger Pau Monné wrote:
> On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
> > Moves sti directly after the cr2 read and immediately after the #PF
> > handler.
>
> I think you need to add some context about why this is needed, iow:
> avoid corrupting %cr2 if a nested 3PF happens.

I can send a v3 with:

```
  Hitting a page fault clobbers %cr2, so if a page fault is handled while
  handling a previous page fault then %cr2 will hold the address of the latter
  fault rather than the former. This patch makes the page fault path delay
  re-enabling IRQs until %cr2 has been read in order to ensure it stays
  consistent.

  Furthermore, the patch preserves the invariant of "IRQs are only re-enabled
  if they were enabled in the interrupted context" in order to not break
  IRQs-off faulting contexts.
```

>
> > While in the area, remove redundant q suffix to a movq in entry.S
> > 
> > Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com>
>
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks

>
> One nit below.
>
> > ---
> > Got lost alongside other patches. Here's the promised v2.
> > 
> > pipeline: https://gitlab.com/xen-project/people/agvallejo/xen/-/pipelines/1458699639
> > v1: https://lore.kernel.org/xen-devel/20240911145823.12066-1-alejandro.vallejo@cloud.com/
> > 
> > v2:
> >   * (cosmetic), add whitespace after comma
> >   * Added ASSERT(local_irq_is_enabled()) to do_page_fault()
> >   * Only re-enable interrupts if they were enabled in the interrupted
> >     context.
> > ---
> >  xen/arch/x86/traps.c        |  8 ++++++++
> >  xen/arch/x86/x86_64/entry.S | 20 ++++++++++++++++----
> >  2 files changed, 24 insertions(+), 4 deletions(-)
> > 
> > diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
> > index 708136f62558..a9c2c607eb08 100644
> > --- a/xen/arch/x86/traps.c
> > +++ b/xen/arch/x86/traps.c
> > @@ -1600,6 +1600,14 @@ void asmlinkage do_page_fault(struct cpu_user_regs *regs)
> >  
> >      addr = read_cr2();
> >  
> > +    /*
> > +     * Don't re-enable interrupts if we were running an IRQ-off region when
> > +     * we hit the page fault, or we'll break that code.
> > +     */
> > +    ASSERT(!local_irq_is_enabled());
> > +    if ( regs->flags & X86_EFLAGS_IF )
> > +        local_irq_enable();
> > +
> >      /* fixup_page_fault() might change regs->error_code, so cache it here. */
> >      error_code = regs->error_code;
> >  
> > diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> > index b8482de8ee5b..218e5ea85efb 100644
> > --- a/xen/arch/x86/x86_64/entry.S
> > +++ b/xen/arch/x86/x86_64/entry.S
> > @@ -844,9 +844,9 @@ handle_exception_saved:
> >  #elif !defined(CONFIG_PV)
> >          ASSERT_CONTEXT_IS_XEN
> >  #endif /* CONFIG_PV */
> > -        sti
> > -1:      movq  %rsp,%rdi
> > -        movzbl UREGS_entry_vector(%rsp),%eax
> > +.Ldispatch_handlers:
>
> Maybe 'dispatch_exception', since it's only exceptions that are
> handled here? dispatch_handlers seems a bit too generic, but no strong
> opinion.

Sure, anything would be better than "1:"

>
> Thanks, Roger.

Cheers,
Alejandro
Jan Beulich Sept. 23, 2024, 1:03 p.m. UTC | #3
On 23.09.2024 12:14, Alejandro Vallejo wrote:
> On Fri Sep 20, 2024 at 3:12 PM BST, Roger Pau Monné wrote:
>> On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
>>> Moves sti directly after the cr2 read and immediately after the #PF
>>> handler.
>>
>> I think you need to add some context about why this is needed, iow:
>> avoid corrupting %cr2 if a nested 3PF happens.
> 
> I can send a v3 with:
> 
> ```
>   Hitting a page fault clobbers %cr2, so if a page fault is handled while
>   handling a previous page fault then %cr2 will hold the address of the latter
>   fault rather than the former. This patch makes the page fault path delay
>   re-enabling IRQs until %cr2 has been read in order to ensure it stays
>   consistent.

And under what conditions would we experience #PF while already processing
an earlier #PF? If an interrupt kicks in, that's not supposed to by raising
any #PF itself. Which isn't to say that the change isn't worthwhile to make,
but it would be nice if it was explicit whether there are active issues, or
whether this is merely to be on the safe side going forward.

>   Furthermore, the patch preserves the invariant of "IRQs are only re-enabled
>   if they were enabled in the interrupted context" in order to not break
>   IRQs-off faulting contexts.

This last part is just stating the obvious then, in that you're not breaking
existing behavior? Seems a little odd to have in this form then.

Jan
Andrew Cooper Sept. 24, 2024, 6:36 p.m. UTC | #4
On 23/09/2024 2:03 pm, Jan Beulich wrote:
> On 23.09.2024 12:14, Alejandro Vallejo wrote:
>> On Fri Sep 20, 2024 at 3:12 PM BST, Roger Pau Monné wrote:
>>> On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
>>>> Moves sti directly after the cr2 read and immediately after the #PF
>>>> handler.
>>> I think you need to add some context about why this is needed, iow:
>>> avoid corrupting %cr2 if a nested 3PF happens.
>> I can send a v3 with:
>>
>> ```
>>   Hitting a page fault clobbers %cr2, so if a page fault is handled while
>>   handling a previous page fault then %cr2 will hold the address of the latter
>>   fault rather than the former. This patch makes the page fault path delay
>>   re-enabling IRQs until %cr2 has been read in order to ensure it stays
>>   consistent.
> And under what conditions would we experience #PF while already processing
> an earlier #PF? If an interrupt kicks in, that's not supposed to by raising
> any #PF itself. Which isn't to say that the change isn't worthwhile to make,
> but it would be nice if it was explicit whether there are active issues, or
> whether this is merely to be on the safe side going forward.

My understanding is that this came from code inspection, not an active
issue.

The same is true for %dr6 and #DB, and MSR_XFD_ERR and #NM.

I think we can safely agree to veto the use of AMX in the #NM handler,
and IST exceptions don't re-enable interrupts[1], so #PF is the only
problem case.

Debug keys happen off the back of plain IRQs, and we can get #PF when
interrogating guest stacks.  Also, I'm far from certain we're safe to
spurious #PF's from updating Xen mappings, so I think there are a bunch
of risky corner cases that we might be exposed to.

And I really need to find some time to get FRED working...

~Andrew

[1] We do re-enable interrupts in order to IPI cpu0 for "clean"
shutdown, and this explodes in our faces if kexec isn't active and we
crashed in the middle of context switch.  We really need to not need
irqs-on in order to shut down.
Jan Beulich Sept. 25, 2024, 6:35 a.m. UTC | #5
On 24.09.2024 20:36, Andrew Cooper wrote:
> On 23/09/2024 2:03 pm, Jan Beulich wrote:
>> On 23.09.2024 12:14, Alejandro Vallejo wrote:
>>> On Fri Sep 20, 2024 at 3:12 PM BST, Roger Pau Monné wrote:
>>>> On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
>>>>> Moves sti directly after the cr2 read and immediately after the #PF
>>>>> handler.
>>>> I think you need to add some context about why this is needed, iow:
>>>> avoid corrupting %cr2 if a nested 3PF happens.
>>> I can send a v3 with:
>>>
>>> ```
>>>   Hitting a page fault clobbers %cr2, so if a page fault is handled while
>>>   handling a previous page fault then %cr2 will hold the address of the latter
>>>   fault rather than the former. This patch makes the page fault path delay
>>>   re-enabling IRQs until %cr2 has been read in order to ensure it stays
>>>   consistent.
>> And under what conditions would we experience #PF while already processing
>> an earlier #PF? If an interrupt kicks in, that's not supposed to by raising
>> any #PF itself. Which isn't to say that the change isn't worthwhile to make,
>> but it would be nice if it was explicit whether there are active issues, or
>> whether this is merely to be on the safe side going forward.
> 
> My understanding is that this came from code inspection, not an active
> issue.
> 
> The same is true for %dr6 and #DB, and MSR_XFD_ERR and #NM.
> 
> I think we can safely agree to veto the use of AMX in the #NM handler,
> and IST exceptions don't re-enable interrupts[1], so #PF is the only
> problem case.
> 
> Debug keys happen off the back of plain IRQs, and we can get #PF when
> interrogating guest stacks.

Hmm, yes, this looks like a case that is actively being fixed here. Wants
mentioning, likely wants a respective Fixes: tag, and then also wants
backporting.

>  Also, I'm far from certain we're safe to
> spurious #PF's from updating Xen mappings, so I think there are a bunch
> of risky corner cases that we might be exposed to.

Spurious #PF are possible, but __page_fault_type() explicitly excludes
the in_irq() case.

Jan
Andrew Cooper Sept. 25, 2024, 7:49 a.m. UTC | #6
On 25/09/2024 7:35 am, Jan Beulich wrote:
> On 24.09.2024 20:36, Andrew Cooper wrote:
>> On 23/09/2024 2:03 pm, Jan Beulich wrote:
>>> On 23.09.2024 12:14, Alejandro Vallejo wrote:
>>>> On Fri Sep 20, 2024 at 3:12 PM BST, Roger Pau Monné wrote:
>>>>> On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
>>>>>> Moves sti directly after the cr2 read and immediately after the #PF
>>>>>> handler.
>>>>> I think you need to add some context about why this is needed, iow:
>>>>> avoid corrupting %cr2 if a nested 3PF happens.
>>>> I can send a v3 with:
>>>>
>>>> ```
>>>>   Hitting a page fault clobbers %cr2, so if a page fault is handled while
>>>>   handling a previous page fault then %cr2 will hold the address of the latter
>>>>   fault rather than the former. This patch makes the page fault path delay
>>>>   re-enabling IRQs until %cr2 has been read in order to ensure it stays
>>>>   consistent.
>>> And under what conditions would we experience #PF while already processing
>>> an earlier #PF? If an interrupt kicks in, that's not supposed to by raising
>>> any #PF itself. Which isn't to say that the change isn't worthwhile to make,
>>> but it would be nice if it was explicit whether there are active issues, or
>>> whether this is merely to be on the safe side going forward.
>> My understanding is that this came from code inspection, not an active
>> issue.
>>
>> The same is true for %dr6 and #DB, and MSR_XFD_ERR and #NM.
>>
>> I think we can safely agree to veto the use of AMX in the #NM handler,
>> and IST exceptions don't re-enable interrupts[1], so #PF is the only
>> problem case.
>>
>> Debug keys happen off the back of plain IRQs, and we can get #PF when
>> interrogating guest stacks.
> Hmm, yes, this looks like a case that is actively being fixed here. Wants
> mentioning, likely wants a respective Fixes: tag, and then also wants
> backporting.

I'd also like to see at least a brief mention of #DB/#NM and why they're
safe.
>>   Also, I'm far from certain we're safe to
>> spurious #PF's from updating Xen mappings, so I think there are a bunch
>> of risky corner cases that we might be exposed to.
> Spurious #PF are possible, but __page_fault_type() explicitly excludes
> the in_irq() case.

Right, but %cr2 is clobbered when the spurious #PF occurs, whatever
__page_fault_type() subsequently decides to do.

~Andrew
Alejandro Vallejo Sept. 27, 2024, 1:41 p.m. UTC | #7
Hi,

On Tue Sep 24, 2024 at 7:36 PM BST, Andrew Cooper wrote:
> On 23/09/2024 2:03 pm, Jan Beulich wrote:
> > On 23.09.2024 12:14, Alejandro Vallejo wrote:
> >> On Fri Sep 20, 2024 at 3:12 PM BST, Roger Pau Monné wrote:
> >>> On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
> >>>> Moves sti directly after the cr2 read and immediately after the #PF
> >>>> handler.
> >>> I think you need to add some context about why this is needed, iow:
> >>> avoid corrupting %cr2 if a nested 3PF happens.
> >> I can send a v3 with:
> >>
> >> ```
> >>   Hitting a page fault clobbers %cr2, so if a page fault is handled while
> >>   handling a previous page fault then %cr2 will hold the address of the latter
> >>   fault rather than the former. This patch makes the page fault path delay
> >>   re-enabling IRQs until %cr2 has been read in order to ensure it stays
> >>   consistent.
> > And under what conditions would we experience #PF while already processing
> > an earlier #PF? If an interrupt kicks in, that's not supposed to by raising
> > any #PF itself. Which isn't to say that the change isn't worthwhile to make,
> > but it would be nice if it was explicit whether there are active issues, or
> > whether this is merely to be on the safe side going forward.
>
> My understanding is that this came from code inspection, not an active
> issue.

That's right. I merely eyeballed it while going through the interrupt dispatch
sequence. This is not a bugfix as much as simply being cautious.

>
> The same is true for %dr6 and #DB, and MSR_XFD_ERR and #NM.
>
> I think we can safely agree to veto the use of AMX in the #NM handler,

Agree.

> and IST exceptions don't re-enable interrupts[1], so #PF is the only
> problem case.
>
> Debug keys happen off the back of plain IRQs, and we can get #PF when

Could you elaborate here on debug keys? Not sure I understand what you mean.

> interrogating guest stacks.  Also, I'm far from certain we're safe to
> spurious #PF's from updating Xen mappings, so I think there are a bunch
> of risky corner cases that we might be exposed to.
>
> And I really need to find some time to get FRED working...
>
> ~Andrew
>
> [1] We do re-enable interrupts in order to IPI cpu0 for "clean"
> shutdown, and this explodes in our faces if kexec isn't active and we
> crashed in the middle of context switch.  We really need to not need
> irqs-on in order to shut down.

Why do we need them currently in that path? Regardless, shutdowns would be the
response to aborts (#MC or #DF) rather than #DB, right?

Cheers,
Alejandro
Alejandro Vallejo Sept. 27, 2024, 2:21 p.m. UTC | #8
On Wed Sep 25, 2024 at 7:35 AM BST, Jan Beulich wrote:
> On 24.09.2024 20:36, Andrew Cooper wrote:
> > On 23/09/2024 2:03 pm, Jan Beulich wrote:
> >> On 23.09.2024 12:14, Alejandro Vallejo wrote:
> >>> On Fri Sep 20, 2024 at 3:12 PM BST, Roger Pau Monné wrote:
> >>>> On Wed, Sep 18, 2024 at 02:05:54PM +0100, Alejandro Vallejo wrote:
> >>>>> Moves sti directly after the cr2 read and immediately after the #PF
> >>>>> handler.
> >>>> I think you need to add some context about why this is needed, iow:
> >>>> avoid corrupting %cr2 if a nested 3PF happens.
> >>> I can send a v3 with:
> >>>
> >>> ```
> >>>   Hitting a page fault clobbers %cr2, so if a page fault is handled while
> >>>   handling a previous page fault then %cr2 will hold the address of the latter
> >>>   fault rather than the former. This patch makes the page fault path delay
> >>>   re-enabling IRQs until %cr2 has been read in order to ensure it stays
> >>>   consistent.
> >> And under what conditions would we experience #PF while already processing
> >> an earlier #PF? If an interrupt kicks in, that's not supposed to by raising
> >> any #PF itself. Which isn't to say that the change isn't worthwhile to make,
> >> but it would be nice if it was explicit whether there are active issues, or
> >> whether this is merely to be on the safe side going forward.
> > 
> > My understanding is that this came from code inspection, not an active
> > issue.
> > 
> > The same is true for %dr6 and #DB, and MSR_XFD_ERR and #NM.
> > 
> > I think we can safely agree to veto the use of AMX in the #NM handler,
> > and IST exceptions don't re-enable interrupts[1], so #PF is the only
> > problem case.
> > 
> > Debug keys happen off the back of plain IRQs, and we can get #PF when
> > interrogating guest stacks.
>
> Hmm, yes, this looks like a case that is actively being fixed here. Wants
> mentioning, likely wants a respective Fixes: tag, and then also wants
> backporting.

Sure.

>
> >  Also, I'm far from certain we're safe to
> > spurious #PF's from updating Xen mappings, so I think there are a bunch
> > of risky corner cases that we might be exposed to.
>
> Spurious #PF are possible, but __page_fault_type() explicitly excludes
> the in_irq() case.
>
> Jan

Cheers,
Alejandro
diff mbox series

Patch

diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 708136f62558..a9c2c607eb08 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1600,6 +1600,14 @@  void asmlinkage do_page_fault(struct cpu_user_regs *regs)
 
     addr = read_cr2();
 
+    /*
+     * Don't re-enable interrupts if we were running an IRQ-off region when
+     * we hit the page fault, or we'll break that code.
+     */
+    ASSERT(!local_irq_is_enabled());
+    if ( regs->flags & X86_EFLAGS_IF )
+        local_irq_enable();
+
     /* fixup_page_fault() might change regs->error_code, so cache it here. */
     error_code = regs->error_code;
 
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index b8482de8ee5b..218e5ea85efb 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -844,9 +844,9 @@  handle_exception_saved:
 #elif !defined(CONFIG_PV)
         ASSERT_CONTEXT_IS_XEN
 #endif /* CONFIG_PV */
-        sti
-1:      movq  %rsp,%rdi
-        movzbl UREGS_entry_vector(%rsp),%eax
+.Ldispatch_handlers:
+        mov   %rsp, %rdi
+        movzbl UREGS_entry_vector(%rsp), %eax
 #ifdef CONFIG_PERF_COUNTERS
         lea   per_cpu__perfcounters(%rip), %rcx
         add   STACK_CPUINFO_FIELD(per_cpu_offset)(%r14), %rcx
@@ -866,7 +866,19 @@  handle_exception_saved:
         jmp   .L_exn_dispatch_done;    \
 .L_ ## vec ## _done:
 
+        /*
+         * IRQs kept off to derisk being hit by a nested interrupt before
+         * reading %cr2. Otherwise a page fault in the nested interrupt handler
+         * would corrupt %cr2.
+         */
         DISPATCH(X86_EXC_PF, do_page_fault)
+
+        /* Only re-enable IRQs if they were active before taking the fault */
+        testb $X86_EFLAGS_IF >> 8, UREGS_eflags + 1(%rsp)
+        jz    1f
+        sti
+1:
+
         DISPATCH(X86_EXC_GP, do_general_protection)
         DISPATCH(X86_EXC_UD, do_invalid_op)
         DISPATCH(X86_EXC_NM, do_device_not_available)
@@ -911,7 +923,7 @@  exception_with_ints_disabled:
         movq  %rsp,%rdi
         call  search_pre_exception_table
         testq %rax,%rax                 # no fixup code for faulting EIP?
-        jz    1b
+        jz    .Ldispatch_handlers
         movq  %rax,UREGS_rip(%rsp)      # fixup regular stack
 
 #ifdef CONFIG_XEN_SHSTK