[v9,10/12] x86/mm: do targeted broadcast flushing from tlbbatch code

Message ID 20250206044346.3810242-11-riel@surriel.com (mailing list archive)
State New
Series AMD broadcast TLB invalidation

Commit Message

Rik van Riel Feb. 6, 2025, 4:43 a.m. UTC
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.

This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.

Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
 arch/x86/include/asm/invlpgb.h  | 21 +++++----
 arch/x86/include/asm/tlbflush.h | 17 ++++---
 arch/x86/mm/tlb.c               | 80 +++++++++++++++++++++++++++++++--
 3 files changed, 95 insertions(+), 23 deletions(-)

Comments

Brendan Jackman Feb. 10, 2025, 3:27 p.m. UTC | #1
On Thu, 6 Feb 2025 at 05:46, Rik van Riel <riel@surriel.com> wrote:
>  /* Wait for INVLPGB originated by this CPU to complete. */
> -static inline void tlbsync(void)
> +static inline void __tlbsync(void)
>  {
> -       cant_migrate();

Why does this have to go away?

> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 234277a5ef89..bf167e215e8e 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -106,6 +106,7 @@ struct tlb_state {
>          * need to be invalidated.
>          */
>         bool invalidate_other;
> +       bool need_tlbsync;

The ifdeffery is missing here.

> +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
> +                                               unsigned long addr,
> +                                               u16 nr, bool pmd_stride)
> +{
> +       __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
> +       if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
> +               this_cpu_write(cpu_tlbstate.need_tlbsync, true);

Why do we need the conditional here?

I always thought code like this was about avoiding cacheline
contention, but given this is a percpu variable that is really only of
interest to the present CPU, is this worthwhile?

I guess it might be sharing a cacheline with some other percpu that is
more shared?

Anyway, not a big deal, I'm mostly asking for my own education.

> @@ -794,6 +825,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
>         if (IS_ENABLED(CONFIG_PROVE_LOCKING))
>                 WARN_ON_ONCE(!irqs_disabled());
>
> +       tlbsync();
> +
>         /*
>          * Verify that CR3 is what we think it is.  This will catch
>          * hypothetical buggy code that directly switches to swapper_pg_dir
> @@ -973,6 +1006,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
>   */
>  void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
>  {
> +       tlbsync();
> +

I have a feeling I'll look stupid for asking this, but why do we need
this and the one in switch_mm_irqs_off()?

> @@ -1661,12 +1694,53 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>                 local_irq_enable();
>         }
>
> +       /*
> +        * If we issued (asynchronous) INVLPGB flushes, wait for them here.
> +        * The cpumask above contains only CPUs that were running tasks
> +        * not using broadcast TLB flushing.
> +        */
> +       if (cpu_feature_enabled(X86_FEATURE_INVLPGB))

It feels wrong that we check the cpufeature here but not in e.g.
enter_lazy_tlb().

> +               tlbsync();
> +
>         cpumask_clear(&batch->cpumask);
>
>         put_flush_tlb_info();
>         put_cpu();
>  }
>
> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> +                                            struct mm_struct *mm,
> +                                            unsigned long uaddr)
> +{
> +       u16 asid = mm_global_asid(mm);
> +
> +       if (asid) {
> +               invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> +               /* Do any CPUs supporting INVLPGB need PTI? */

Not today, but

1. I don't think avoiding static_cpu_has() is worth the cost of making
the reader take that logical step.

2.  AFAIK a new CPU bug never led to enabling KPTI on a CPU that
didn't have it before, and I think that would be a pretty dystopian
future (and hopefully by then we'll have ASI instead...). But we can't
really rule it out.
Rik van Riel Feb. 11, 2025, 3:45 a.m. UTC | #2
On Mon, 2025-02-10 at 16:27 +0100, Brendan Jackman wrote:
> On Thu, 6 Feb 2025 at 05:46, Rik van Riel <riel@surriel.com> wrote:
> >  /* Wait for INVLPGB originated by this CPU to complete. */
> > -static inline void tlbsync(void)
> > +static inline void __tlbsync(void)
> >  {
> > -       cant_migrate();
> 
> Why does this have to go away?

I'm not sure the current task in sched_init() has
all the correct bits set to prevent the warning
from firing, but on the flip side it won't have
called INVLPGB yet at that point, so the call to
enter_lazy_tlb() won't actually end up here.

I'll put it back.
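
For reference, putting it back just means keeping the check in the
renamed helper, along the lines of:

/* Wait for INVLPGB originated by this CPU to complete. */
static inline void __tlbsync(void)
{
	cant_migrate();
	/* TLBSYNC: supported in binutils >= 0.36. */
	asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
}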

> 
> > diff --git a/arch/x86/include/asm/tlbflush.h
> > b/arch/x86/include/asm/tlbflush.h
> > index 234277a5ef89..bf167e215e8e 100644
> > --- a/arch/x86/include/asm/tlbflush.h
> > +++ b/arch/x86/include/asm/tlbflush.h
> > @@ -106,6 +106,7 @@ struct tlb_state {
> >          * need to be invalidated.
> >          */
> >         bool invalidate_other;
> > +       bool need_tlbsync;
> 
> The ifdeffery is missing here.

Added.
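
Something along these lines (sketch only; assuming the flag sits under
the same CONFIG_X86_BROADCAST_TLB_FLUSH guard as the rest of the
broadcast flush state):

#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
	/*
	 * An asynchronous INVLPGB was issued from this CPU and has not
	 * yet been waited for with TLBSYNC.
	 */
	bool need_tlbsync;
#endif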

> 
> > @@ -794,6 +825,8 @@ void switch_mm_irqs_off(struct mm_struct
> > *unused, struct mm_struct *next,
> >         if (IS_ENABLED(CONFIG_PROVE_LOCKING))
> >                 WARN_ON_ONCE(!irqs_disabled());
> > 
> > +       tlbsync();
> > +
> >         /*
> >          * Verify that CR3 is what we think it is.  This will catch
> >          * hypothetical buggy code that directly switches to
> > swapper_pg_dir
> > @@ -973,6 +1006,8 @@ void switch_mm_irqs_off(struct mm_struct
> > *unused, struct mm_struct *next,
> >   */
> >  void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
> >  {
> > +       tlbsync();
> > +
> 
> I have a feeling I'll look stupid for asking this, but why do we need
> this and the one in switch_mm_irqs_off()?

This is an architectural thing: TLBSYNC waits for
the INVLPGB flushes to finish that were issued
from the same CPU.

That means if we have pending flushes (from the
pageout code), we need to wait for them at context
switch time, before the task could potentially be
migrated to another CPU.
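
As an illustration of the window being closed (not part of the patch):

/*
 *   CPU 0 (task A)                        CPU 1
 *   --------------                        -----
 *   reclaim: INVLPGB (async, no TLBSYNC)
 *   context switch, A migrates  ------>   A resumes
 *                                         arch_tlbbatch_flush(): TLBSYNC
 *                                         only waits for CPU 1's INVLPGBs
 *
 * The tlbsync() calls added to switch_mm_irqs_off() and enter_lazy_tlb()
 * run on CPU 0, before the task can be migrated, so pending flushes are
 * always waited for on the CPU that issued them.
 */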

> 
> > @@ -1661,12 +1694,53 @@ void arch_tlbbatch_flush(struct
> > arch_tlbflush_unmap_batch *batch)
> >                 local_irq_enable();
> >         }
> > 
> > +       /*
> > +        * If we issued (asynchronous) INVLPGB flushes, wait for
> > them here.
> > +        * The cpumask above contains only CPUs that were running
> > tasks
> > +        * not using broadcast TLB flushing.
> > +        */
> > +       if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
> 
> It feels wrong that we check the cpufeature here but not in e.g.
> enter_lazy_tlb().
> 
> > +               tlbsync();
> > +
We no longer need to check it here, with the change
to tlbsync. Good catch.
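
Since tlbsync() now returns early when no flushes are pending on this
CPU, that hunk can become just:

	/*
	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
	 * The cpumask above contains only CPUs that were running tasks
	 * not using broadcast TLB flushing.
	 */
	tlbsync();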
Brendan Jackman Feb. 11, 2025, 10:02 a.m. UTC | #3
On Tue, 11 Feb 2025 at 04:50, Rik van Riel <riel@surriel.com> wrote:
>
> On Mon, 2025-02-10 at 16:27 +0100, Brendan Jackman wrote:
> > On Thu, 6 Feb 2025 at 05:46, Rik van Riel <riel@surriel.com> wrote:
> > >  /* Wait for INVLPGB originated by this CPU to complete. */
> > > -static inline void tlbsync(void)
> > > +static inline void __tlbsync(void)
> > >  {
> > > -       cant_migrate();
> >
> > Why does this have to go away?
>
> I'm not sure the current task in sched_init() has
> all the correct bits set to prevent the warning
> from firing, but on the flip side it won't have
> called INVLPGB yet at that point, so the call to
> enter_lazy_tlb() won't actually end up here.
>
> I'll put it back.

Sounds good.

FWIW I think if we do run into early-boot code hitting false
DEBUG_ATOMIC_SLEEP warnings, the best response might be to update the
DEBUG_ATOMIC_SLEEP code. Like maybe there's a more targeted solution
but something roughly equivalent to checking if (system_state ==
SYSTEM_STATE_SCHEDULING) before the warning.
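
For illustration only (a sketch, not actual kernel code; the enum value
in the kernel is spelled SYSTEM_SCHEDULING), the kind of early-return
that could sit in front of the DEBUG_ATOMIC_SLEEP warning:

	/* Don't warn before the scheduler is far enough along in boot. */
	if (system_state < SYSTEM_SCHEDULING)
		return;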

> > > @@ -794,6 +825,8 @@ void switch_mm_irqs_off(struct mm_struct
> > > *unused, struct mm_struct *next,
> > >         if (IS_ENABLED(CONFIG_PROVE_LOCKING))
> > >                 WARN_ON_ONCE(!irqs_disabled());
> > >
> > > +       tlbsync();
> > > +
> > >         /*
> > >          * Verify that CR3 is what we think it is.  This will catch
> > >          * hypothetical buggy code that directly switches to
> > > swapper_pg_dir
> > > @@ -973,6 +1006,8 @@ void switch_mm_irqs_off(struct mm_struct
> > > *unused, struct mm_struct *next,
> > >   */
> > >  void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
> > >  {
> > > +       tlbsync();
> > > +
> >
> > I have a feeling I'll look stupid for asking this, but why do we need
> > this and the one in switch_mm_irqs_off()?
>
> This is an architectural thing: TLBSYNC waits for
> the INVLPGB flushes to finish that were issued
> from the same CPU.
>
> That means if we have pending flushes (from the
> pageout code), we need to wait for them at context
> switch time, before the task could potentially be
> migrated to another CPU.

Oh right thanks, that makes sense.

So I think here we're encoding the assumption that context_switch()
always calls either enter_lazy_tlb() or switch_mm_irqs_off(), which is
a little awkward, plus the job of these functions is already kinda
hazy and this makes it even hazier. What about doing it in
arch_start_context_switch() instead?

That would mean a bit of plumbing since we'd still wanna have the
tlbsync() in tlb.c, but that seems worth it to me. Plus, having it in
one place would give us a spot to add a comment. Now that you point it
out it does indeed seem obvious but it didn't seem so yesterday.

Now I think about it... if we always tlbsync() before a context
switch, is the cant_migrate() above actually required? I think with
that, even if we migrated in the middle of e.g.
broadcast_kernel_range_flush(), we'd be fine? (At least, from the
specific perspective of the invlpgb code, presumably having preemption
on there would break things horribly in other ways).
Rik van Riel Feb. 11, 2025, 8:21 p.m. UTC | #4
On Tue, 2025-02-11 at 11:02 +0100, Brendan Jackman wrote:
> 
> So I think here we're encoding the assumption that context_switch()
> always calls either enter_lazy_tlb() or switch_mm_irqs_off(), which
> is
> a little awkward, plus the job of these functions is already kinda
> hazy and this makes it even hazier. What about doing it in
> arch_start_context_switch() instead?
> 
> That would mean a bit of plumbing since we'd still wanna have the
> tlbsync() in tlb.c, but that seems worth it to me. Plus, having it in
> one place would give us a spot to add a comment. Now that you point
> it
> out it does indeed seem obvious but it didn't seem so yesterday.
> 
While that would be a little bit cleaner to maintain,
in theory, I'm not convinced that adding an extra
function call to the context switch path is worthwhile
for that small maintenance benefit.

I'd rather add more comments than an extra function call :)

> Now I think about it... if we always tlbsync() before a context
> switch, is the cant_migrate() above actually required?

Probably not, but I guess it won't hurt?

I'm running tests right now on a kernel with a bunch
of debug options enabled, and I'm getting all sorts
of warnings, but not this one :)
Brendan Jackman Feb. 12, 2025, 10:38 a.m. UTC | #5
On Tue, 11 Feb 2025 at 21:21, Rik van Riel <riel@surriel.com> wrote:
>
> On Tue, 2025-02-11 at 11:02 +0100, Brendan Jackman wrote:
> >
> > So I think here we're encoding the assumption that context_switch()
> > always calls either enter_lazy_tlb() or switch_mm_irqs_off(), which
> > is
> > a little awkward, plus the job of these functions is already kinda
> > hazy and this makes it even hazier. What about doing it in
> > arch_start_context_switch() instead?
> >
> > That would mean a bit of plumbing since we'd still wanna have the
> > tlbsync() in tlb.c, but that seems worth it to me. Plus, having it in
> > one place would give us a spot to add a comment. Now that you point
> > it
> > out it does indeed seem obvious but it didn't seem so yesterday.
> >
> While that would be a little bit cleaner to maintain,
> in theory, I'm not convinced that adding an extra
> function call to the context switch path is worthwhile
> for that small maintenance benefit.

I don't see why it has to introduce a function call, can't we just
have something like:

static inline void arch_start_context_switch(struct task_struct *prev)
{
    arch_paravirt_start_context_switch(prev);
    tlb_start_context_switch(prev);
}

The paravirt one would disappear when !CONFIG_PARAVIRT (as the current
arch_start_context_switch() does) and the tlb one would disappear when
!CONFIG_X86_BROADCAST_TLB_FLUSH. The whole thing should be inlinable.
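
And the tlb.c side could follow the same stub pattern this series
already uses for tlbsync() (hypothetical names, just to sketch the
plumbing):

#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
extern void tlb_start_context_switch(struct task_struct *prev);
#else
static inline void tlb_start_context_switch(struct task_struct *prev)
{
}
#endif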

Patch

diff --git a/arch/x86/include/asm/invlpgb.h b/arch/x86/include/asm/invlpgb.h
index a1d5dedd5217..357e3cc417e4 100644
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -31,9 +31,8 @@  static inline void __invlpgb(unsigned long asid, unsigned long pcid,
 }
 
 /* Wait for INVLPGB originated by this CPU to complete. */
-static inline void tlbsync(void)
+static inline void __tlbsync(void)
 {
-	cant_migrate();
 	/* TLBSYNC: supported in binutils >= 0.36. */
 	asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
 }
@@ -61,19 +60,19 @@  static inline void invlpgb_flush_user(unsigned long pcid,
 				      unsigned long addr)
 {
 	__invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
-	tlbsync();
+	__tlbsync();
 }
 
-static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
-						unsigned long addr,
-						u16 nr,
-						bool pmd_stride)
+static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
+						  unsigned long addr,
+						  u16 nr,
+						  bool pmd_stride)
 {
 	__invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
 }
 
 /* Flush all mappings for a given PCID, not including globals. */
-static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
 {
 	__invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
 }
@@ -82,11 +81,11 @@  static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
 static inline void invlpgb_flush_all(void)
 {
 	__invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
-	tlbsync();
+	__tlbsync();
 }
 
 /* Flush addr, including globals, for all PCIDs. */
-static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+static inline void __invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
 {
 	__invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
 }
@@ -95,7 +94,7 @@  static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
 static inline void invlpgb_flush_all_nonglobals(void)
 {
 	__invlpgb(0, 0, 0, 0, 0, 0);
-	tlbsync();
+	__tlbsync();
 }
 
 #endif /* _ASM_X86_INVLPGB */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 234277a5ef89..bf167e215e8e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -106,6 +106,7 @@  struct tlb_state {
 	 * need to be invalidated.
 	 */
 	bool invalidate_other;
+	bool need_tlbsync;
 
 #ifdef CONFIG_ADDRESS_MASKING
 	/*
@@ -310,6 +311,10 @@  static inline void broadcast_tlb_flush(struct flush_tlb_info *info)
 static inline void consider_global_asid(struct mm_struct *mm)
 {
 }
+
+static inline void tlbsync(void)
+{
+}
 #endif
 
 #ifdef CONFIG_PARAVIRT
@@ -359,21 +364,15 @@  static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-					     struct mm_struct *mm,
-					     unsigned long uaddr)
-{
-	inc_mm_tlb_gen(mm);
-	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
-	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
 static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
 {
 	flush_tlb_mm(mm);
 }
 
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr);
 
 static inline bool pte_flags_need_flush(unsigned long oldflags,
 					unsigned long newflags,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 05390f0e6cb0..4253c3efd7e4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -488,6 +488,37 @@  static void finish_asid_transition(struct flush_tlb_info *info)
 	WRITE_ONCE(mm->context.asid_transition, false);
 }
 
+static inline void tlbsync(void)
+{
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		return;
+	__tlbsync();
+	this_cpu_write(cpu_tlbstate.need_tlbsync, false);
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+						unsigned long addr,
+						u16 nr, bool pmd_stride)
+{
+	__invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+	__invlpgb_flush_single_pcid_nosync(pcid);
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+	__invlpgb_flush_addr_nosync(addr, nr);
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
 static void broadcast_tlb_flush(struct flush_tlb_info *info)
 {
 	bool pmd = info->stride_shift == PMD_SHIFT;
@@ -794,6 +825,8 @@  void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
 		WARN_ON_ONCE(!irqs_disabled());
 
+	tlbsync();
+
 	/*
 	 * Verify that CR3 is what we think it is.  This will catch
 	 * hypothetical buggy code that directly switches to swapper_pg_dir
@@ -973,6 +1006,8 @@  void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
  */
 void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
+	tlbsync();
+
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
@@ -1650,9 +1685,7 @@  void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	 * a local TLB flush is needed. Optimize this use-case by calling
 	 * flush_tlb_func_local() directly in this case.
 	 */
-	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
-		invlpgb_flush_all_nonglobals();
-	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
 		flush_tlb_multi(&batch->cpumask, info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
@@ -1661,12 +1694,53 @@  void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		local_irq_enable();
 	}
 
+	/*
+	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+	 * The cpumask above contains only CPUs that were running tasks
+	 * not using broadcast TLB flushing.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		tlbsync();
+
 	cpumask_clear(&batch->cpumask);
 
 	put_flush_tlb_info();
 	put_cpu();
 }
 
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr)
+{
+	u16 asid = mm_global_asid(mm);
+
+	if (asid) {
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (static_cpu_has(X86_FEATURE_PTI))
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+		/*
+		 * Some CPUs might still be using a local ASID for this
+		 * process, and require IPIs, while others are using the
+		 * global ASID.
+		 *
+		 * In this corner case we need to do both the broadcast
+		 * TLB invalidation, and send IPIs. The IPIs will help
+		 * stragglers transition to the broadcast ASID.
+		 */
+		if (in_asid_transition(mm))
+			asid = 0;
+	}
+
+	if (!asid) {
+		inc_mm_tlb_gen(mm);
+		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	}
+
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
 /*
  * Blindly accessing user memory from NMI context can be dangerous
  * if we're in the middle of switching the current user task or