
[3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE

Message ID 20180822154046.823850812@infradead.org (mailing list archive)
State New, archived
Series x86: TLB invalidate fixes

Commit Message

Peter Zijlstra Aug. 22, 2018, 3:30 p.m. UTC
Jann reported that x86 was missing required TLB invalidates when he
hit the !*batch slow path in tlb_remove_table().

This is indeed the case; RCU_TABLE_FREE does not provide TLB (cache)
invalidates, the PowerPC-hash where this code originated and the
Sparc-hash where this was subsequently used did not need that. ARM
which later used this put an explicit TLB invalidate in their
__p*_free_tlb() functions, and PowerPC-radix followed that example.

But when we hooked up x86 we failed to consider this. Fix this by
(optionally) hooking tlb_remove_table() into the TLB invalidate code.

NOTE: s390 also needed something like this and might now
      be able to use the generic code again.

Cc: stable@kernel.org
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Fixes: 9e52fc2b50de ("x86/mm: Enable RCU based page table freeing (CONFIG_HAVE_RCU_TABLE_FREE=y)")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/Kconfig     |    3 +++
 arch/x86/Kconfig |    1 +
 mm/memory.c      |   27 +++++++++++++++++++++++++--
 3 files changed, 29 insertions(+), 2 deletions(-)
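
A minimal sketch of the idea, since the mm/memory.c hunk itself is not
reproduced in this view: assuming the helpers that already exist in
mm/memory.c of this era (struct mmu_table_batch, MAX_TABLE_BATCH,
tlb_table_flush(), tlb_remove_table_one()) plus the tlb_table_invalidate()
helper quoted in comment #1 below, the !*batch slow path gains an
invalidate before the synchronous RCU-sched free:

/* Sketch only -- not the literal patch body. */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			/*
			 * Batch allocation failed: this is the !*batch slow
			 * path Jann hit. Invalidate the TLB (page-table
			 * caches) before the RCU-sched wait and the
			 * synchronous free of the table page.
			 */
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tbl[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}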

Comments

Peter Zijlstra Aug. 22, 2018, 3:55 p.m. UTC | #1
On Wed, Aug 22, 2018 at 05:30:15PM +0200, Peter Zijlstra wrote:
> ARM
> which later used this put an explicit TLB invalidate in their
> __p*_free_tlb() functions, and PowerPC-radix followed that example.

> +/*
> + * If we want tlb_remove_table() to imply TLB invalidates.
> + */
> +static inline void tlb_table_invalidate(struct mmu_gather *tlb)
> +{
> +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
> +	/*
> +	 * Invalidate page-table caches used by hardware walkers. Then we still
> +	 * need to RCU-sched wait while freeing the pages because software
> +	 * walkers can still be in-flight.
> +	 */
> +	__tlb_flush_mmu_tlbonly(tlb);
> +#endif
> +}


Nick, Will is already looking at using this to remove the synchronous
invalidation from __p*_free_tlb() for ARM, could you have a look to see
if PowerPC-radix could benefit from that too?

Basically, using a patch like the below, would give your tlb_flush()
information on if tables were removed or not.

---

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -96,12 +96,22 @@ struct mmu_gather {
 #endif
 	unsigned long		start;
 	unsigned long		end;
-	/* we are in the middle of an operation to clear
-	 * a full mm and can make some optimizations */
-	unsigned int		fullmm : 1,
-	/* we have performed an operation which
-	 * requires a complete flush of the tlb */
-				need_flush_all : 1;
+	/*
+	 * we are in the middle of an operation to clear
+	 * a full mm and can make some optimizations
+	 */
+	unsigned int		fullmm : 1;
+
+	/*
+	 * we have performed an operation which
+	 * requires a complete flush of the tlb
+	 */
+	unsigned int		need_flush_all : 1;
+
+	/*
+	 * we have removed page directories
+	 */
+	unsigned int		freed_tables : 1;
 
 	struct mmu_gather_batch *active;
 	struct mmu_gather_batch	local;
@@ -136,6 +146,7 @@ static inline void __tlb_reset_range(str
 		tlb->start = TASK_SIZE;
 		tlb->end = 0;
 	}
+	tlb->freed_tables = 0;
 }
 
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
@@ -269,6 +280,7 @@ static inline void tlb_remove_check_page
 #define pte_free_tlb(tlb, ptep, address)			\
 	do {							\
 		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__pte_free_tlb(tlb, ptep, address);		\
 	} while (0)
 #endif
@@ -276,7 +288,8 @@ static inline void tlb_remove_check_page
 #ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__pmd_free_tlb(tlb, pmdp, address);		\
 	} while (0)
 #endif
@@ -286,6 +299,7 @@ static inline void tlb_remove_check_page
 #define pud_free_tlb(tlb, pudp, address)			\
 	do {							\
 		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__pud_free_tlb(tlb, pudp, address);		\
 	} while (0)
 #endif
@@ -295,7 +309,8 @@ static inline void tlb_remove_check_page
 #ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__p4d_free_tlb(tlb, pudp, address);		\
 	} while (0)
 #endif
Linus Torvalds Aug. 22, 2018, 9:34 p.m. UTC | #2
On Wed, Aug 22, 2018 at 8:46 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -180,6 +180,7 @@ config X86
>         select HAVE_PERF_REGS
>         select HAVE_PERF_USER_STACK_DUMP
>         select HAVE_RCU_TABLE_FREE
> +       select HAVE_RCU_TABLE_INVALIDATE        if HAVE_RCU_TABLE_FREE

This is confusing. First you select HAVE_RCU_TABLE_FREE
unconditionally, and then you select HAVE_RCU_TABLE_INVALIDATE based
on that unconditional variable.

I can see why you do it, but that's because I see the next patch. On
its own it just looks like you have a drinking problem.

That said, I was waiting to see if this patch-set would get any
comments before applying it, but it's been mostly crickets... So I
think I'll just apply it and get this issue over and done with.

              Linus
Nicholas Piggin Aug. 23, 2018, 3:45 a.m. UTC | #3
On Wed, 22 Aug 2018 17:55:27 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Aug 22, 2018 at 05:30:15PM +0200, Peter Zijlstra wrote:
> > ARM
> > which later used this put an explicit TLB invalidate in their
> > __p*_free_tlb() functions, and PowerPC-radix followed that example.  
> 
> > +/*
> > + * If we want tlb_remove_table() to imply TLB invalidates.
> > + */
> > +static inline void tlb_table_invalidate(struct mmu_gather *tlb)
> > +{
> > +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
> > +	/*
> > +	 * Invalidate page-table caches used by hardware walkers. Then we still
> > +	 * need to RCU-sched wait while freeing the pages because software
> > +	 * walkers can still be in-flight.
> > +	 */
> > +	__tlb_flush_mmu_tlbonly(tlb);
> > +#endif
> > +}  
> 
> 
> Nick, Will is already looking at using this to remove the synchronous
> invalidation from __p*_free_tlb() for ARM, could you have a look to see
> if PowerPC-radix could benefit from that too?

powerpc/radix has no such issue, it already does this tracking.

We were discussing this a couple of months ago, I wasn't aware of ARM's
issue but I suggested x86 could go the same way as powerpc. Would have
been good to get some feedback on some of the proposed approaches there,
because it's not just pwc tracking: if you do this you don't need
those silly hacks in generic code to expand the TLB address range
either.

So powerpc has no fundamental problem with making this stuff generic.
If you need a fix for x86 and ARM for this merge window though, I would
suggest just copying what powerpc already has. Next time we can
consider consolidating them all into generic code.

Thanks,
Nick


> 
> Basically, using a patch like the below, would give your tlb_flush()
> information on if tables were removed or not.
> 
> ---
> 
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -96,12 +96,22 @@ struct mmu_gather {
>  #endif
>  	unsigned long		start;
>  	unsigned long		end;
> -	/* we are in the middle of an operation to clear
> -	 * a full mm and can make some optimizations */
> -	unsigned int		fullmm : 1,
> -	/* we have performed an operation which
> -	 * requires a complete flush of the tlb */
> -				need_flush_all : 1;
> +	/*
> +	 * we are in the middle of an operation to clear
> +	 * a full mm and can make some optimizations
> +	 */
> +	unsigned int		fullmm : 1;
> +
> +	/*
> +	 * we have performed an operation which
> +	 * requires a complete flush of the tlb
> +	 */
> +	unsigned int		need_flush_all : 1;
> +
> +	/*
> +	 * we have removed page directories
> +	 */
> +	unsigned int		freed_tables : 1;
>  
>  	struct mmu_gather_batch *active;
>  	struct mmu_gather_batch	local;
> @@ -136,6 +146,7 @@ static inline void __tlb_reset_range(str
>  		tlb->start = TASK_SIZE;
>  		tlb->end = 0;
>  	}
> +	tlb->freed_tables = 0;
>  }
>  
>  static inline void tlb_remove_page_size(struct mmu_gather *tlb,
> @@ -269,6 +280,7 @@ static inline void tlb_remove_check_page
>  #define pte_free_tlb(tlb, ptep, address)			\
>  	do {							\
>  		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__pte_free_tlb(tlb, ptep, address);		\
>  	} while (0)
>  #endif
> @@ -276,7 +288,8 @@ static inline void tlb_remove_check_page
>  #ifndef pmd_free_tlb
>  #define pmd_free_tlb(tlb, pmdp, address)			\
>  	do {							\
> -		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
> +		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__pmd_free_tlb(tlb, pmdp, address);		\
>  	} while (0)
>  #endif
> @@ -286,6 +299,7 @@ static inline void tlb_remove_check_page
>  #define pud_free_tlb(tlb, pudp, address)			\
>  	do {							\
>  		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__pud_free_tlb(tlb, pudp, address);		\
>  	} while (0)
>  #endif
> @@ -295,7 +309,8 @@ static inline void tlb_remove_check_page
>  #ifndef p4d_free_tlb
>  #define p4d_free_tlb(tlb, pudp, address)			\
>  	do {							\
> -		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
> +		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__p4d_free_tlb(tlb, pudp, address);		\
>  	} while (0)
>  #endif
Linus Torvalds Aug. 23, 2018, 3:59 a.m. UTC | #4
On Wed, Aug 22, 2018 at 8:45 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> powerpc/radix has no such issue, it already does this tracking.

Yeah, I now realize that this was why you wanted to add that hacky
thing to the generic code, so that you can add the tlb_flush_pgtable()
call.

I thought it was because powerpc had some special flush instruction
for it, and the regular tlb flush didn't do it. But no. It was because
the regular code had lost the tlb flush _entirely_, because powerpc
didn't want it.

> We were discussing this a couple of months ago, I wasn't aware of ARM's
> issue but I suggested x86 could go the same way as powerpc.

The problem is that x86 _used_ to do this all correctly long long ago.

And then we switched over to the "generic" table flushing (which
harkens back to the powerpc code).

Which actually turned out to be not generic at all, and did not flush
the internal pages like x86 used to (back when x86 just used
tlb_remove_page for everything).

So as a result, x86 had unintentionally lost the TLB flush we used to
have, because tlb_remove_table() had lost the tlb flushing because of
a powerpc quirk.

You then added it back as a hacky per-architecture hook (apparently
having realized that you never did it at all), which didn't fix the
unintentional lack of flushing on x86.

So now we're going to do it right.  No more "oh, powerpc didn't need
to flush because the hash tables weren't in the tlb at all" thing in
the generic code that then others need to work around.

              Linus
Nicholas Piggin Aug. 23, 2018, 4:33 a.m. UTC | #5
On Wed, 22 Aug 2018 20:59:46 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 8:45 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > powerpc/radix has no such issue, it already does this tracking.  
> 
> Yeah, I now realize that this was why you wanted to add that hacky
> thing to the generic code, so that you can add the tlb_flush_pgtable()
> call.
> 
> I thought it was because powerpc had some special flush instruction
> for it, and the regular tlb flush didn't do it. But no.

powerpc/radix does have a special instruction for it, that is why I
posted the patch :)

> It was because
> the regular code had lost the tlb flush _entirely_, because powerpc
> didn't want it.

I think that was long before I started looking at the code.
powerpc/hash hardware has no idea about the page tables so yeah they
don't need it.

> 
> > We were discussing this a couple of months ago, I wasn't aware of ARM's
> > issue but I suggested x86 could go the same way as powerpc.  
> 
> The problem is that x86 _used_ to do this all correctly long long ago.
> 
> And then we switched over to the "generic" table flushing (which
> harkens back to the powerpc code).
> 
> Which actually turned out to be not generic at all, and did not flush
> the internal pages like x86 used to (back when x86 just used
> tlb_remove_page for everything).
> 
> So as a result, x86 had unintentionally lost the TLB flush we used to
> have, because tlb_remove_table() had lost the tlb flushing because of
> a powerpc quirk.
> 
> You then added it back as a hacky per-architecture hook (apparently
> having realized that you never did it at all), which didn't fix the

I think it was quite well understood and fixed here, a145abf12c9 but
again that was before I really started looking at it.

The hooks I added recently are for a different reason, and it's
actually the opposite problem -- to work around the hacky generic code
that x86 foisted on other archs.

> unintentional lack of flushing on x86.
> 
> So now we're going to do it right.  No more "oh, powerpc didn't need
> to flush because the hash tables weren't in the tlb at all" thing in
> the generic code that then others need to work around.

I don't really understand what issue you have with powerpc here.
powerpc hash has the page table flushing accessors which are just
no-ops, it's the generic code that fails to call them properly. Surely
there was no powerpc patch that removed those calls from generic code?

powerpc/radix yes it does some arch specific things to do its page walk
cache flushing, but it is a better design than the hacks x86 has in
generic code, surely. I thought you basically agreed and thought x86 /
generic code could move to that kind of model.

Thanks,
Nick
Benjamin Herrenschmidt Aug. 23, 2018, 4:54 a.m. UTC | #6
On Wed, 2018-08-22 at 20:59 -0700, Linus Torvalds wrote:
> On Wed, Aug 22, 2018 at 8:45 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> > 
> > powerpc/radix has no such issue, it already does this tracking.
> 
> Yeah, I now realize that this was why you wanted to add that hacky
> thing to the generic code, so that you can add the tlb_flush_pgtable()
> call.
> 
> I thought it was because powerpc had some special flush instruction
> for it, and the regular tlb flush didn't do it. But no. It was because
> the regular code had lost the tlb flush _entirely_, because powerpc
> didn't want it.

Heh :-) Well, back on hash we didn't (we do now with Radix) but I
wouldn't blame us for the generic code being broken ... the RCU table
freeing was in arch/powerpc at the time :-) I don't think it was us
making it generic :)

> > We were discussing this a couple of months ago, I wasn't aware of ARM's
> > issue but I suggested x86 could go the same way as powerpc.
> 
> The problem is that x86 _used_ to do this all correctly long long ago.
> 
> And then we switched over to the "generic" table flushing (which
> harkens back to the powerpc code).

Yes, we wrote the RCU stuff to solve the races with SW walking,
which is completely orthogonal with HW walking & TLB content. We didn't
do the move to generic code though ;-)

> Which actually turned out to be not generic at all, and did not flush
> the internal pages like x86 used to (back when x86 just used
> tlb_remove_page for everything).

Well, having RCU do the flushing is rather generic, it makes sense
whenever there's somebody doing a SW walk *and* you don't have IPIs to
synchronize your flushes (ie, anybody with HW TLB invalidation
broadcast basically, so ARM and us).

> So as a result, x86 had unintentionally lost the TLB flush we used to
> have, because tlb_remove_table() had lost the tlb flushing because of
> a powerpc quirk.

This is a somewhat odd way of putting the "blame" :-) But yeah ok...

> You then added it back as a hacky per-architecture hook (apparently
> having realized that you never did it at all), which didn't fix the
> unintentional lack of flushing on x86.
> 
> So now we're going to do it right.  No more "oh, powerpc didn't need
> to flush because the hash tables weren't in the tlb at all" thing in
> the generic code that then others need to work around.

So we do need a different flush instruction for the page tables vs. the
normal TLB pages. 

Cheers,
Ben.
Linus Torvalds Aug. 23, 2018, 5:03 a.m. UTC | #7
On Wed, Aug 22, 2018 at 9:33 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> I think it was quite well understood and fixed here, a145abf12c9 but
> again that was before I really started looking at it.

You don't understand the problem.

All the x86 people thought WE ALREADY DID THAT.

Because we had done this all correctly over a decade ago!

Nobody realized that it had been screwed up by the powerpc code, and
the commit you point to was believed to be a new *powerpc* only issue,
because the semantics on powerpc has changed because of the radix
tree.

The semantics on x86 have never changed, they've always been the same.
So why would the x86 people react to powerpc doing something that x86
had already always done.

See?

Nobody cared one whit about commit a145abf12c9, because it just
handles a new powerpc-specific case.

> I don't really understand what the issue you have with powerpc here.
> powerpc hash has the page table flushing accessors which are just
> no-ops, it's the generic code that fails to call them properly. Surely
> there was no powerpc patch that removed those calls from generic code?

Yes there was.

Look where the generic code *came* from.

It's powerpc code.

See commit 267239116987 ("mm, powerpc: move the RCU page-table freeing
into generic code").

The powerpc code was made into the generic code, because the powerpc
code had to handle all those special RCU freeing things etc that
others didn't.

It's just that when x86 was then switched over to use the "generic"
code, people didn't realize that the generic code didn't do the TLB
invalidations for page tables, because they hadn't been needed on
powerpc.

So the powerpc code that was made generic, never really was. The new
"generic" code had a powerpc-specific quirk.

That then very subtly broke x86, without the x86 people ever
realizing. Because the old simple non-RCU x86 code had never had that
issue, it just treated the leaf pages and the directory pages exactly
the same.

See?

And THAT is why I talk about the powerpc code. Because what is
"generic" code in 4.18 (and several releases before) oisn't actually
generic.

And that's basically exactly the bug that the patches from PeterZ is
fixing. Making the "tlb_remove_table()" code always flush the tlb, the
way it should have when it was made generic.

              Linus
Linus Torvalds Aug. 23, 2018, 5:11 a.m. UTC | #8
On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:
>
>
> So we do need a different flush instruction for the page tables vs. the
> normal TLB pages.

Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
invalidates all the internal tlb cache nodes.

So the "new world order" is exactly that patch that PeterZ sent you, that adds a

+       unsigned int            freed_tables : 1;

to the 'struct mmu_gather', and then makes all those
pte/pmd/pud/p4d_free_tlb() functions set that bit.

So I'm referring to the email PeterZ sent you in this thread that said:

  Nick, Will is already looking at using this to remove the synchronous
  invalidation from __p*_free_tlb() for ARM, could you have a look to see
  if PowerPC-radix could benefit from that too?

  Basically, using a patch like the below, would give your tlb_flush()
  information on if tables were removed or not.

then, in that model, you do *not* need to override these
pte/pmd/pud/p4d_free_tlb() macros at all (well, you *can* if you want
to, for doing games with the range modification, but let's sayt that
you don't need that right now).

So instead, when you get to the actual "tlb_flush(tlb)", you do
exactly that - flush the tlb. And the mmu_gather structure shows you
how much you need to flush. If you see that "freed_tables" is set,
then you know that you need to also do the special instruction to
flush the inner level caches. The range continues to show the page
range.

           Linus
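
To make the model above concrete: a hypothetical arch-side tlb_flush()
consuming the new bit could look roughly like the sketch below. The
arch_flush_* primitives are placeholders for whatever range and
page-walk-cache invalidation the architecture actually provides, not real
kernel functions.

/* Hypothetical consumer of mmu_gather::freed_tables; illustration only. */
static inline void tlb_flush(struct mmu_gather *tlb)
{
	if (tlb->fullmm || tlb->need_flush_all) {
		arch_flush_tlb_mm(tlb->mm);			/* placeholder */
		return;
	}

	/* Invalidate the leaf translations covering the gathered range. */
	arch_flush_tlb_range(tlb->mm, tlb->start, tlb->end);	/* placeholder */

	/*
	 * Page directories were freed, so the page-walk cache must go too.
	 * On x86 the range flush above already implies this; ARM and
	 * powerpc/radix need an extra instruction, which is what the
	 * freed_tables bit tells them.
	 */
	if (tlb->freed_tables)
		arch_flush_page_walk_cache(tlb->mm);		/* placeholder */
}
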
Linus Torvalds Aug. 23, 2018, 5:20 a.m. UTC | #9
On Wed, Aug 22, 2018 at 10:11 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So instead, when you get to the actual "tlb_flush(tlb)", you do
> exactly that - flush the tlb. And the mmu_gather structure shows you
> how much you need to flush. If you see that "freed_tables" is set,
> then you know that you need to also do the special instruction to
> flush the inner level caches. The range continues to show the page
> range.

Note that this obviously works fine for a hashed table model too - you
just ignore the "freed_tables" bit entirely and continue to do
whatever you always did.

And we can ignore it on x86 too, because we just see the range, and we
invalidate the range, and that will always invalidate the mid-level
caching too.

So the new bit is literally for arm and powerpc-radix (and maybe
s390), but we want to make the actual VM interface truly generic and
not have one set of code with five different behaviors (which we
_currently_ kind of have, what with all the HAVE_RCU_TABLE_FREE etc.
config options that modify how the code works).

It would be good to also cut down on the millions of functions that
each architecture can override, because Christ, it got very confusing
at times to follow just how the code flowed from generic code to
architecture-specific macros back to generic code and then
arch-specific inline helper functions.

It's a maze of underscores and "page" vs "table", and "flush" vs "remove" etc.

But that "it would be good to really make everybody to use as much of
the generic code as possible" and everybody have the same pattern,
that's a future thing. But the whole "let's just add that
"freed_tables" thing would be part of trying to get people to use the
same overall pattern, even if some architectures might not care about
that detail.

              Linus
Benjamin Herrenschmidt Aug. 23, 2018, 5:21 a.m. UTC | #10
On Wed, 2018-08-22 at 22:11 -0700, Linus Torvalds wrote:
> On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:
> > 
> > 
> > So we do need a different flush instruction for the page tables vs. the
> > normal TLB pages.
> 
> Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
> invalidates all the internal tlb cache nodes.
> 
> So the "new world order" is exactly that patch that PeterZ sent you, that adds a
> 
> +       unsigned int            freed_tables : 1;
> 

 .../...

> So instead, when you get to the actual "tlb_flush(tlb)", you do
> exactly that - flush the tlb. And the mmu_gather structure shows you
> how much you need to flush. If you see that "freed_tables" is set,
> then you know that you need to also do the special instruction to
> flush the inner level caches. The range continues to show the page
> range.

Yup. That looks like a generic version of the "need_flush_all" flag we
have, which is fine by us.

Just don't blame powerpc for all the historical crap :-)

Cheers,
Ben.

> 
>            Linus
Nicholas Piggin Aug. 23, 2018, 5:58 a.m. UTC | #11
On Wed, 22 Aug 2018 22:03:40 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 9:33 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > I think it was quite well understood and fixed here, a145abf12c9 but
> > again that was before I really started looking at it.  
> 
> You don't understand the problem.

More fundamentally, I think I didn't understand this fix; I think
powerpc/radix actually does have a bug here. a145abf12c9 was really
just a replacement for x86's hack of expanding the TLB invalidation
range when freeing page tables to capture the page walk cache
(powerpc/radix needs a different instruction, so that didn't work for us).

But I hadn't really looked at this fix closely, rather at Peter's follow-up
post about making the powerpc page walk cache flushing design a generic
concept.

My point in this reply was more that my patches from the other month
weren't a blundering attempt to fix this bug without realising it; they
were purely about avoiding the x86 TLB range expanding hack (which won't
be needed if generic users all move over).

> 
> All the x86 people thought WE ALREADY DID THAT.
> 
> Because we had done this all correctly over a decade ago!
> 
> Nobody realized that it had been screwed up by the powerpc code, and

The powerpc/hash code is not screwed up though AFAIKS. You can't
take arch specific code and slap a "generic" label on it, least of all
the crazy powerpc/hash code, you of all people would agree with that :)

> the commit you point to was believed to be a new *powerpc* only issue,
> because the semantics on powerpc has changed because of the radix
> tree.
> 
> The semantics on x86 have never changed, they've always been the same.
> So why would the x86 people react to powerpc doing something that x86
> had already always done.
> 
> See?
> 
> Nobody cared one whit about commit a145abf12c9, because it just
> handles a new powerpc-specific case.
> 
> > I don't really understand what issue you have with powerpc here.
> > powerpc hash has the page table flushing accessors which are just
> > no-ops, it's the generic code that fails to call them properly. Surely
> > there was no powerpc patch that removed those calls from generic code?  
> 
> Yes there was.
> 
> Look where the generic code *came* from.
> 
> It's powerpc code.
> 
> See commit 267239116987 ("mm, powerpc: move the RCU page-table freeing
> into generic code").
> 
> The powerpc code was made into the generic code, because the powerpc
> code had to handle all those special RCU freeing things etc that
> others didn't.
> 
> It's just that when x86 was then switched over to use the "generic"
> code, people didn't realize that the generic code didn't do the TLB
> invalidations for page tables, because they hadn't been needed on
> powerpc.

Sure, there was a minor bug in the port. Not that it was a closely
guarded secret that powerpc didn't flush page table pages, but it's a
relatively subtle issue in complex code. That happens.

> 
> So the powerpc code that was made generic, never really was. The new
> "generic" code had a powerpc-specific quirk.
> 
> That then very subtly broke x86, without the x86 people ever
> realizing. Because the old simple non-RCU x86 code had never had that
> issue, it just treated the leaf pages and the directory pages exactly
> the same.
> 
> See?
> 
> And THAT is why I talk about the powerpc code. Because what is
> "generic" code in 4.18 (and several releases before) oisn't actually
> generic.
> 
> And that's basically exactly the bug that the patches from PeterZ is
> fixing. Making the "tlb_remove_table()" code always flush the tlb, the
> way it should have when it was made generic.

It just sounded like you were blaming correct powerpc/hash code for
this. It's just a minor bug in taking that code into generic, not really
a big deal, right? Or are you saying powerpc devs or code could be doing
something better to play nicer with the rest of the archs?

Honestly trying to improve things here, and encouraged by x86 and ARM
looking to move over to a saner page walk cache tracking design and
sharing more code with powerpc/radix. I would help with reviewing
things or writing code or porting powerpc bits if I can.

Thanks,
Nick
Nicholas Piggin Aug. 23, 2018, 6:15 a.m. UTC | #12
On Thu, 23 Aug 2018 15:21:30 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2018-08-22 at 22:11 -0700, Linus Torvalds wrote:
> > On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:  
> > > 
> > > 
> > > So we do need a different flush instruction for the page tables vs. the
> > > normal TLB pages.  
> > 
> > Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
> > invalidates all the internal tlb cache nodes.
> > 
> > So the "new world order" is exactly that patch that PeterZ sent you, that adds a
> > 
> > +       unsigned int            freed_tables : 1;
> >   
> 
>  .../...
> 
> > So instead, when you get to the actual "tlb_flush(tlb)", you do
> > exactly that - flush the tlb. And the mmu_gather structure shows you
> > how much you need to flush. If you see that "freed_tables" is set,
> > then you know that you need to also do the special instruction to
> > flush the inner level caches. The range continues to show the page
> > range.  
> 
> Yup. That looks like a generic version of the "need_flush_all" flag we
> have, which is fine by us.
> 
> Just don't blame powerpc for all the historical crap :-)

And yes we very much want to remove the x86 hacks from generic code and
have them use the sane powerpc/radix page walk cache flushing model.
That would indeed allow us to stop overriding those macros and start
sharing more code with other archs. We can help write or review code to
make sure bugs don't creep in when moving it to generic implementation.
It will be much more relevant to us now because radix is very similar to
x86.

The hack is not the powerpc override macros though, let's be clear
about that. Even x86 will be helped out by removing that crap because
it won't have to do a full TLB flush caused by massive TLB range if it
frees 0..small number of pages that happen to also free some page
tables.

Thanks,
Nick
Martin Schwidefsky Aug. 23, 2018, 6:48 a.m. UTC | #13
On Wed, 22 Aug 2018 22:20:32 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 10:11 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > So instead, when you get to the actual "tlb_flush(tlb)", you do
> > exactly that - flush the tlb. And the mmu_gather structure shows you
> > how much you need to flush. If you see that "freed_tables" is set,
> > then you know that you need to also do the special instruction to
> > flush the inner level caches. The range continues to show the page
> > range.  
> 
> Note that this obviously works fine for a hashed table model too - you
> just ignore the "freed_tables" bit entirely and continue to do
> whatever you always did.
> 
> And we can ignore it on x86 too, because we just see the range, and we
> invalidate the range, and that will always invalidate the mid-level
> caching too.
> 
> So the new bit is literally for arm and powerpc-radix (and maybe
> s390), but we want to make the actual VM interface truly generic and
> not have one set of code with five different behaviors (which we
> _currently_ kind of have, what with all the HAVE_RCU_TABLE_FREE etc.
> config options that modify how the code works).
> 
> It would be good to also cut down on the millions of functions that
> each architecture can override, because Christ, it got very confusing
> at times to follow just how the code flowed from generic code to
> architecture-specific macros back to generic code and then
> arch-specific inline helper functions.
> 
> It's a maze of underscores and "page" vs "table", and "flush" vs "remove" etc.
> 
> But that "it would be good to really make everybody to use as much of
> the generic code as possible" and everybody have the same pattern,
> that's a future thing. But the whole "let's just add that
> "freed_tables" thing would be part of trying to get people to use the
> same overall pattern, even if some architectures might not care about
> that detail.

For s390 the new freed_tables bit looks to be a step in the right
direction. Right now there is the flush_mm bit in the mm->context
that I use to keep track of the need for a global flush of the mm.
The flush_mm bit is currently used for both lazy PTE flushing and
TLB flushes for freed page table pages.

I'll look into it once the generic patch is merged. The switch to
the generic mmu_gather is certainly not a change that can be done
in a hurry; we had too many subtle TLB flush issues in the past.
Nicholas Piggin Aug. 23, 2018, 8:46 a.m. UTC | #14
On Wed, 22 Aug 2018 17:30:15 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Jann reported that x86 was missing required TLB invalidates when he
> hit the !*batch slow path in tlb_remove_table().
> 
> This is indeed the case; RCU_TABLE_FREE does not provide TLB (cache)
> invalidates, the PowerPC-hash where this code originated and the
> Sparc-hash where this was subsequently used did not need that. ARM
> which later used this put an explicit TLB invalidate in their
> __p*_free_tlb() functions, and PowerPC-radix followed that example.

So this is interesting: I _think_ a145abf12c did fix this bug for
powerpc, but then it seems to have been re-broken by a46cc7a90f,
because that one defers the flush back to tlb_flush time. There
was quite a lot of churn getting the radix MMU off the ground at
that point, though, so I'm not 100% sure.

But AFAIKS powerpc today has this same breakage, and this patch
should fix it.

I have a couple of patches that touch the same code which I'll send;
you might have some opinions on them.

Thanks,
Nick
Will Deacon Aug. 23, 2018, 1:39 p.m. UTC | #15
On Wed, Aug 22, 2018 at 10:11:41PM -0700, Linus Torvalds wrote:
> On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:
> >
> >
> > So we do need a different flush instruction for the page tables vs. the
> > normal TLB pages.
> 
> Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
> invalidates all the internal tlb cache nodes.
> 
> So the "new world order" is exactly that patch that PeterZ sent you, that adds a
> 
> +       unsigned int            freed_tables : 1;
> 
> to the 'struct mmu_gather', and then makes all those
> pte/pmd/pud/p4d_free_tlb() functions set that bit.
> 
> So I'm referring to the email PeterZ sent you in this thread that said:
> 
>   Nick, Will is already looking at using this to remove the synchronous
>   invalidation from __p*_free_tlb() for ARM, could you have a look to see
>   if PowerPC-radix could benefit from that too?
> 
>   Basically, using a patch like the below, would give your tlb_flush()
>   information on if tables were removed or not.
> 
> then, in that model, you do *not* need to override these
> pte/pmd/pud/p4d_free_tlb() macros at all (well, you *can* if you want
> to, for doing games with the range modification, but let's sayt that
> you don't need that right now).
> 
> So instead, when you get to the actual "tlb_flush(tlb)", you do
> exactly that - flush the tlb. And the mmu_gather structure shows you
> how much you need to flush. If you see that "freed_tables" is set,
> then you know that you need to also do the special instruction to
> flush the inner level caches. The range continues to show the page
> range.

The only problem with this approach is that we've lost track of the granule
size by the point we get to the tlb_flush(), so we can't adjust the stride of
the TLB invalidations for huge mappings, which actually works nicely in the
synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
rather than iterating over it at a 4k granule).

One thing we could do is switch to synchronous mode if we detect a change in
granule (i.e. treat it like a batch failure).

Will
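
For context on what the "stride" buys: with the granule known, a single
invalidation can cover a whole huge mapping instead of walking it at a 4k
granule. A purely illustrative sketch, with placeholder names rather than
the actual arm64 code:

/* Illustration only: why the granule matters for the final invalidate. */
static void flush_range_by_stride(struct mm_struct *mm,
				  unsigned long start, unsigned long end,
				  unsigned long stride)
{
	unsigned long addr;

	/*
	 * With stride == 2M one iteration covers a whole huge mapping;
	 * once the granule is unknown we must assume 4k and issue 512
	 * invalidations for the same range.
	 */
	for (addr = start; addr < end; addr += stride)
		arch_invalidate_page(mm, addr);		/* placeholder */
}
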
Will Deacon Aug. 23, 2018, 11:31 p.m. UTC | #16
On Wed, Aug 22, 2018 at 05:55:27PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 22, 2018 at 05:30:15PM +0200, Peter Zijlstra wrote:
> > ARM
> > which later used this put an explicit TLB invalidate in their
> > __p*_free_tlb() functions, and PowerPC-radix followed that example.
> 
> > +/*
> > + * If we want tlb_remove_table() to imply TLB invalidates.
> > + */
> > +static inline void tlb_table_invalidate(struct mmu_gather *tlb)
> > +{
> > +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
> > +	/*
> > +	 * Invalidate page-table caches used by hardware walkers. Then we still
> > +	 * need to RCU-sched wait while freeing the pages because software
> > +	 * walkers can still be in-flight.
> > +	 */
> > +	__tlb_flush_mmu_tlbonly(tlb);
> > +#endif
> > +}
> 
> 
> Nick, Will is already looking at using this to remove the synchronous
> invalidation from __p*_free_tlb() for ARM, could you have a look to see
> if PowerPC-radix could benefit from that too?
> 
> Basically, using a patch like the below, would give your tlb_flush()
> information on if tables were removed or not.

Just to say that I have something up and running for arm64 based on this.
I'll post it once it's seen a bit more testing (either tomorrow or Monday).

Will
Peter Zijlstra Aug. 24, 2018, 8:35 a.m. UTC | #17
On Thu, Aug 23, 2018 at 02:54:20PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-08-22 at 20:59 -0700, Linus Torvalds wrote:

> > The problem is that x86 _used_ to do this all correctly long long ago.
> > 
> > And then we switched over to the "generic" table flushing (which
> > harkens back to the powerpc code).
> 
> Yes, we wrote the RCU stuff to solve the races with SW walking,
> which is completely orthogonal with HW walking & TLB content. We didn't
> do the move to generic code though ;-)
> 
> > Which actually turned out to be not generic at all, and did not flush
> > the internal pages like x86 used to (back when x86 just used
> > tlb_remove_page for everything).
> 
> Well, having RCU do the flushing is rather generic, it makes sense
> whenever there's somebody doing a SW walk *and* you don't have IPIs to
> synchronize your flushes (ie, anybody with HW TLB invalidation
> broadcast basically, so ARM and us).

Right, so (many many years ago) I moved it over to generic code because
Sparc-hash wanted fast_gup and I figured having multiple copies of this
stuff wasn't ideal.

Then ARM came along and used it because it does the invalidate
broadcast.

And then when we switched x86 over last year or so; because paravirt; I
had long since forgotten all details and completely overlooked this.

Worse; somewhere along the line we tried to get s390 on this and they
ran into the exact problem being fixed now. That _should_ have been a
big clue, but somehow I never got around to thinking about it properly
and they went back to a private copy of all this.

So double fail on me I suppose :-/

Anyway, its sorted now; although I'd like to write me a fairly big
comment in asm-generic/tlb.h about things, before I forget again.
Peter Zijlstra Aug. 24, 2018, 8:47 a.m. UTC | #18
On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> The only problem with this approach is that we've lost track of the granule
> size by the point we get to the tlb_flush(), so we can't adjust the stride of
> the TLB invalidations for huge mappings, which actually works nicely in the
> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> rather than iterating over it at a 4k granule).
> 
> One thing we could do is switch to synchronous mode if we detect a change in
> granule (i.e. treat it like a batch failure).

We could use tlb_start_vma() to track that, I think. Shouldn't be too
hard.
Peter Zijlstra Aug. 24, 2018, 11:32 a.m. UTC | #19
On Fri, Aug 24, 2018 at 10:47:17AM +0200, Peter Zijlstra wrote:
> On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> > The only problem with this approach is that we've lost track of the granule
> > size by the point we get to the tlb_flush(), so we can't adjust the stride of
> > the TLB invalidations for huge mappings, which actually works nicely in the
> > synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> > rather than iterating over it at a 4k granule).
> > 
> > One thing we could do is switch to synchronous mode if we detect a change in
> > granule (i.e. treat it like a batch failure).
> 
> We could use tlb_start_vma() to track that, I think. Shouldn't be too
> hard.

Hurm.. look at commit:

  e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")

yuck yuck yuck. That needs fixing.
Peter Zijlstra Aug. 24, 2018, 11:39 a.m. UTC | #20
On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 24, 2018 at 10:47:17AM +0200, Peter Zijlstra wrote:
> > On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> > > The only problem with this approach is that we've lost track of the granule
> > > size by the point we get to the tlb_flush(), so we can't adjust the stride of
> > > the TLB invalidations for huge mappings, which actually works nicely in the
> > > synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> > > rather than iterating over it at a 4k granule).
> > > 
> > > One thing we could do is switch to synchronous mode if we detect a change in
> > > granule (i.e. treat it like a batch failure).
> > 
> > We could use tlb_start_vma() to track that, I think. Shouldn't be too
> > hard.
> 
> Hurm.. look at commit:
> 
>   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")

Ah, good, it seems that already got cleaned up a lot. But it all moved
into the power code.. blergh.
Peter Zijlstra Aug. 24, 2018, 1:13 p.m. UTC | #21
On Fri, Aug 24, 2018 at 10:35:56AM +0200, Peter Zijlstra wrote:

> Anyway, its sorted now; although I'd like to write me a fairly big
> comment in asm-generic/tlb.h about things, before I forget again.

How's something like so? There's a little page_size thingy in this;
mostly because I couldn't be arsed to split it for now.

Will has opinions on the page_size thing; I'll let him explain.

---

 arch/Kconfig                   |   3 +
 arch/arm/include/asm/tlb.h     |   3 +-
 arch/ia64/include/asm/tlb.h    |   3 +-
 arch/powerpc/Kconfig           |   1 +
 arch/powerpc/include/asm/tlb.h |  17 ------
 arch/s390/include/asm/tlb.h    |   4 +-
 arch/sh/include/asm/tlb.h      |   4 +-
 arch/um/include/asm/tlb.h      |   4 +-
 include/asm-generic/tlb.h      | 130 ++++++++++++++++++++++++++++++++++++-----
 mm/huge_memory.c               |   4 +-
 mm/hugetlb.c                   |   2 +-
 mm/madvise.c                   |   2 +-
 mm/memory.c                    |   9 ++-
 13 files changed, 137 insertions(+), 49 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 6801123932a5..053c44703539 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -365,6 +365,9 @@ config HAVE_RCU_TABLE_FREE
 config HAVE_RCU_TABLE_INVALIDATE
 	bool
 
+config HAVE_MMU_GATHER_PAGE_SIZE
+	bool
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index f854148c8d7c..d644c3c7c6f3 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -286,8 +286,7 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr
 
 #define tlb_migrate_finish(mm)		do { } while (0)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+static inline void tlb_change_page_size(struct mmu_gather *tlb,
 						     unsigned int page_size)
 {
 }
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 516355a774bf..bf8985f5f876 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -282,8 +282,7 @@ do {							\
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	tlb_remove_tlb_entry(tlb, ptep, address)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+static inline void tlb_change_page_size(struct mmu_gather *tlb,
 						     unsigned int page_size)
 {
 }
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index db0b6eebbfa5..4db1072868f7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -217,6 +217,7 @@ config PPC
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
+	select HAVE_MMU_GATHER_PAGE_SIZE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index f0e571b2dc7c..b29a67137acf 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -27,7 +27,6 @@
 #define tlb_start_vma(tlb, vma)	do { } while (0)
 #define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry	__tlb_remove_tlb_entry
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
 
 extern void tlb_flush(struct mmu_gather *tlb);
 
@@ -46,22 +45,6 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
 #endif
 }
 
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
-{
-	if (!tlb->page_size)
-		tlb->page_size = page_size;
-	else if (tlb->page_size != page_size) {
-		if (!tlb->fullmm)
-			tlb_flush_mmu(tlb);
-		/*
-		 * update the page size after flush for the new
-		 * mmu_gather.
-		 */
-		tlb->page_size = page_size;
-	}
-}
-
 #ifdef CONFIG_SMP
 static inline int mm_is_core_local(struct mm_struct *mm)
 {
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 457b7ba0fbb6..cf3d64313740 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -180,9 +180,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	tlb_remove_tlb_entry(tlb, ptep, address)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
+static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size)
 {
 }
 
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 77abe192fb43..af7c9d891cf8 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -127,9 +127,7 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 	return tlb_remove_page(tlb, page);
 }
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
+static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size)
 {
 }
 
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index dce6db147f24..6463f3ab1767 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -146,9 +146,7 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	tlb_remove_tlb_entry(tlb, ptep, address)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
+static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size)
 {
 }
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b3353e21f3b3..d3573ba10068 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -20,6 +20,108 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
+/*
+ * Generic MMU-gather implementation.
+ *
+ * The mmu_gather data structure is used by the mm code to implement the
+ * correct and efficient ordering of freeing pages and TLB invalidations.
+ *
+ * This correct ordering is:
+ *
+ *  1) unhook page
+ *  2) TLB invalidate page
+ *  3) free page
+ *
+ * That is, we must never free a page before we have ensured there are no live
+ * translations left to it. Otherwise it might be possible to observe (or
+ * worse, change) the page content after it has been reused.
+ *
+ * The mmu_gather API consists of:
+ *
+ *  - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather
+ *
+ *    Finish in particular will issue a (final) TLB invalidate and free
+ *    all (remaining) queued pages.
+ *
+ *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
+ *
+ *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
+ *    there's large holes between the VMAs.
+ *
+ *  - tlb_remove_page() / __tlb_remove_page()
+ *  - tlb_remove_page_size() / __tlb_remove_page_size()
+ *
+ *    __tlb_remove_page_size() is the basic primitive that queues a page for
+ *    freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
+ *    boolean indicating if the queue is (now) full and a call to
+ *    tlb_flush_mmu() is required.
+ *
+ *    tlb_remove_page() and tlb_remove_page_size() imply the call to
+ *    tlb_flush_mmu() when required and has no return value.
+ *
+ *  - tlb_change_page_size()
+ *
+ *    call before __tlb_remove_page*() to set the current page-size; implies a
+ *    possible tlb_flush_mmu() call.
+ *
+ *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() / tlb_flush_mmu_free()
+ *
+ *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
+ *                              related state, like the range)
+ *
+ *    tlb_flush_mmu_free() - frees the queued pages; make absolutely
+ *			     sure no additional tlb_remove_page()
+ *			     calls happen between _tlbonly() and this.
+ *
+ *    tlb_flush_mmu() - the above two calls.
+ *
+ *  - mmu_gather::fullmm
+ *
+ *    A flag set by tlb_gather_mmu() to indicate we're going to free
+ *    the entire mm; this allows a number of optimizations.
+ *
+ *    XXX list optimizations
+ *
+ *  - mmu_gather::need_flush_all
+ *
+ *    A flag that can be set by the arch code if it wants to force
+ *    flush the entire TLB irrespective of the range. For instance
+ *    x86-PAE needs this when changing top-level entries.
+ *
+ * And requires the architecture to provide and implement tlb_flush().
+ *
+ * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make
+ * use of:
+ *
+ *  - mmu_gather::start / mmu_gather::end
+ *
+ *    which (when !need_flush_all; fullmm will have start = end = ~0UL) provides
+ *    the range that needs to be flushed to cover the pages to be freed.
+ *
+ * Additionally there are a few opt-in features:
+ *
+ *  HAVE_MMU_GATHER_PAGE_SIZE
+ *
+ *  This ensures we call tlb_flush() every time tlb_change_page_size() actually
+ *  changes the size and provides mmu_gather::page_size to tlb_flush().
+ *
+ *  HAVE_RCU_TABLE_FREE
+ *
+ *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
+ *  for page directores (__p*_free_tlb()). This provides separate freeing of
+ *  the page-table pages themselves in a semi-RCU fashion (see comment below).
+ *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
+ *  and therefore doesn't naturally serialize with software page-table walkers.
+ *
+ *  HAVE_RCU_TABLE_INVALIDATE
+ *
+ *  This makes HAVE_RCU_TABLE_FREE call tlb_flush_mmu_tlbonly() before freeing
+ *  the page-table pages. Required if you use HAVE_RCU_TABLE_FREE and your
+ *  architecture uses the Linux page-tables natively.
+ *
+ */
+#define HAVE_GENERIC_MMU_GATHER
+
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 /*
  * Semi RCU freeing of the page directories.
@@ -87,14 +189,17 @@ struct mmu_gather_batch {
  */
 #define MAX_GATHER_BATCH_COUNT	(10000UL/MAX_GATHER_BATCH)
 
-/* struct mmu_gather is an opaque type used by the mm code for passing around
+/*
+ * struct mmu_gather is an opaque type used by the mm code for passing around
  * any data needed by arch specific code for tlb_remove_page.
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
+
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	struct mmu_table_batch	*batch;
 #endif
+
 	unsigned long		start;
 	unsigned long		end;
 	/* we are in the middle of an operation to clear
@@ -103,15 +208,17 @@ struct mmu_gather {
 	/* we have performed an operation which
 	 * requires a complete flush of the tlb */
 				need_flush_all : 1;
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+	unsigned int page_size;
+#endif
 
 	struct mmu_gather_batch *active;
 	struct mmu_gather_batch	local;
 	struct page		*__pages[MMU_GATHER_BUNDLE];
 	unsigned int		batch_count;
-	int page_size;
+
 };
 
-#define HAVE_GENERIC_MMU_GATHER
 
 void arch_tlb_gather_mmu(struct mmu_gather *tlb,
 	struct mm_struct *mm, unsigned long start, unsigned long end);
@@ -170,21 +277,18 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	return tlb_remove_page_size(tlb, page, PAGE_SIZE);
 }
 
-#ifndef tlb_remove_check_page_size_change
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+static inline void tlb_change_page_size(struct mmu_gather *tlb,
 						     unsigned int page_size)
 {
-	/*
-	 * We don't care about page size change, just update
-	 * mmu_gather page size here so that debug checks
-	 * doesn't throw false warning.
-	 */
-#ifdef CONFIG_DEBUG_VM
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+	if (tlb->page_size && tlb->page_size != page_size) {
+		if (!tlb->fullmm)
+			tlb_flush_mmu(tlb);
+	}
+
 	tlb->page_size = page_size;
 #endif
 }
-#endif
 
 /*
  * In the case of tlb vma handling, we can optimise these away in the
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08b544383d74..786758670ba1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1617,7 +1617,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	struct mm_struct *mm = tlb->mm;
 	bool ret = false;
 
-	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
@@ -1693,7 +1693,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	pmd_t orig_pmd;
 	spinlock_t *ptl;
 
-	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 
 	ptl = __pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..8af346b53a79 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3337,7 +3337,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	 * This is a hugetlb vma, all the pte entries should point
 	 * to huge page.
 	 */
-	tlb_remove_check_page_size_change(tlb, sz);
+	tlb_change_page_size(tlb, sz);
 	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	address = start;
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d3c922ea1a1..2a9073c652d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -328,7 +328,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
diff --git a/mm/memory.c b/mm/memory.c
index 83aef222f11b..2818e00d1aae 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -233,7 +233,9 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
 #endif
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
 	tlb->page_size = 0;
+#endif
 
 	__tlb_reset_range(tlb);
 }
@@ -294,7 +296,10 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 	struct mmu_gather_batch *batch;
 
 	VM_BUG_ON(!tlb->end);
+
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
 	VM_WARN_ON(tlb->page_size != page_size);
+#endif
 
 	batch = tlb->active;
 	/*
@@ -602,7 +607,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 	 * We add page table cache pages with PAGE_SIZE,
 	 * (see pte_free_tlb()), flush the tlb if we need
 	 */
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1293,7 +1298,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	pte_t *pte;
 	swp_entry_t entry;
 
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 again:
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
Peter Zijlstra Aug. 24, 2018, 1:14 p.m. UTC | #22
On Fri, Aug 24, 2018 at 03:13:32PM +0200, Peter Zijlstra wrote:
> + *  HAVE_RCU_TABLE_FREE
> + *
> + *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
> + *  for page directories (__p*_free_tlb()). This provides separate freeing of
> + *  the page-table pages themselves in a semi-RCU fashion (see comment below).
> + *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
> + *  and therefore doesn't naturally serialize with software page-table walkers.
> + *
> + *  HAVE_RCU_TABLE_INVALIDATE
> + *
> + *  This makes HAVE_RCU_TABLE_FREE call tlb_flush_mmu_tlbonly() before freeing
> + *  the page-table pages. Required if you use HAVE_RCU_TABLE_FREE and your
> + *  architecture uses the Linux page-tables natively.

Writing that also made me think we maybe should've negated that option.
Will Deacon Aug. 24, 2018, 3:49 p.m. UTC | #23
Hi Peter,

On Fri, Aug 24, 2018 at 03:13:32PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 24, 2018 at 10:35:56AM +0200, Peter Zijlstra wrote:
> 
> > Anyway, its sorted now; although I'd like to write me a fairly big
> > comment in asm-generic/tlb.h about things, before I forget again.
> 
> How's something like so? There's a little page_size thingy in this;
> mostly because I couldn't be arsed to split it for now.
> 
> Will has opinions on the page_size thing; I'll let him explain.

They're not especially strong opinions, it's just that I don't think the
page size is necessarily the right thing to track and I'd rather remove that
altogether.

In the patches I've hacked up (I'll post shortly as an RFC), I track the
levels of page-table instead so you can relate the mmu_gather explicitly
with the page-table structure, rather than have to infer it from the page
size. For example, if an architecture could put down huge mappings at the
pte level (e.g. using a contiguous hint in the pte like we have on arm64),
then actually you want to know about the level rather than the size. You can
also track the levels using only 4 bits in the gather structure.

Finally, both approaches have a funny corner case when a VMA contains a
mixture of granule sizes. With the "page size has changed, so flush
synchronously" approach you can theoretically end up with a lot of flushes, where
you'd have been better off just invalidating the whole mm. If you track the
levels instead and postpone a flush using the smallest level you saw, then
you're likely to hit whatever threshold you have and nuke the mm.

Will
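
As a rough illustration of the level-tracking alternative Will describes, a sketch is below; the field and helper names are guesses for illustration, not his actual RFC.

struct mmu_gather {
	/* existing members elided */

	/* one bit per page-table level from which entries were cleared */
	unsigned int		cleared_ptes : 1;
	unsigned int		cleared_pmds : 1;
	unsigned int		cleared_puds : 1;
	unsigned int		cleared_p4ds : 1;
};

/*
 * tlb_flush() can then derive its invalidation stride from the smallest
 * level that was touched, instead of tracking a page size.
 */
static inline unsigned int tlb_get_unmap_shift(struct mmu_gather *tlb)
{
	if (tlb->cleared_ptes)
		return PAGE_SHIFT;
	if (tlb->cleared_pmds)
		return PMD_SHIFT;
	if (tlb->cleared_puds)
		return PUD_SHIFT;
	return P4D_SHIFT;
}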
Nadav Amit Aug. 24, 2018, 5:26 p.m. UTC | #24
at 1:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
>> The only problem with this approach is that we've lost track of the granule
>> size by the point we get to the tlb_flush(), so we can't adjust the stride of
>> the TLB invalidations for huge mappings, which actually works nicely in the
>> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
>> rather than iterating over it at a 4k granule).
>> 
>> One thing we could do is switch to synchronous mode if we detect a change in
>> granule (i.e. treat it like a batch failure).
> 
> We could use tlb_start_vma() to track that, I think. Shouldn't be too
> hard.

Somewhat unrelated, but I use this opportunity that TLB got your attention
for something that bothers me for some time. clear_fixmap(), which is used
in various places (e.g., text_poke()), ends up in doing only a local TLB
flush (in __set_pte_vaddr()).

Is that sufficient?
Peter Zijlstra Aug. 24, 2018, 6:04 p.m. UTC | #25
On Fri, Aug 24, 2018 at 10:26:50AM -0700, Nadav Amit wrote:
> at 1:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> >> The only problem with this approach is that we've lost track of the granule
> >> size by the point we get to the tlb_flush(), so we can't adjust the stride of
> >> the TLB invalidations for huge mappings, which actually works nicely in the
> >> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> >> rather than iterating over it at a 4k granule).
> >> 
> >> One thing we could do is switch to synchronous mode if we detect a change in
> >> granule (i.e. treat it like a batch failure).
> > 
> > We could use tlb_start_vma() to track that, I think. Shouldn't be too
> > hard.
> 
> Somewhat unrelated, but I use this opportunity that TLB got your attention
> for something that bothers me for some time. clear_fixmap(), which is used
> in various places (e.g., text_poke()), ends up in doing only a local TLB
> flush (in __set_pte_vaddr()).
> 
> Is that sufficient?

Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
during PTI, but I seem to have forgotten everything again.
Nadav Amit Aug. 24, 2018, 6:35 p.m. UTC | #26
at 11:04 AM, Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Aug 24, 2018 at 10:26:50AM -0700, Nadav Amit wrote:
>> at 1:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>>> On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
>>>> The only problem with this approach is that we've lost track of the granule
>>>> size by the point we get to the tlb_flush(), so we can't adjust the stride of
>>>> the TLB invalidations for huge mappings, which actually works nicely in the
>>>> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
>>>> rather than iterating over it at a 4k granule).
>>>> 
>>>> One thing we could do is switch to synchronous mode if we detect a change in
>>>> granule (i.e. treat it like a batch failure).
>>> 
>>> We could use tlb_start_vma() to track that, I think. Shouldn't be too
>>> hard.
>> 
>> Somewhat unrelated, but I use this opportunity that TLB got your attention
>> for something that bothers me for some time. clear_fixmap(), which is used
>> in various places (e.g., text_poke()), ends up in doing only a local TLB
>> flush (in __set_pte_vaddr()).
>> 
>> Is that sufficient?
> 
> Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
> during PTI, but I seem to have forgotten everything again.

[ Changed the title. Sorry for hijacking the thread. ]

Since:

native_set_fixmap()->set_pte_vaddr()->pgd_offset_k()

And pgd_offset_k() uses init_mm, they do not seem to be per-CPU.

In addition, the __flush_tlb_one_kernel() in text_poke() seems redundant
(since set_fixmap() should do it as well).

If you also think the current behavior is inappropriate, I can take a stab
at fixing it by adding a shootdown. But, if text_poke() is called when
interrupts are disabled, the fix would be annoying.
Linus Torvalds Aug. 24, 2018, 7:31 p.m. UTC | #27
On Fri, Aug 24, 2018 at 11:36 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> >
> > Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
> > during PTI, but I seem to have forgotten everything again.
>
> [ Changed the title. Sorry for hijacking the thread. ]
>
> Since:
>
> native_set_fixmap()->set_pte_vaddr()->pgd_offset_k()

The fixmaps should be entirely fixed after bootup to constant
mappings, except for the KMAP ones, and they are indexed per-cpu.

That's what my mental model is, at least.

Can you actually find something that changes the fixmaps after boot
(again, ignoring kmap)?

Maybe worth adding some debugging to verify that?

                Linus
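
A minimal sketch of such a debug check, entirely hypothetical (the helper does not exist in the tree and would have to be called from native_set_fixmap() or similar):

/*
 * Hypothetical check: fixmap slots are not expected to change after boot,
 * except the text_poke() ones (and, on 32-bit, the kmap range, ignored here).
 */
static void warn_on_late_fixmap_update(enum fixed_addresses idx)
{
	if (system_state < SYSTEM_RUNNING)
		return;

	if (idx == FIX_TEXT_POKE0 || idx == FIX_TEXT_POKE1)
		return;

	WARN_ONCE(1, "fixmap slot %d rewritten after boot\n", idx);
}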
Nadav Amit Aug. 24, 2018, 8:24 p.m. UTC | #28
at 12:31 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Fri, Aug 24, 2018 at 11:36 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>>> Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
>>> during PTI, but I seem to have forgotten everything again.
>> 
>> [ Changed the title. Sorry for hijacking the thread. ]
>> 
>> Since:
>> 
>> native_set_fixmap()->set_pte_vaddr()->pgd_offset_k()
> 
> The fixmaps should be entirely fixed after bootup to constant
> mappings, except for the KMAP ones, and they are indexed per-cpu.
> 
> That's what my mental model is, at least.
> 
> Can you actually find something that changes the fixmaps after boot
> (again, ignoring kmap)?

At least the alternatives mechanism appears to do so.
 
IIUC the following path is possible when adding a module:

	jump_label_add_module()
	->__jump_label_update()
	->arch_jump_label_transform()
	->__jump_label_transform()
	->text_poke_bp()
	->text_poke()
	->set_fixmap()
	
And a similar path can happen when static_key_enable/disable() is called.
Linus Torvalds Aug. 25, 2018, 12:58 a.m. UTC | #29
Adding a few people to the cc.

On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> >
> > Can you actually find something that changes the fixmaps after boot
> > (again, ignoring kmap)?
>
> At least the alternatives mechanism appears to do so.
>
> IIUC the following path is possible when adding a module:
>
>         jump_label_add_module()
>         ->__jump_label_update()
>         ->arch_jump_label_transform()
>         ->__jump_label_transform()
>         ->text_poke_bp()
>         ->text_poke()
>         ->set_fixmap()

Yeah, that looks a bit iffy.

But making the tlb flush global wouldn't help.  This is running on a
local core, and if there are other CPU's that can do this at the same
time, then they'd just fight about the same mapping.

Honestly, I think it's ok just because I *hope* this is all serialized
anyway (jump_label_lock? But what about other users of text_poke?).

But I'd be a lot happier about it if it either used an explicit lock
to make sure, or used per-cpu fixmap entries.

And the tlb flush is done *after* the address is used, which is bogus anyway.

> And a similar path can happen when static_key_enable/disable() is called.

Same comments.

How about replacing that

        local_irq_save(flags);
       ... do critical things here ...
        local_irq_restore(flags);

in text_poke() with

        static DEFINE_SPINLOCK(poke_lock);

        spin_lock_irqsave(&poke_lock, flags);
       ... do critical things here ...
        spin_unlock_irqrestore(&poke_lock, flags);

and moving the local_flush_tlb() to after the set_fixmaps, but before
the access through the virtual address.

But changing things to do a global tlb flush would just be wrong.

           Linus
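
Pulling those pieces together, the suggestion applied to the existing fixmap flow in text_poke() would look roughly like the sketch below. It is simplified (the vmalloc/module-address handling and the byte-verification loop are elided) and only illustrates the proposed locking and flush ordering, not a tested patch.

static DEFINE_SPINLOCK(poke_lock);

void *text_poke(void *addr, const void *opcode, size_t len)
{
	unsigned long flags;
	struct page *pages[2];
	char *vaddr;

	pages[0] = virt_to_page(addr);
	pages[1] = virt_to_page(addr + PAGE_SIZE);

	spin_lock_irqsave(&poke_lock, flags);
	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
	if (pages[1])
		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));

	/* flush *before* writing through the alias, not after */
	local_flush_tlb();

	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);

	clear_fixmap(FIX_TEXT_POKE0);
	if (pages[1])
		clear_fixmap(FIX_TEXT_POKE1);
	sync_core();
	spin_unlock_irqrestore(&poke_lock, flags);

	return addr;
}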
Nadav Amit Aug. 25, 2018, 2:16 a.m. UTC | #30
On Fri, Aug 24, 2018, 5:58 PM Linus Torvalds <torvalds@linux-foundation.org>
wrote:

> Adding a few people to the cc.
>
> On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> > >
> > > Can you actually find something that changes the fixmaps after boot
> > > (again, ignoring kmap)?
> >
> > At least the alternatives mechanism appears to do so.
> >
> > IIUC the following path is possible when adding a module:
> >
> >         jump_label_add_module()
> >         ->__jump_label_update()
> >         ->arch_jump_label_transform()
> >         ->__jump_label_transform()
> >         ->text_poke_bp()
> >         ->text_poke()
> >         ->set_fixmap()
>
> Yeah, that looks a bit iffy.
>
> But making the tlb flush global wouldn't help.  This is running on a
> local core, and if there are other CPU's that can do this at the same
> time, then they'd just fight about the same mapping.
>
> Honestly, I think it's ok just because I *hope* this is all serialized
> anyway (jump_label_lock? But what about other users of text_poke?).
>

The users should hold text_mutex.



> But I'd be a lot happier about it if it either used an explicit lock
> to make sure, or used per-cpu fixmap entries.
>

My concern is that despite the lock, one core would do a speculative page
walk and cache a translation that soon after would become stale.



> And the tlb flush is done *after* the address is used, which is bogus
> anyway.
>

It seems to me that it is intended to remove the mapping that might be a
security issue.

But anyhow, set_fixmap and clear_fixmap perform a local TLB flush, (in
__set_pte_vaddr()) so locally things should be fine.



>
> > And a similar path can happen when static_key_enable/disable() is called.
>
> Same comments.
>
> How about replacing that
>
>         local_irq_save(flags);
>        ... do critical things here ...
>         local_irq_restore(flags);
>
> in text_poke() with
>
>         static DEFINE_SPINLOCK(poke_lock);
>
>         spin_lock_irqsave(&poke_lock, flags);
>        ... do critical things here ...
>         spin_unlock_irqrestore(&poke_lock, flags);
>
> and moving the local_flush_tlb() to after the set_fixmaps, but before
> the access through the virtual address.
>
> But changing things to do a global tlb flush would just be wrong.



As I noted, I think that locking and local flushes as they are right now
are fine (besides the redundant flush).
My concern is merely that speculative page walks on other cores would cache
stale entries.
Nadav Amit Aug. 25, 2018, 2:29 a.m. UTC | #31
On August 24, 2018 5:58:43 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>Adding a few people to the cc.
>
>On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com>
>wrote:
>> >
>> > Can you actually find something that changes the fixmaps after boot
>> > (again, ignoring kmap)?
>>
>> At least the alternatives mechanism appears to do so.
>>
>> IIUC the following path is possible when adding a module:
>>
>>         jump_label_add_module()
>>         ->__jump_label_update()
>>         ->arch_jump_label_transform()
>>         ->__jump_label_transform()
>>         ->text_poke_bp()
>>         ->text_poke()
>>         ->set_fixmap()
>
>Yeah, that looks a bit iffy.
>
>But making the tlb flush global wouldn't help.  This is running on a
>local core, and if there are other CPU's that can do this at the same
>time, then they'd just fight about the same mapping.
>
>Honestly, I think it's ok just because I *hope* this is all serialized
>anyway (jump_label_lock? But what about other users of text_poke?).

The users should hold text_mutex.

>
>But I'd be a lot happier about it if it either used an explicit lock
>to make sure, or used per-cpu fixmap entries.

My concern is that despite the lock, one core would do a speculative page walk and cache a translation that soon after would become stale.

>
>And the tlb flush is done *after* the address is used, which is bogus
>anyway.

It seems to me that it is intended to remove the mapping that might be a security issue. 

But anyhow, set_fixmap and clear_fixmap perform a local TLB flush, (in __set_pte_vaddr()) so locally things should be fine.

>
>> And a similar path can happen when static_key_enable/disable() is
>called.
>
>Same comments.
>
>How about replacing that
>
>        local_irq_save(flags);
>       ... do critical things here ...
>        local_irq_restore(flags);
>
>in text_poke() with
>
>        static DEFINE_SPINLOCK(poke_lock);
>
>        spin_lock_irqsave(&poke_lock, flags);
>       ... do critical things here ...
>        spin_unlock_irqrestore(&poke_lock, flags);
>
>and moving the local_flush_tlb() to after the set_fixmaps, but before
>the access through the virtual address.
>
>But changing things to do a global tlb flush would just be wrong.

As I noted, I think that locking and local flushes as they are right now are fine (besides the redundant flush).

My concern is merely that speculative page walks on other cores would cache stale entries.
Andy Lutomirski Aug. 25, 2018, 4:23 a.m. UTC | #32
On Fri, Aug 24, 2018 at 7:29 PM,  <nadav.amit@gmail.com> wrote:
>
>
> On August 24, 2018 5:58:43 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>Adding a few people to the cc.
>>
>>On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com>
>>wrote:
>>> >
>>> > Can you actually find something that changes the fixmaps after boot
>>> > (again, ignoring kmap)?
>>>
>>> At least the alternatives mechanism appears to do so.
>>>
>>> IIUC the following path is possible when adding a module:
>>>
>>>         jump_label_add_module()
>>>         ->__jump_label_update()
>>>         ->arch_jump_label_transform()
>>>         ->__jump_label_transform()
>>>         ->text_poke_bp()
>>>         ->text_poke()
>>>         ->set_fixmap()
>>
>>Yeah, that looks a bit iffy.
>>
>>But making the tlb flush global wouldn't help.  This is running on a
>>local core, and if there are other CPU's that can do this at the same
>>time, then they'd just fight about the same mapping.
>>
>>Honestly, I think it's ok just because I *hope* this is all serialized
>>anyway (jump_label_lock? But what about other users of text_poke?).
>
> The users should hold text_mutex.
>
>>
>>But I'd be a lot happier about it if it either used an explicit lock
>>to make sure, or used per-cpu fixmap entries.
>
> My concern is that despite the lock, one core would do a speculative page walk and cache a translation that soon after would become stale.
>
>>
>>And the tlb flush is done *after* the address is used, which is bogus
>>anyway.
>
> It seems to me that it is intended to remove the mapping that might be a security issue.
>
> But anyhow, set_fixmap and clear_fixmap perform a local TLB flush, (in __set_pte_vaddr()) so locally things should be fine.
>
>>
>>> And a similar path can happen when static_key_enable/disable() is
>>called.
>>
>>Same comments.
>>
>>How about replacing that
>>
>>        local_irq_save(flags);
>>       ... do critical things here ...
>>        local_irq_restore(flags);
>>
>>in text_poke() with
>>
>>        static DEFINE_SPINLOCK(poke_lock);
>>
>>        spin_lock_irqsave(&poke_lock, flags);
>>       ... do critical things here ...
>>        spin_unlock_irqrestore(&poke_lock, flags);
>>
>>and moving the local_flush_tlb() to after the set_fixmaps, but before
>>the access through the virtual address.
>>
>>But changing things to do a global tlb flush would just be wrong.
>
> As I noted, I think that locking and local flushes as they are right now are fine (besides the redundant flush).
>
> My concern is merely that speculative page walks on other cores would cache stale entries.
>
>

This is almost certainly a bug, or even two bugs.  Bug 1:  why on
Earth do we flush in __set_pte_vaddr()?  We should flush when
*clearing* or when modifying an existing fixmap entry.  Right now, if
we do text_poke() after boot, then the TLB entry will stick around and
will be a nice exploit target.

Bug 2: what you're describing.  It's racy.

Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
Masami Hiramatsu (Google) Aug. 26, 2018, 2:23 a.m. UTC | #33
On Fri, 24 Aug 2018 21:23:26 -0700
Andy Lutomirski <luto@kernel.org> wrote:

> On Fri, Aug 24, 2018 at 7:29 PM,  <nadav.amit@gmail.com> wrote:
> >
> >
> > On August 24, 2018 5:58:43 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >>Adding a few people to the cc.
> >>
> >>On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com>
> >>wrote:
> >>> >
> >>> > Can you actually find something that changes the fixmaps after boot
> >>> > (again, ignoring kmap)?
> >>>
> >>> At least the alternatives mechanism appears to do so.
> >>>
> >>> IIUC the following path is possible when adding a module:
> >>>
> >>>         jump_label_add_module()
> >>>         ->__jump_label_update()
> >>>         ->arch_jump_label_transform()
> >>>         ->__jump_label_transform()
> >>>         ->text_poke_bp()
> >>>         ->text_poke()
> >>>         ->set_fixmap()
> >>
> >>Yeah, that looks a bit iffy.
> >>
> >>But making the tlb flush global wouldn't help.  This is running on a
> >>local core, and if there are other CPU's that can do this at the same
> >>time, then they'd just fight about the same mapping.
> >>
> >>Honestly, I think it's ok just because I *hope* this is all serialized
> >>anyway (jump_label_lock? But what about other users of text_poke?).
> >
> > The users should hold text_mutex.
> >
> >>
> >>But I'd be a lot happier about it if it either used an explicit lock
> >>to make sure, or used per-cpu fixmap entries.
> >
> > My concern is that despite the lock, one core would do a speculative page walk and cache a translation that soon after would become stale.
> >
> >>
> >>And the tlb flush is done *after* the address is used, which is bogus
> >>anyway.
> >
> > It seems to me that it is intended to remove the mapping that might be a security issue.
> >
> > But anyhow, set_fixmap and clear_fixmap perform a local TLB flush, (in __set_pte_vaddr()) so locally things should be fine.
> >
> >>
> >>> And a similar path can happen when static_key_enable/disable() is
> >>called.
> >>
> >>Same comments.
> >>
> >>How about replacing that
> >>
> >>        local_irq_save(flags);
> >>       ... do critical things here ...
> >>        local_irq_restore(flags);
> >>
> >>in text_poke() with
> >>
> >>        static DEFINE_SPINLOCK(poke_lock);
> >>
> >>        spin_lock_irqsave(&poke_lock, flags);
> >>       ... do critical things here ...
> >>        spin_unlock_irqrestore(&poke_lock, flags);
> >>
> >>and moving the local_flush_tlb() to after the set_fixmaps, but before
> >>the access through the virtual address.
> >>
> >>But changing things to do a global tlb flush would just be wrong.
> >
> > As I noted, I think that locking and local flushes as they are right now are fine (besides the redundant flush).
> >
> > My concern is merely that speculative page walks on other cores would cache stale entries.
> >
> >
> 
> This is almost certainly a bug, or even two bugs.  Bug 1:  why on
> Earth do we flush in __set_pte_vaddr()?  We should flush when
> *clearing* or when modifying an existing fixmap entry.  Right now, if
> we do text_poke() after boot, then the TLB entry will stick around and
> will be a nice exploit target.
> 
> Bug 2: what you're describing.  It's racy.
> 
> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?

No, since kmap_atomic() is only for x86_32 and highmem support kernel.
In x86-64, it seems that returns just a page address. That is not
good for text_poke, since it needs to make a writable alias for RO
code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?

Thank you,
Andy Lutomirski Aug. 26, 2018, 4:21 a.m. UTC | #34
On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> On Fri, 24 Aug 2018 21:23:26 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>
> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
> In x86-64, it seems that returns just a page address. That is not
> good for text_poke, since it needs to make a writable alias for RO
> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>

I just re-read text_poke().  It's, um, horrible.  Not only is the
implementation overcomplicated and probably buggy, but it's SLOOOOOW.
It's totally the wrong API -- poking one instruction at a time
basically can't be efficient on x86.  The API should either poke lots
of instructions at once or should be text_poke_begin(); ...;
text_poke_end();.

Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
too scary of an approach?  With the patch applied, text_poke() is a
fantastic exploit target.  On the other hand, even without the patch
applied, text_poke() is every bit as juicy.

--Andy
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 014f214da581..811c8735b129 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -690,40 +690,15 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
 void *text_poke(void *addr, const void *opcode, size_t len)
 {
 	unsigned long flags;
-	char *vaddr;
-	struct page *pages[2];
-	int i;
-
-	/*
-	 * While boot memory allocator is runnig we cannot use struct
-	 * pages as they are not yet initialized.
-	 */
-	BUG_ON(!after_bootmem);
+	unsigned long old_cr0;
 
-	if (!core_kernel_text((unsigned long)addr)) {
-		pages[0] = vmalloc_to_page(addr);
-		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
-	} else {
-		pages[0] = virt_to_page(addr);
-		WARN_ON(!PageReserved(pages[0]));
-		pages[1] = virt_to_page(addr + PAGE_SIZE);
-	}
-	BUG_ON(!pages[0]);
 	local_irq_save(flags);
-	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
-	if (pages[1])
-		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
-	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
-	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
-	clear_fixmap(FIX_TEXT_POKE0);
-	if (pages[1])
-		clear_fixmap(FIX_TEXT_POKE1);
-	local_flush_tlb();
-	sync_core();
-	/* Could also do a CLFLUSH here to speed up CPU recovery; but
-	   that causes hangs on some VIA CPUs. */
-	for (i = 0; i < len; i++)
-		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+	old_cr0 = read_cr0();
+	write_cr0(old_cr0 & ~X86_CR0_WP);
+
+	memcpy(addr, opcode, len);
+
+	write_cr0(old_cr0);	/* also serializes */
 	local_irq_restore(flags);
 	return addr;
 }
Kees Cook Aug. 26, 2018, 4:43 a.m. UTC | #35
On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>> On Fri, 24 Aug 2018 21:23:26 -0700
>> Andy Lutomirski <luto@kernel.org> wrote:
>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>
>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>> In x86-64, it seems that returns just a page address. That is not
>> good for text_poke, since it needs to make a writable alias for RO
>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>
>
> I just re-read text_poke().  It's, um, horrible.  Not only is the
> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> It's totally the wrong API -- poking one instruction at a time
> basically can't be efficient on x86.  The API should either poke lots
> of instructions at once or should be text_poke_begin(); ...;
> text_poke_end();.
>
> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
> too scary of an approach?  With the patch applied, text_poke() is a
> fantastic exploit target.  On the other hand, even without the patch
> applied, text_poke() is every bit as juicy.

I tried to convince Ingo to use this method for doing "write rarely"
and he soundly rejected it. :) I've always liked this because AFAICT,
it's local to the CPU. I had proposed it in
https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d

With that, text_poke() mostly becomes:

rare_write_begin()
memcpy(addr, opcode, len);
rare_write_end()

As for juiciness, if an attacker already has execution control, they
can do more interesting things than text_poke(). But regardless, yes,
it's always made me uncomfortable. :)

-Kees
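
For context, the helpers in that proposal boil down to toggling CR0.WP with preemption disabled, roughly like the sketch below; this is a simplification of the linked patch, not its exact contents, and the NMI and Xen PV caveats raised later in the thread apply.

static __always_inline void rare_write_begin(void)
{
	preempt_disable();
	barrier();
	/* this CPU may now write through read-only mappings */
	write_cr0(read_cr0() & ~X86_CR0_WP);
}

static __always_inline void rare_write_end(void)
{
	/* restore write protection; write_cr0() is also serializing */
	write_cr0(read_cr0() | X86_CR0_WP);
	barrier();
	preempt_enable();
}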
Nadav Amit Aug. 26, 2018, 5:53 a.m. UTC | #36
at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:

> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>> 
>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>> In x86-64, it seems that returns just a page address. That is not
>>> good for text_poke, since it needs to make a writable alias for RO
>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>> 
>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>> It's totally the wrong API -- poking one instruction at a time
>> basically can't be efficient on x86.  The API should either poke lots
>> of instructions at once or should be text_poke_begin(); ...;
>> text_poke_end();.
>> 
>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>> too scary of an approach?  With the patch applied, text_poke() is a
>> fantastic exploit target.  On the other hand, even without the patch
>> applied, text_poke() is every bit as juicy.
> 
> I tried to convince Ingo to use this method for doing "write rarely"
> and he soundly rejected it. :) I've always liked this because AFAICT,
> it's local to the CPU. I had proposed it in
> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
> 
> With that, text_poke() mostly becomes:
> 
> rare_write_begin()
> memcpy(addr, opcode, len);
> rare_write_end()
> 
> As for juiciness, if an attacker already has execution control, they
> can do more interesting things than text_poke(). But regardless, yes,
> it's always made me uncomfortable. :)

I think that the key to harden the security of text_poke() against its use
as a gadget in a ROP/JOP attack is to add a check/assertion for the old
(expected) value, such as:

rare_write_begin()
if (*addr == prev_opcode)
	memcpy(addr, opcode, len);
rare_write_end()
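
Spelled out for a multi-byte poke, purely as an illustration (prev_opcode, the expected old bytes, is assumed to be supplied by the caller):

	rare_write_begin();
	/* only patch if the old bytes are what the caller expects */
	if (!memcmp(addr, prev_opcode, len))
		memcpy(addr, opcode, len);
	rare_write_end();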
Peter Zijlstra Aug. 26, 2018, 9:09 a.m. UTC | #37
On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
> I just re-read text_poke().  It's, um, horrible.  Not only is the
> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> It's totally the wrong API -- poking one instruction at a time
> basically can't be efficient on x86.  The API should either poke lots
> of instructions at once or should be text_poke_begin(); ...;
> text_poke_end();.

I don't think anybody ever cared about performance here. Only
correctness. That whole text_poke_bp() thing is entirely tricky.

FWIW, before text_poke_bp(), text_poke() would only be used from
stop_machine, so all the other CPUs would be stuck busy-waiting with
IRQs disabled. These days, yeah, that's lots more dodgy, but yes
text_mutex should be serializing all that.

And on that, I so hate comments like: "must be called under foo_mutex",
we have lockdep_assert_held() for that.
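
I.e., instead of a "must be called under text_mutex" comment, something along the lines of this sketch:

void *text_poke(void *addr, const void *opcode, size_t len)
{
	/* enforce (and document) the locking rule at runtime */
	lockdep_assert_held(&text_mutex);
	/* ... */
}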
Andy Lutomirski Aug. 26, 2018, 2:20 p.m. UTC | #38
> On Aug 25, 2018, at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:
> 
>> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>> 
>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>> In x86-64, it seems that returns just a page address. That is not
>>> good for text_poke, since it needs to make a writable alias for RO
>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>> 
>> 
>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>> It's totally the wrong API -- poking one instruction at a time
>> basically can't be efficient on x86.  The API should either poke lots
>> of instructions at once or should be text_poke_begin(); ...;
>> text_poke_end();.
>> 
>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>> too scary of an approach?  With the patch applied, text_poke() is a
>> fantastic exploit target.  On the other hand, even without the patch
>> applied, text_poke() is every bit as juicy.
> 
> I tried to convince Ingo to use this method for doing "write rarely"
> and he soundly rejected it. :) I've always liked this because AFAICT,
> it's local to the CPU. I had proposed it in
> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d

Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.

> 
> With that, text_poke() mostly becomes:
> 
> rare_write_begin()
> memcpy(addr, opcode, len);
> rare_write_end()
> 
> As for juiciness, if an attacker already has execution control, they
> can do more interesting things than text_poke(). But regardless, yes,
> it's always made me uncomfortable. :)
> 
> -Kees
> 
> -- 
> Kees Cook
> Pixel Security
Kees Cook Aug. 26, 2018, 4:47 p.m. UTC | #39
On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>
>
>> On Aug 25, 2018, at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>
>>> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>>>
>>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>>> In x86-64, it seems that returns just a page address. That is not
>>>> good for text_poke, since it needs to make a writable alias for RO
>>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>>>
>>>
>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>> It's totally the wrong API -- poking one instruction at a time
>>> basically can't be efficient on x86.  The API should either poke lots
>>> of instructions at once or should be text_poke_begin(); ...;
>>> text_poke_end();.
>>>
>>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>>> too scary of an approach?  With the patch applied, text_poke() is a
>>> fantastic exploit target.  On the other hand, even without the patch
>>> applied, text_poke() is every bit as juicy.
>>
>> I tried to convince Ingo to use this method for doing "write rarely"
>> and he soundly rejected it. :) I've always liked this because AFAICT,
>> it's local to the CPU. I had proposed it in
>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
>
> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.

Sorry, it looks like it was tglx, not Ingo:

https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos

This thread is long, and one thing that I think went unanswered was
"why do we want this to be fast?" the answer is: for doing page table
updates. Page tables are becoming a bigger target for attacks now, and
it's be nice if they could stay read-only unless they're getting
updated (with something like this).

-Kees
Andy Lutomirski Aug. 26, 2018, 5:25 p.m. UTC | #40
> On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
> 
>> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> 
>> 
>>>> On Aug 25, 2018, at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>> 
>>>>> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>>>> 
>>>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>>>> In x86-64, it seems that returns just a page address. That is not
>>>>> good for text_poke, since it needs to make a writable alias for RO
>>>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>>>> 
>>>> 
>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>> It's totally the wrong API -- poking one instruction at a time
>>>> basically can't be efficient on x86.  The API should either poke lots
>>>> of instructions at once or should be text_poke_begin(); ...;
>>>> text_poke_end();.
>>>> 
>>>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>>>> too scary of an approach?  With the patch applied, text_poke() is a
>>>> fantastic exploit target.  On the other hand, even without the patch
>>>> applied, text_poke() is every bit as juicy.
>>> 
>>> I tried to convince Ingo to use this method for doing "write rarely"
>>> and he soundly rejected it. :) I've always liked this because AFAICT,
>>> it's local to the CPU. I had proposed it in
>>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
>> 
>> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
> 
> Sorry, it looks like it was tglx, not Ingo:
> 
> https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
> 
> This thread is long, and one thing that I think went unanswered was
> "why do we want this to be fast?" the answer is: for doing page table
> updates. Page tables are becoming a bigger target for attacks now, and
> it's be nice if they could stay read-only unless they're getting
> updated (with something like this).
> 
> 

It kind of sounds like tglx would prefer the CR3 approach. And indeed my patch has a serious problem wrt the NMI code.
Thomas Gleixner Aug. 26, 2018, 8:15 p.m. UTC | #41
On Sun, 26 Aug 2018, Andy Lutomirski wrote:
> > On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
> >> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> >>> I tried to convince Ingo to use this method for doing "write rarely"
> >>> and he soundly rejected it. :) I've always liked this because AFAICT,
> >>> it's local to the CPU. I had proposed it in
> >>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
> >> 
> >> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
> > 
> > Sorry, it looks like it was tglx, not Ingo:
> > 
> > https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
> > 
> > This thread is long, and one thing that I think went unanswered was
> > "why do we want this to be fast?" the answer is: for doing page table
> > updates. Page tables are becoming a bigger target for attacks now, and
> > it's be nice if they could stay read-only unless they're getting
> > updated (with something like this).
> > 
> > 
> It kind of sounds like tglx would prefer the CR3 approach. And indeed my
> patch has a serious problem wrt the NMI code.

That's exactly the problem I have with CR0. It leaves everything and some
more writeable for any code which can interrupt that section.

Performance wise CR0 is not that much better than CR3 except that it has
the costs nicely hidden.

Thanks,

	tglx
Kees Cook Aug. 26, 2018, 10:03 p.m. UTC | #42
On Sun, Aug 26, 2018 at 1:15 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Sun, 26 Aug 2018, Andy Lutomirski wrote:
>> > On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
>> >> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> >>> I tried to convince Ingo to use this method for doing "write rarely"
>> >>> and he soundly rejected it. :) I've always liked this because AFAICT,
>> >>> it's local to the CPU. I had proposed it in
>> >>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
>> >>
>> >> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
>> >
>> > Sorry, it looks like it was tglx, not Ingo:
>> >
>> > https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
>> >
>> > This thread is long, and one thing that I think went unanswered was
>> > "why do we want this to be fast?" the answer is: for doing page table
>> > updates. Page tables are becoming a bigger target for attacks now, and
>> > it's be nice if they could stay read-only unless they're getting
>> > updated (with something like this).
>> >
>> >
>> It kind of sounds like tglx would prefer the CR3 approach. And indeed my
>> patch has a serious problem wrt the NMI code.
>
> That's exactly the problem I have with CR0. It leaves everything and some
> more writeable for any code which can interrupt that section.

I thought the point was that the implementation I suggested was
NMI-proof? (And in reading Documentation/preempt-locking.txt it sounds
like disabling interrupts is redundant to preempt_disable()? But I
don't understand how; it looks like the preempt stuff is advisory?)

-Kees
Matthew Wilcox (Oracle) Aug. 26, 2018, 10:15 p.m. UTC | #43
On Sun, Aug 26, 2018 at 03:03:59PM -0700, Kees Cook wrote:
> I thought the point was that the implementation I suggested was
> NMI-proof? (And in reading Documentation/preempt-locking.txt it sounds
> like disabling interrupts is redundant to preempt_disable()? But I
> don't understand how; it looks like the preempt stuff is advisory?)

Other way round; disabling interrupts implicitly disables preemption.
Jann Horn Aug. 26, 2018, 10:29 p.m. UTC | #44
On Mon, Aug 27, 2018 at 12:11 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Sun, Aug 26, 2018 at 1:15 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Sun, 26 Aug 2018, Andy Lutomirski wrote:
> >> > On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
> >> >> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> >>> I tried to convince Ingo to use this method for doing "write rarely"
> >> >>> and he soundly rejected it. :) I've always liked this because AFAICT,
> >> >>> it's local to the CPU. I had proposed it in
> >> >>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
> >> >>
> >> >> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
> >> >
> >> > Sorry, it looks like it was tglx, not Ingo:
> >> >
> >> > https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
> >> >
> >> > This thread is long, and one thing that I think went unanswered was
> >> > "why do we want this to be fast?" the answer is: for doing page table
> >> > updates. Page tables are becoming a bigger target for attacks now, and
> >> > it's be nice if they could stay read-only unless they're getting
> >> > updated (with something like this).
> >> >
> >> >
> >> It kind of sounds like tglx would prefer the CR3 approach. And indeed my
> >> patch has a serious problem wrt the NMI code.
> >
> > That's exactly the problem I have with CR0. It leaves everything and some
> > more writeable for any code which can interrupt that section.
>
> I thought the point was that the implementation I suggested was
> NMI-proof? (And in reading Documentation/preempt-locking.txt it sounds
> like disabling interrupts is redundant to preempt_disable()? But I
> don't understand how; it looks like the preempt stuff is advisory?)

Where are you dealing with NMIs? local_irq_disable() disables the
interrupt flag, but Non-Maskable Interrupts can still come in. As far
as I know, the only way to block those is to artificially generate an
NMI yourself (Xen does that sometimes). Otherwise, you have to twiddle
CR0.WP in the NMI handler entry/exit code.
Jann Horn Aug. 26, 2018, 10:48 p.m. UTC | #45
On Sun, Aug 26, 2018 at 6:21 AM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > On Fri, 24 Aug 2018 21:23:26 -0700
> > Andy Lutomirski <luto@kernel.org> wrote:
> >> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
> >
> > No, since kmap_atomic() is only for x86_32 and highmem support kernel.
> > In x86-64, it seems that returns just a page address. That is not
> > good for text_poke, since it needs to make a writable alias for RO
> > code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
> >
>
> I just re-read text_poke().  It's, um, horrible.  Not only is the
> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> It's totally the wrong API -- poking one instruction at a time
> basically can't be efficient on x86.  The API should either poke lots
> of instructions at once or should be text_poke_begin(); ...;
> text_poke_end();.
>
> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
> too scary of an approach?  With the patch applied, text_poke() is a
> fantastic exploit target.  On the other hand, even without the patch
> applied, text_poke() is every bit as juicy.

Twiddling CR0.WP is incompatible with Xen PV, right? It can't let you
do it because you're not actually running in ring 0 (but in ring 1 or
3), so CR0.WP has no influence on what you can access; and it must not
let you bypass write protection because you have read-only access to
host page tables. I think this code has to be compatible with Xen PV,
right?

In theory Xen PV could support this by emulating X86 instructions, but
I don't see anything related to CR0.WP in their emulation code. From
xen/arch/x86/pv/emul-priv-op.c:

    case 0: /* Write CR0 */
        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
        {
            gdprintk(XENLOG_WARNING,
                     "Attempt to change unmodifiable CR0 flags\n");
            break;
        }
        do_fpu_taskswitch(!!(val & X86_CR0_TS));
        return X86EMUL_OKAY;

Having a special fallback path for "patch kernel code while running
under Xen PV" would be kinda ugly.
Masami Hiramatsu (Google) Aug. 27, 2018, 3:03 a.m. UTC | #46
On Sun, 26 Aug 2018 11:09:58 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
> > I just re-read text_poke().  It's, um, horrible.  Not only is the
> > implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> > It's totally the wrong API -- poking one instruction at a time
> > basically can't be efficient on x86.  The API should either poke lots
> > of instructions at once or should be text_poke_begin(); ...;
> > text_poke_end();.
> 
> I don't think anybody ever cared about performance here. Only
> correctness. That whole text_poke_bp() thing is entirely tricky.

Agreed. Self modification is a special event.

> FWIW, before text_poke_bp(), text_poke() would only be used from
> stop_machine, so all the other CPUs would be stuck busy-waiting with
> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> text_mutex should be serializing all that.

I'm still not sure that speculative page-table walk can be done
over the mutex. Also, if the fixmap area is for aliasing
pages (which always mapped to memory), what kind of
security issue can happen?

Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
changing CR3 sounds good to me. I think we don't even need per-cpu,
it can call a thread/function on a dedicated core (like the first
boot processor) and wait :) This may prevent leakage of pte change
to other cores.

> And on that, I so hate comments like: "must be called under foo_mutex",
> we have lockdep_assert_held() for that.

Indeed. I also think that text_poke() should not call BUG_ON, but
its caller should decide whether it is recoverable or not.

Thank you,
Nadav Amit Aug. 27, 2018, 3:26 a.m. UTC | #47
at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Sun, 26 Aug 2018 11:09:58 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>> It's totally the wrong API -- poking one instruction at a time
>>> basically can't be efficient on x86.  The API should either poke lots
>>> of instructions at once or should be text_poke_begin(); ...;
>>> text_poke_end();.
>> 
>> I don't think anybody ever cared about performance here. Only
>> correctness. That whole text_poke_bp() thing is entirely tricky.
> 
> Agreed. Self modification is a special event.
> 
>> FWIW, before text_poke_bp(), text_poke() would only be used from
>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>> text_mutex should be serializing all that.
> 
> I'm still not sure that speculative page-table walk can be done
> over the mutex. Also, if the fixmap area is for aliasing
> pages (which always mapped to memory), what kind of
> security issue can happen?

The PTE is accessible from other cores, so just as we assume for L1TF that
the every addressable memory might be cached in L1, we should assume and
PTE might be cached in the TLB when it is present.

Although the mapping is for an alias, there are a couple of issues here.
First, this alias mapping is writable, so it might allow an attacker to change the
kernel code (following another initial attack). Second, the alias mapping is
never explicitly flushed. We may assume that once the original mapping is
removed/changed, a full TLB flush would take place, but there is no
guarantee it actually takes place.

> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
> changing CR3 sounds good to me. I think we don't even need per-cpu,
> it can call a thread/function on a dedicated core (like the first
> boot processor) and wait :) This may prevent leakage of pte change
> to other cores.

I implemented per-cpu fixmap, but I think that it makes more sense to take
peterz approach and set an entry in the PGD level. Per-CPU fixmap either
requires to pre-populate various levels in the page-table hierarchy, or
conditionally synchronize whenever module memory is allocated, since they
can share the same PGD, PUD & PMD. While usually the synchronization is not
needed, the possibility that synchronization is needed complicates locking.

Anyhow, having fixed addresses for the fixmap can be used to circumvent
KASLR.

I don’t think a dedicated core is needed. Anyhow there is a lock
(text_mutex), so use_mm() can be used after acquiring the mutex.
Nicholas Piggin Aug. 27, 2018, 5 a.m. UTC | #48
On Fri, 24 Aug 2018 13:39:53 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 24, 2018 at 10:47:17AM +0200, Peter Zijlstra wrote:  
> > > On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:  
> > > > The only problem with this approach is that we've lost track of the granule
> > > > size by the point we get to the tlb_flush(), so we can't adjust the stride of
> > > > the TLB invalidations for huge mappings, which actually works nicely in the
> > > > synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> > > > rather than iterating over it at a 4k granule).
> > > > 
> > > > One thing we could do is switch to synchronous mode if we detect a change in
> > > > granule (i.e. treat it like a batch failure).  
> > > 
> > > We could use tlb_start_vma() to track that, I think. Shouldn't be too
> > > hard.  
> > 
> > Hurm.. look at commit:
> > 
> >   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")  
> 
> Ah, good, it seems that already got cleaned up a lot. But it all moved
> into the power code.. blergh.

I lost track of what the problem is here?

For powerpc, tlb_start_vma is not the right API to use for this because
it wants to deal with different page sizes within a vma.

Thanks,
Nick
Peter Zijlstra Aug. 27, 2018, 7:47 a.m. UTC | #49
On Mon, Aug 27, 2018 at 03:00:08PM +1000, Nicholas Piggin wrote:
> On Fri, 24 Aug 2018 13:39:53 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:

> > > Hurm.. look at commit:
> > > 
> > >   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")  
> > 
> > Ah, good, it seems that already got cleaned up a lot. But it all moved
> > into the power code.. blergh.
> 
> I lost track of what the problem is here?

Aside from the commit above being absolute crap (which did get fixed up,
luckily) I would really like to get rid of all arch specific mmu_gather.

We can have opt-in bits to the generic code, but the endless back and
forth between common and arch code is an utter pain in the arse.

And there's only like 4 architectures that still have a custom
mmu_gather:

  - sh
  - arm
  - ia64
  - s390

sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
arch/ia64' leaving us with s390.

After that everyone uses the common code and we can clean up.

> For powerpc, tlb_start_vma is not the right API to use for this because
> it wants to deal with different page sizes within a vma.

Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
still correct for huge pages, inefficient maybe, but so is flushing
every other page because of 'sparse' transparent huge pages.
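
For reference, a minimal sketch (not from this thread) of the 4K-stride
behaviour described above: one invalidate per 4K page is correct whatever page
size backs the range, just potentially wasteful. flush_one_page() is a
hypothetical stand-in for the arch's single-page invalidate (INVLPG on x86).

static void flush_range_4k_stride(unsigned long start, unsigned long end)
{
	unsigned long addr;

	/* Correct for any backing page size; the cost is one invalidate
	 * per 4K step, e.g. 512 of them for a single 2M huge page. */
	for (addr = start; addr < end; addr += PAGE_SIZE)
		flush_one_page(addr);		/* hypothetical helper */
}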
Nicholas Piggin Aug. 27, 2018, 8:04 a.m. UTC | #50
On Mon, 27 Aug 2018 09:47:01 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Aug 27, 2018 at 03:00:08PM +1000, Nicholas Piggin wrote:
> > On Fri, 24 Aug 2018 13:39:53 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:  
> > > On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:  
> 
> > > > Hurm.. look at commit:
> > > > 
> > > >   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")    
> > > 
> > > Ah, good, it seems that already got cleaned up a lot. But it all moved
> > > into the power code.. blergh.  
> > 
> > I lost track of what the problem is here?  
> 
> Aside from the commit above being absolute crap (which did get fixed up,
> luckily) I would really like to get rid of all arch specific mmu_gather.
> 
> We can have opt-in bits to the generic code, but the endless back and
> forth between common and arch code is an utter pain in the arse.
> 
> And there's only like 4 architectures that still have a custom
> mmu_gather:
> 
>   - sh
>   - arm
>   - ia64
>   - s390
> 
> sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> arch/ia64' leaving us with s390.

Well I don't see a big problem in having an arch_mmu_gather field
or small bits. powerpc would actually like that rather than trying
to add things it wants into generic code (and it wants more than
just a few flags bits, ideally).

> After that everyone uses the common code and we can clean up.
> 
> > For powerpc, tlb_start_vma is not the right API to use for this because
> > it wants to deal with different page sizes within a vma.  
> 
> Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
> ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
> do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
> still correct for huge pages, inefficient maybe, but so is flushing
> every other page because 'sparse' transparant-huge-pages.

It could do that. It requires a tlbie that matches the page size,
so it means 3 sizes. I think possibly even that would be better
than current code, but we could do better if we had a few specific
fields in there.

Thanks,
Nick
Masami Hiramatsu (Google) Aug. 27, 2018, 8:05 a.m. UTC | #51
On Sun, 26 Aug 2018 20:26:09 -0700
Nadav Amit <nadav.amit@gmail.com> wrote:

> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> 
> > On Sun, 26 Aug 2018 11:09:58 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> >> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
> >>> I just re-read text_poke().  It's, um, horrible.  Not only is the
> >>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> >>> It's totally the wrong API -- poking one instruction at a time
> >>> basically can't be efficient on x86.  The API should either poke lots
> >>> of instructions at once or should be text_poke_begin(); ...;
> >>> text_poke_end();.
> >> 
> >> I don't think anybody ever cared about performance here. Only
> >> correctness. That whole text_poke_bp() thing is entirely tricky.
> > 
> > Agreed. Self modification is a special event.
> > 
> >> FWIW, before text_poke_bp(), text_poke() would only be used from
> >> stop_machine, so all the other CPUs would be stuck busy-waiting with
> >> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> >> text_mutex should be serializing all that.
> > 
> > I'm still not sure that speculative page-table walk can be done
> > over the mutex. Also, if the fixmap area is for aliasing
> > pages (which always mapped to memory), what kind of
> > security issue can happen?
> 
> The PTE is accessible from other cores, so just as we assume for L1TF that
> the every addressable memory might be cached in L1, we should assume and
> PTE might be cached in the TLB when it is present.

Ok, so other cores can accidentally cache the PTE in the TLB (and there is
no way to shoot it down explicitly?)

> Although the mapping is for an alias, there are a couple of issues here.
> First, this alias mapping is writable, so it might an attacker to change the
> kernel code (following another initial attack).

Combined with some buffer overflow, correct? If the attacker can already
write kernel data directly, he is already in kernel mode.

> Second, the alias mapping is
> never explicitly flushed. We may assume that once the original mapping is
> removed/changed, a full TLB flush would take place, but there is no
> guarantee it actually takes place.

Hmm, does this mean a full TLB flush will not flush the alias mapping?
(or, the full TLB flush just doesn't work?)

> > Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
> > changing CR3 sounds good to me. I think we don't even need per-cpu,
> > it can call a thread/function on a dedicated core (like the first
> > boot processor) and wait :) This may prevent leakage of pte change
> > to other cores.
> 
> I implemented per-cpu fixmap, but I think that it makes more sense to take
> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
> requires to pre-populate various levels in the page-table hierarchy, or
> conditionally synchronize whenever module memory is allocated, since they
> can share the same PGD, PUD & PMD. While usually the synchronization is not
> needed, the possibility that synchronization is needed complicates locking.
> 

Could you point out which PeterZ approach you meant? I guess it would be to
make a clone of the PGD and use it for local page mapping (as a new mm).
If so, yes, it sounds perfectly fine to me.

> Anyhow, having fixed addresses for the fixmap can be used to circumvent
> KASLR.

I think text_poke doesn't mind using a random address :)

> I don’t think a dedicated core is needed. Anyhow there is a lock
> (text_mutex), so use_mm() can be used after acquiring the mutex.

Hmm, use_mm() says:

/*
 * use_mm
 *      Makes the calling kernel thread take on the specified
 *      mm context.
 *      (Note: this routine is intended to be called only
 *      from a kernel thread context)
 */

So maybe we need a dedicated kernel thread for safety?

Thank you,
Benjamin Herrenschmidt Aug. 27, 2018, 8:09 a.m. UTC | #52
On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:
> > Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
> > ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
> > do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
> > still correct for huge pages, inefficient maybe, but so is flushing
> > every other page because 'sparse' transparant-huge-pages.
> 
> It could do that. It requires a tlbie that matches the page size,
> so it means 3 sizes. I think possibly even that would be better
> than current code, but we could do better if we had a few specific
> fields in there.

More tlbies? With the cost of the broadcasts on the fabric? I don't
think so.. or I'm not understanding your point...

Sadly our architecture requires a precise match between the page size
specified in the tlbie instruction and the entry in the TLB or it won't
be flushed.

Ben.
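
In other words (a hedged sketch, not from any posted patch): because the tlbie
must name the page size of the entry it removes, a flush over a range that may
contain mixed sizes has to be issued once per possible size.
flush_range_psize() is a made-up stand-in for a size-aware flush primitive.

static void flush_mixed_range(struct mm_struct *mm,
			      unsigned long start, unsigned long end)
{
	/* One pass per page size the range may contain; entries whose
	 * size does not match a given pass are unaffected by it. */
	flush_range_psize(mm, start, end, PAGE_SHIFT);	/* base pages */
	flush_range_psize(mm, start, end, PMD_SHIFT);	/* 2M huge pages */
	flush_range_psize(mm, start, end, PUD_SHIFT);	/* 1G huge pages */
}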
Peter Zijlstra Aug. 27, 2018, 8:13 a.m. UTC | #53
On Mon, Aug 27, 2018 at 12:03:05PM +0900, Masami Hiramatsu wrote:
> On Sun, 26 Aug 2018 11:09:58 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:

> > FWIW, before text_poke_bp(), text_poke() would only be used from
> > stop_machine, so all the other CPUs would be stuck busy-waiting with
> > IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> > text_mutex should be serializing all that.
> 
> I'm still not sure that speculative page-table walk can be done
> over the mutex. Also, if the fixmap area is for aliasing
> pages (which always mapped to memory), what kind of
> security issue can happen?

So suppose CPU-A is doing the text_poke (let's say through text_poke_bp,
such that other CPUs get to continue with whatever they're doing).

While at that point, CPU-B gets an interrupt, and the CPU's
branch-trace-buffer for the IRET points to / near our fixmap. Then the
CPU could do a speculative TLB fill based on the BTB value, either
directly or indirectly (through speculatively driven fault-ahead) of
whatever is in the fixmap at the time.

Then CPU-A completes the text_poke and only does a local TLB invalidate
on CPU-A, leaving CPU-B with an active translation.

*FAIL*
Peter Zijlstra Aug. 27, 2018, 8:20 a.m. UTC | #54
On Mon, Aug 27, 2018 at 06:09:50PM +1000, Benjamin Herrenschmidt wrote:

> Sadly our architecture requires a precise match between the page size
> specified in the tlbie instruction and the entry in the TLB or it won't
> be flushed.

Argh.. OK I see. That is rather unfortunate and does seem to require
something along the lines of tlb_remove_check_page_size_change().
Nicholas Piggin Aug. 27, 2018, 8:54 a.m. UTC | #55
On Mon, 27 Aug 2018 10:20:45 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Aug 27, 2018 at 06:09:50PM +1000, Benjamin Herrenschmidt wrote:
> 
> > Sadly our architecture requires a precise match between the page size
> > specified in the tlbie instruction and the entry in the TLB or it won't
> > be flushed.  
> 
> Argh.. OK I see. That is rather unfortunate and does seem to require
> something along the lines of tlb_remove_check_page_size_change().

Or we can do better with some more of our own data in mmu_gather, but
with things that probably few or no other architectures want. I've
held off trying to put any crap in generic code because there's
other lower-hanging fruit still, but I'd really rather just give
archs the ability to put their own data in there. I don't really
see a downside to it (divergence, of course, but the existing
proliferation of code is much harder to follow than some data that
would be maintained and used purely by the arch, and it beats having
to implement your own mmu_gather entirely).

Thanks,
Nick
Christoph Hellwig Aug. 27, 2018, 8:57 a.m. UTC | #56
On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> arch/ia64' leaving us with s390.

Is removing ia64 a serious plan?  It is the cause of a fair share of
oddities in dma land, and I have not had much luck getting maintainer
replies lately, but I didn't know of a plan to get rid of it.

What is the state of people still using ia64 mainline kernels vs just
old distros in the still existing machines?
Nicholas Piggin Aug. 27, 2018, 9:02 a.m. UTC | #57
On Mon, 27 Aug 2018 18:09:50 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:
> > > Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
> > > ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
> > > do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
> > > still correct for huge pages, inefficient maybe, but so is flushing
> > > every other page because 'sparse' transparant-huge-pages.  
> > 
> > It could do that. It requires a tlbie that matches the page size,
> > so it means 3 sizes. I think possibly even that would be better
> > than current code, but we could do better if we had a few specific
> > fields in there.  
> 
> More tlbies ? With the cost of the broadasts on the fabric ? I don't
> think so.. or I'm not understanding your point...

More tlbies are no good, but there will be some places where it works
out much better (and with fewer tlbies). The worst possible case for the
current code is a big unmap with lots of scattered page sizes. We _should_
get that done with just a single PID flush at the end, but what we will
get today is a bunch of PID and VA flushes.

I don't propose doing that though, I'd rather be explicit about
tracking the start and end range of each page size. Still not "optimal",
but neither is the existing single range for sparse mappings... anyway it
will need to be profiled, but my point is that we don't really fit exactly
what x86/arm want.

Thanks,
Nick
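
A minimal sketch of the per-size range tracking described above (field and
helper names are illustrative, not from a posted patch): the gather keeps one
start/end pair per page size, so tlb_flush() can issue size-matched
invalidates only over the ranges that actually saw that size.

struct psize_range {
	unsigned long start;	/* start > end means "empty", as in __tlb_reset_range() */
	unsigned long end;
};

/* Could be embedded in (or hung off) the arch's mmu_gather. */
struct psize_ranges {
	struct psize_range r_4k;
	struct psize_range r_2m;
	struct psize_range r_1g;
};

static inline void psize_range_extend(struct psize_range *r,
				      unsigned long start, unsigned long end)
{
	r->start = min(r->start, start);
	r->end = max(r->end, end);
}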
Masami Hiramatsu (Google) Aug. 27, 2018, 9:39 a.m. UTC | #58
On Mon, 27 Aug 2018 10:13:29 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Aug 27, 2018 at 12:03:05PM +0900, Masami Hiramatsu wrote:
> > On Sun, 26 Aug 2018 11:09:58 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > FWIW, before text_poke_bp(), text_poke() would only be used from
> > > stop_machine, so all the other CPUs would be stuck busy-waiting with
> > > IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> > > text_mutex should be serializing all that.
> > 
> > I'm still not sure that speculative page-table walk can be done
> > over the mutex. Also, if the fixmap area is for aliasing
> > pages (which always mapped to memory), what kind of
> > security issue can happen?
> 
> So suppose CPU-A is doing the text_poke (let's say through text_poke_bp,
> such that other CPUs get to continue with whatever they're doing).
> 
> While at that point, CPU-B gets an interrupt, and the CPU's
> branch-trace-buffer for the IRET points to / near our fixmap. Then the
> CPU could do a speculative TLB fill based on the BTB value, either
> directly or indirectly (through speculative driven fault-ahead) of
> whatever is in te fixmap at the time.

Hmm, but how "near" is near enough? Since text_poke just maps a
non-executable alias page in the fixmap, it is hard to see how the IRET
could point there (unless an attacker changes the IRET address).

I see that Intel CPUs sometimes speculatively read ahead in the page tables,
but in that case, I guess we just need to keep the fixmap area away from
the text area. (Of course, it is hard to estimate how far is enough :( )

Anyway, I agree with introducing a new page-table (and kthread?) for the fixmap.

> Then CPU-A completes the text_poke and only does a local TLB invalidate
> on CPU-A, leaving CPU-B with an active translation.
> 
> *FAIL*

Ah, I got it. So on CPU-B, one can write to the fixmap'd pages unless
CPU-B shoots down the full TLB...

Thank you,
Jann Horn Aug. 27, 2018, 9:55 a.m. UTC | #59
On Mon, Aug 27, 2018 at 10:13 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Aug 27, 2018 at 12:03:05PM +0900, Masami Hiramatsu wrote:
> > On Sun, 26 Aug 2018 11:09:58 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > FWIW, before text_poke_bp(), text_poke() would only be used from
> > > stop_machine, so all the other CPUs would be stuck busy-waiting with
> > > IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> > > text_mutex should be serializing all that.
> >
> > I'm still not sure that speculative page-table walk can be done
> > over the mutex. Also, if the fixmap area is for aliasing
> > pages (which always mapped to memory), what kind of
> > security issue can happen?
>
> So suppose CPU-A is doing the text_poke (let's say through text_poke_bp,
> such that other CPUs get to continue with whatever they're doing).
>
> While at that point, CPU-B gets an interrupt, and the CPU's
> branch-trace-buffer for the IRET points to / near our fixmap. Then the
> CPU could do a speculative TLB fill based on the BTB value, either
> directly or indirectly (through speculative driven fault-ahead) of
> whatever is in te fixmap at the time.

Worse: The way academics have been defeating KASLR for a while is
based on TLB fills for kernel addresses, triggered from userspace.
Quoting https://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf :

| Additionally, even if a permission error occurs, this still allows to
| launch address translations and, hence, generate valid TLB entries
| by accessing privileged kernel space memory from user mode.

This was actually part of the original motivation for KAISER/KPTI.
Quoting https://gruss.cc/files/kaiser.pdf :

| Modern operating system kernels employ address space layout
| randomization (ASLR) to prevent control-flow hijacking attacks and
| code-injection attacks. While kernel security relies fundamentally
on preventing
| access to address information, recent attacks have shown that the
| hardware directly leaks this information.

I believe that PTI probably prevents this way of directly triggering
TLB fills for now (under the assumption that hyperthreads with equal
CR3 don't share TLB entries), but I would still assume that an
attacker can probably trigger TLB fills for arbitrary addresses
anytime. And at some point in the future, I believe people would
probably like to be able to disable PTI again?

> Then CPU-A completes the text_poke and only does a local TLB invalidate
> on CPU-A, leaving CPU-B with an active translation.
>
> *FAIL*
Peter Zijlstra Aug. 27, 2018, 11 a.m. UTC | #60
On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> And there's only like 4 architectures that still have a custom
> mmu_gather:
> 
>   - sh
>   - arm
>   - ia64
>   - s390
> 
> sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> arch/ia64' leaving us with s390.

The one obvious thing SH and ARM want is a sensible default for
tlb_start_vma(). (also: https://lkml.org/lkml/2004/1/15/6 )

The below makes tlb_start_vma() default to flush_cache_range(), which
should be right and sufficient. The only exceptions that I found were
(oddly):

  - m68k-mmu
  - sparc64
  - unicore

Those architectures appear to have a non-NOP flush_cache_range(), but
their current tlb_start_vma() does not call it.

Furthermore, I think tlb_flush() is broken on arc and parisc; in
particular they don't appear to have any TLB invalidate for the
shift_arg_pages() case, where we do not call tlb_*_vma() and fullmm=0.

Possibly shift_arg_pages() should be fixed instead.

Some archs (nds32,sparc32) avoid this by having an unconditional
flush_tlb_mm() in tlb_flush(), which seems somewhat suboptimal if you
have flush_tlb_range().  TLB_FLUSH_VMA() might be an option, however
hideous it is.

---

diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h
index a9db5f62aaf3..7af2b373ebe7 100644
--- a/arch/arc/include/asm/tlb.h
+++ b/arch/arc/include/asm/tlb.h
@@ -23,15 +23,6 @@ do {						\
  *
  * Note, read http://lkml.org/lkml/2004/1/15/6
  */
-#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING
-#define tlb_start_vma(tlb, vma)
-#else
-#define tlb_start_vma(tlb, vma)						\
-do {									\
-	if (!tlb->fullmm)						\
-		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
-} while(0)
-#endif
 
 #define tlb_end_vma(tlb, vma)						\
 do {									\
diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h
index b6823b9e94da..9d04b4649692 100644
--- a/arch/mips/include/asm/tlb.h
+++ b/arch/mips/include/asm/tlb.h
@@ -5,16 +5,6 @@
 #include <asm/cpu-features.h>
 #include <asm/mipsregs.h>
 
-/*
- * MIPS doesn't need any special per-pte or per-vma handling, except
- * we need to flush cache for area to be unmapped.
- */
-#define tlb_start_vma(tlb, vma)					\
-	do {							\
-		if (!tlb->fullmm)				\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	}  while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
 /*
diff --git a/arch/nds32/include/asm/tlb.h b/arch/nds32/include/asm/tlb.h
index b35ae5eae3ab..0bf7c9482381 100644
--- a/arch/nds32/include/asm/tlb.h
+++ b/arch/nds32/include/asm/tlb.h
@@ -4,12 +4,6 @@
 #ifndef __ASMNDS32_TLB_H
 #define __ASMNDS32_TLB_H
 
-#define tlb_start_vma(tlb,vma)						\
-	do {								\
-		if (!tlb->fullmm)					\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	} while (0)
-
 #define tlb_end_vma(tlb,vma)				\
 	do { 						\
 		if(!tlb->fullmm)			\
diff --git a/arch/nios2/include/asm/tlb.h b/arch/nios2/include/asm/tlb.h
index d3bc648e08b5..9b518c6d0f62 100644
--- a/arch/nios2/include/asm/tlb.h
+++ b/arch/nios2/include/asm/tlb.h
@@ -15,16 +15,6 @@
 
 extern void set_mmu_pid(unsigned long pid);
 
-/*
- * NiosII doesn't need any special per-pte or per-vma handling, except
- * we need to flush cache for the area to be unmapped.
- */
-#define tlb_start_vma(tlb, vma)					\
-	do {							\
-		if (!tlb->fullmm)				\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	}  while (0)
-
 #define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address)	do { } while (0)
 
diff --git a/arch/parisc/include/asm/tlb.h b/arch/parisc/include/asm/tlb.h
index 0c881e74d8a6..b1984f9cd3af 100644
--- a/arch/parisc/include/asm/tlb.h
+++ b/arch/parisc/include/asm/tlb.h
@@ -7,11 +7,6 @@ do {	if ((tlb)->fullmm)		\
 		flush_tlb_mm((tlb)->mm);\
 } while (0)
 
-#define tlb_start_vma(tlb, vma) \
-do {	if (!(tlb)->fullmm)	\
-		flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-} while (0)
-
 #define tlb_end_vma(tlb, vma)	\
 do {	if (!(tlb)->fullmm)	\
 		flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
diff --git a/arch/sparc/include/asm/tlb_32.h b/arch/sparc/include/asm/tlb_32.h
index 343cea19e573..68d817273de8 100644
--- a/arch/sparc/include/asm/tlb_32.h
+++ b/arch/sparc/include/asm/tlb_32.h
@@ -2,11 +2,6 @@
 #ifndef _SPARC_TLB_H
 #define _SPARC_TLB_H
 
-#define tlb_start_vma(tlb, vma) \
-do {								\
-	flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
-} while (0)
-
 #define tlb_end_vma(tlb, vma) \
 do {								\
 	flush_tlb_range(vma, vma->vm_start, vma->vm_end);	\
diff --git a/arch/xtensa/include/asm/tlb.h b/arch/xtensa/include/asm/tlb.h
index 0d766f9c1083..1a93e350382e 100644
--- a/arch/xtensa/include/asm/tlb.h
+++ b/arch/xtensa/include/asm/tlb.h
@@ -16,19 +16,10 @@
 
 #if (DCACHE_WAY_SIZE <= PAGE_SIZE)
 
-/* Note, read http://lkml.org/lkml/2004/1/15/6 */
-
-# define tlb_start_vma(tlb,vma)			do { } while (0)
 # define tlb_end_vma(tlb,vma)			do { } while (0)
 
 #else
 
-# define tlb_start_vma(tlb, vma)					      \
-	do {								      \
-		if (!tlb->fullmm)					      \
-			flush_cache_range(vma, vma->vm_start, vma->vm_end);   \
-	} while(0)
-
 # define tlb_end_vma(tlb, vma)						      \
 	do {								      \
 		if (!tlb->fullmm)					      \
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e811ef7b8350..1d037fd5bb7a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -181,19 +181,21 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
  * the vmas are adjusted to only cover the region to be torn down.
  */
 #ifndef tlb_start_vma
-#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_start_vma(tlb, vma)						\
+do {									\
+	if (!tlb->fullmm)						\
+		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
+} while (0)
 #endif
 
-#define __tlb_end_vma(tlb, vma)					\
-	do {							\
-		if (!tlb->fullmm && tlb->end) {			\
-			tlb_flush(tlb);				\
-			__tlb_reset_range(tlb);			\
-		}						\
-	} while (0)
-
 #ifndef tlb_end_vma
-#define tlb_end_vma	__tlb_end_vma
+#define tlb_end_vma(tlb, vma)						\
+	do {								\
+		if (!tlb->fullmm && tlb->end) {				\
+			tlb_flush(tlb);					\
+			__tlb_reset_range(tlb);				\
+		}							\
+	} while (0)
 #endif
 
 #ifndef __tlb_remove_tlb_entry
Peter Zijlstra Aug. 27, 2018, 11:28 a.m. UTC | #61
On Mon, Aug 27, 2018 at 01:57:08AM -0700, Christoph Hellwig wrote:
> On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> > sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> > arch/ia64' leaving us with s390.
> 
> Is removing ia64 a serious plan?

I 'joked' about it a while ago on IRC, and aegl reacted that it might
not be entirely unreasonable.

> It is the cause for a fair share of
> oddities in dma lang, and I did not have much luck getting maintainer
> replies lately, but I didn't know of a plan to get rid of it.
> 
> What is the state of people still using ia64 mainline kernels vs just
> old distros in the still existing machines?

Both arjan and aegl said that the vast majority of people still running
ia64 machines run old distros.
Jason Duerstock Aug. 27, 2018, 11:45 a.m. UTC | #62
I cannot speak to how widespread it has been adopted, but the linux
(kernel) package for version 4.17.17 has been successfully built and
installed for ia64 under Debian ports.  There is clearly more work to
do to get ia64 rehabilitated, but there are over 10,000 packages
currently successfully built for ia64 under Debian ports[1].

Jason

[1] https://buildd.debian.org/status/architecture.php?a=ia64&suite=sid
On Mon, Aug 27, 2018 at 4:57 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> > sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> > arch/ia64' leaving us with s390.
>
> Is removing ia64 a serious plan?  It is the cause for a fair share of
> oddities in dma lang, and I did not have much luck getting maintainer
> replies lately, but I didn't know of a plan to get rid of it.
>
> What is the state of people still using ia64 mainline kernels vs just
> old distros in the still existing machines?
Rik van Riel Aug. 27, 2018, 1:36 p.m. UTC | #63
On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:

> It could do that. It requires a tlbie that matches the page size,
> so it means 3 sizes. I think possibly even that would be better
> than current code, but we could do better if we had a few specific
> fields in there.

Would it cause a noticeable overhead to keep track
of which page sizes were removed, and to simply flush
the whole TLB in the (unlikely?) event that multiple
page sizes were removed in the same munmap?

Once the unmap is so large that multiple page sizes
were covered, you may already be looking at so many
individual flush operations that a full flush might
be faster.

Is there a point on PPC where simply flushing the
whole TLB, and having other things be reloaded later,
is faster than flushing every individual page mapping
that got unmapped?
Nicholas Piggin Aug. 27, 2018, 2:29 p.m. UTC | #64
On Mon, 27 Aug 2018 09:36:50 -0400
Rik van Riel <riel@surriel.com> wrote:

> On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:
> 
> > It could do that. It requires a tlbie that matches the page size,
> > so it means 3 sizes. I think possibly even that would be better
> > than current code, but we could do better if we had a few specific
> > fields in there.  
> 
> Would it cause a noticeable overhead to keep track
> of which page sizes were removed, and to simply flush
> the whole TLB in the (unlikely?) event that multiple
> page sizes were removed in the same munmap?
> 
> Once the unmap is so large that multiple page sizes
> were covered, you may already be looking at so many
> individual flush operations that a full flush might
> be faster.

It will take some profiling and measuring. Unmapping a small number of
huge pages plus a small number of surrounding small pages may not be
uncommon if THP is working well. That could become a lot more expensive.

> 
> Is there a point on PPC where simply flushing the
> whole TLB, and having other things be reloaded later,
> is faster than flushing every individual page mapping
> that got unmapped?

There is. For local TLB flushes that point is well over 100 individual
invalidates though. We're generally better off flushing all page sizes
for that case.

Thanks,
Nick
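
A hedged sketch of the trade-off quantified above (the constant and helper
names are illustrative, not from a posted patch): past some number of
individual invalidates it is cheaper to flush everything, all page sizes
included.

#define SINGLE_PAGE_FLUSH_CEILING	100	/* illustrative threshold */

static void flush_range_or_all(struct mm_struct *mm,
			       unsigned long start, unsigned long end,
			       unsigned int page_shift)
{
	unsigned long pages = (end - start) >> page_shift;

	if (pages > SINGLE_PAGE_FLUSH_CEILING)
		flush_all_sizes(mm);	/* hypothetical: flush the whole PID, all sizes */
	else
		flush_range_psize(mm, start, end, page_shift);	/* size-matched, per page */
}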
Nadav Amit Aug. 27, 2018, 5:34 p.m. UTC | #65
at 1:05 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Sun, 26 Aug 2018 20:26:09 -0700
> Nadav Amit <nadav.amit@gmail.com> wrote:
> 
>> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>> 
>>> On Sun, 26 Aug 2018 11:09:58 +0200
>>> Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>>> It's totally the wrong API -- poking one instruction at a time
>>>>> basically can't be efficient on x86.  The API should either poke lots
>>>>> of instructions at once or should be text_poke_begin(); ...;
>>>>> text_poke_end();.
>>>> 
>>>> I don't think anybody ever cared about performance here. Only
>>>> correctness. That whole text_poke_bp() thing is entirely tricky.
>>> 
>>> Agreed. Self modification is a special event.
>>> 
>>>> FWIW, before text_poke_bp(), text_poke() would only be used from
>>>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>>>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>>>> text_mutex should be serializing all that.
>>> 
>>> I'm still not sure that speculative page-table walk can be done
>>> over the mutex. Also, if the fixmap area is for aliasing
>>> pages (which always mapped to memory), what kind of
>>> security issue can happen?
>> 
>> The PTE is accessible from other cores, so just as we assume for L1TF that
>> the every addressable memory might be cached in L1, we should assume and
>> PTE might be cached in the TLB when it is present.
> 
> Ok, so other cores can accidentally cache the PTE in TLB, (and no way
> to shoot down explicitly?)

There is a way (although currently it does not). But it seems that the
consensus is that it is better to avoid it being mapped at all on remote
cores.

>> Although the mapping is for an alias, there are a couple of issues here.
>> First, this alias mapping is writable, so it might an attacker to change the
>> kernel code (following another initial attack).
> 
> Combined with some buffer overflow, correct? If the attacker already can
> write a kernel data directly, he is in the kernel mode.

Right.

> 
>> Second, the alias mapping is
>> never explicitly flushed. We may assume that once the original mapping is
>> removed/changed, a full TLB flush would take place, but there is no
>> guarantee it actually takes place.
> 
> Hmm, would this means a full TLB flush will not flush alias mapping?
> (or, the full TLB flush just doesn't work?)

It will flush the alias mapping, but currently there is no such explicit
flush.

>>> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
>>> changing CR3 sounds good to me. I think we don't even need per-cpu,
>>> it can call a thread/function on a dedicated core (like the first
>>> boot processor) and wait :) This may prevent leakage of pte change
>>> to other cores.
>> 
>> I implemented per-cpu fixmap, but I think that it makes more sense to take
>> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
>> requires to pre-populate various levels in the page-table hierarchy, or
>> conditionally synchronize whenever module memory is allocated, since they
>> can share the same PGD, PUD & PMD. While usually the synchronization is not
>> needed, the possibility that synchronization is needed complicates locking.
> 
> Could you point which PeterZ approach you said? I guess it will be
> make a clone of PGD and use it for local page mapping (as new mm).
> If so, yes it sounds perfectly fine to me.

The thread is too long. What I think is best is having a mapping in the PGD
level. I’ll try to give it a shot, and see what I get.

>> Anyhow, having fixed addresses for the fixmap can be used to circumvent
>> KASLR.
> 
> I think text_poke doesn't mind using random address :)
> 
>> I don’t think a dedicated core is needed. Anyhow there is a lock
>> (text_mutex), so use_mm() can be used after acquiring the mutex.
> 
> Hmm, use_mm() said;
> 
> /*
> * use_mm
> *      Makes the calling kernel thread take on the specified
> *      mm context.
> *      (Note: this routine is intended to be called only
> *      from a kernel thread context)
> */
> 
> So maybe we need a dedicated kernel thread for safeness?

Yes, it says so. But I am not sure it cannot be changed, at least for this
specific use-case. Switching kernel threads just for patching seems like
overkill to me.

Let me see if I can get something half-reasonable doing so...
Andy Lutomirski Aug. 27, 2018, 6:45 p.m. UTC | #66
On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 1:05 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
>> On Sun, 26 Aug 2018 20:26:09 -0700
>> Nadav Amit <nadav.amit@gmail.com> wrote:
>>
>>> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>
>>>> On Sun, 26 Aug 2018 11:09:58 +0200
>>>> Peter Zijlstra <peterz@infradead.org> wrote:
>>>>
>>>>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>>>> It's totally the wrong API -- poking one instruction at a time
>>>>>> basically can't be efficient on x86.  The API should either poke lots
>>>>>> of instructions at once or should be text_poke_begin(); ...;
>>>>>> text_poke_end();.
>>>>>
>>>>> I don't think anybody ever cared about performance here. Only
>>>>> correctness. That whole text_poke_bp() thing is entirely tricky.
>>>>
>>>> Agreed. Self modification is a special event.
>>>>
>>>>> FWIW, before text_poke_bp(), text_poke() would only be used from
>>>>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>>>>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>>>>> text_mutex should be serializing all that.
>>>>
>>>> I'm still not sure that speculative page-table walk can be done
>>>> over the mutex. Also, if the fixmap area is for aliasing
>>>> pages (which always mapped to memory), what kind of
>>>> security issue can happen?
>>>
>>> The PTE is accessible from other cores, so just as we assume for L1TF that
>>> the every addressable memory might be cached in L1, we should assume and
>>> PTE might be cached in the TLB when it is present.
>>
>> Ok, so other cores can accidentally cache the PTE in TLB, (and no way
>> to shoot down explicitly?)
>
> There is way (although current it does not). But it seems that the consensus
> is that it is better to avoid it being mapped at all in remote cores.
>
>>> Although the mapping is for an alias, there are a couple of issues here.
>>> First, this alias mapping is writable, so it might an attacker to change the
>>> kernel code (following another initial attack).
>>
>> Combined with some buffer overflow, correct? If the attacker already can
>> write a kernel data directly, he is in the kernel mode.
>
> Right.
>
>>
>>> Second, the alias mapping is
>>> never explicitly flushed. We may assume that once the original mapping is
>>> removed/changed, a full TLB flush would take place, but there is no
>>> guarantee it actually takes place.
>>
>> Hmm, would this means a full TLB flush will not flush alias mapping?
>> (or, the full TLB flush just doesn't work?)
>
> It will flush the alias mapping, but currently there is no such explicit
> flush.
>
>>>> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
>>>> changing CR3 sounds good to me. I think we don't even need per-cpu,
>>>> it can call a thread/function on a dedicated core (like the first
>>>> boot processor) and wait :) This may prevent leakage of pte change
>>>> to other cores.
>>>
>>> I implemented per-cpu fixmap, but I think that it makes more sense to take
>>> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
>>> requires to pre-populate various levels in the page-table hierarchy, or
>>> conditionally synchronize whenever module memory is allocated, since they
>>> can share the same PGD, PUD & PMD. While usually the synchronization is not
>>> needed, the possibility that synchronization is needed complicates locking.
>>
>> Could you point which PeterZ approach you said? I guess it will be
>> make a clone of PGD and use it for local page mapping (as new mm).
>> If so, yes it sounds perfectly fine to me.
>
> The thread is too long. What I think is best is having a mapping in the PGD
> level. I’ll try to give it a shot, and see what I get.
>
>>> Anyhow, having fixed addresses for the fixmap can be used to circumvent
>>> KASLR.
>>
>> I think text_poke doesn't mind using random address :)
>>
>>> I don’t think a dedicated core is needed. Anyhow there is a lock
>>> (text_mutex), so use_mm() can be used after acquiring the mutex.
>>
>> Hmm, use_mm() said;
>>
>> /*
>> * use_mm
>> *      Makes the calling kernel thread take on the specified
>> *      mm context.
>> *      (Note: this routine is intended to be called only
>> *      from a kernel thread context)
>> */
>>
>> So maybe we need a dedicated kernel thread for safeness?
>
> Yes, it says so. But I am not sure it cannot be changed, at least for this
> specific use-case. Switching kernel threads just for patching seems to me as
> an overkill.
>
> Let me see if I can get something half-reasonable doing so...
>

I don't understand at all how a kernel thread helps.  The useful bit
is to have a dedicated mm, which would involve setting up an mm_struct
and mapping the kernel and module text, EFI-style, in the user portion
of the mm.  But, to do the text_poke(), we'd just use the mm *without
calling use_mm*.

In other words, the following sequence should be (almost) just fine:

typedef struct {
  struct mm_struct *prev;
} temporary_mm_state_t;

temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
    temporary_mm_state_t state;

    lockdep_assert_irqs_disabled();
    state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
    switch_mm_irqs_off(NULL, mm, current);
    return state;
}

void unuse_temporary_mm(temporary_mm_state_t prev)
{
    lockdep_assert_irqs_disabled();
    switch_mm_irqs_off(NULL, prev.prev, current);
}

The only thing wrong with this that I can see is that it interacts
poorly with perf.  But perf is *already* busted in this regard.  The
following (whitespace damaged, sorry) should fix it:

commit b62bff5a8406d252de752cfe75068d0b73b9cdf0
Author: Andy Lutomirski <luto@kernel.org>
Date:   Mon Aug 27 11:41:55 2018 -0700

    x86/nmi: Fix some races in NMI uaccess

    In NMI context, we might be in the middle of context switching or in
    the middle of switch_mm_irqs_off().  In either case, CR3 might not
    match current->mm, which could cause copy_from_user_nmi() and
    friends to read the wrong memory.

    Fix it by adding a new nmi_uaccess_okay() helper and checking it in
    copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

    Signed-off-by: Andy Lutomirski <luto@kernel.org>

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs

     perf_callchain_store(entry, regs->ip);

-    if (!current->mm)
+    if (!nmi_uaccess_okay())
         return;

     if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 89a73bc31622..b23b2625793b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -230,6 +230,22 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);

+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+    struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+    struct mm_struct *current_mm = current->mm;
+
+    return current_mm && loaded_mm == current_mm &&
+        loaded_mm->pgd == __va(read_cr3_pa());
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..c5f758430be2 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -19,6 +19,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
     if (__range_not_ok(from, n, TASK_SIZE))
         return n;

+    if (!nmi_uaccess_okay())
+        return n;
+
     /*
      * Even though this function is typically called from NMI/IRQ context
      * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 457b281b9339..f4b41d5a93dd 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -345,6 +345,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
          */
         trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
     } else {
+        /* Let NMI code know that CR3 may not match expectations. */
+        this_cpu_write(cpu_tlbstate.loaded_mm, NULL);
+
         /* The new ASID is already up to date. */
         load_new_mm_cr3(next->pgd, new_asid, false);

What do you all think?
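
For concreteness, a usage sketch of the use_temporary_mm()/unuse_temporary_mm()
helpers above (poking_mm and poke_vaddr are assumed to be a pre-built patching
mm and the address at which the target page is aliased in it; neither name
comes from this mail):

static void poke_via_temporary_mm(struct mm_struct *poking_mm, void *poke_vaddr,
				  const void *opcode, size_t len)
{
	temporary_mm_state_t prev;
	unsigned long flags;

	local_irq_save(flags);
	prev = use_temporary_mm(poking_mm);	/* CR3 now points at poking_mm */

	memcpy(poke_vaddr, opcode, len);	/* write through the private alias */

	local_flush_tlb();			/* the alias was only ever visible locally */
	sync_core();
	unuse_temporary_mm(prev);		/* restore the interrupted mm */
	local_irq_restore(flags);
}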
Nadav Amit Aug. 27, 2018, 6:54 p.m. UTC | #67
at 11:45 AM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> at 1:05 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>> 
>>> On Sun, 26 Aug 2018 20:26:09 -0700
>>> Nadav Amit <nadav.amit@gmail.com> wrote:
>>> 
>>>> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>> 
>>>>> On Sun, 26 Aug 2018 11:09:58 +0200
>>>>> Peter Zijlstra <peterz@infradead.org> wrote:
>>>>> 
>>>>>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>>>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>>>>> It's totally the wrong API -- poking one instruction at a time
>>>>>>> basically can't be efficient on x86.  The API should either poke lots
>>>>>>> of instructions at once or should be text_poke_begin(); ...;
>>>>>>> text_poke_end();.
>>>>>> 
>>>>>> I don't think anybody ever cared about performance here. Only
>>>>>> correctness. That whole text_poke_bp() thing is entirely tricky.
>>>>> 
>>>>> Agreed. Self modification is a special event.
>>>>> 
>>>>>> FWIW, before text_poke_bp(), text_poke() would only be used from
>>>>>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>>>>>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>>>>>> text_mutex should be serializing all that.
>>>>> 
>>>>> I'm still not sure that speculative page-table walk can be done
>>>>> over the mutex. Also, if the fixmap area is for aliasing
>>>>> pages (which always mapped to memory), what kind of
>>>>> security issue can happen?
>>>> 
>>>> The PTE is accessible from other cores, so just as we assume for L1TF that
>>>> the every addressable memory might be cached in L1, we should assume and
>>>> PTE might be cached in the TLB when it is present.
>>> 
>>> Ok, so other cores can accidentally cache the PTE in TLB, (and no way
>>> to shoot down explicitly?)
>> 
>> There is way (although current it does not). But it seems that the consensus
>> is that it is better to avoid it being mapped at all in remote cores.
>> 
>>>> Although the mapping is for an alias, there are a couple of issues here.
>>>> First, this alias mapping is writable, so it might an attacker to change the
>>>> kernel code (following another initial attack).
>>> 
>>> Combined with some buffer overflow, correct? If the attacker already can
>>> write a kernel data directly, he is in the kernel mode.
>> 
>> Right.
>> 
>>>> Second, the alias mapping is
>>>> never explicitly flushed. We may assume that once the original mapping is
>>>> removed/changed, a full TLB flush would take place, but there is no
>>>> guarantee it actually takes place.
>>> 
>>> Hmm, would this means a full TLB flush will not flush alias mapping?
>>> (or, the full TLB flush just doesn't work?)
>> 
>> It will flush the alias mapping, but currently there is no such explicit
>> flush.
>> 
>>>>> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
>>>>> changing CR3 sounds good to me. I think we don't even need per-cpu,
>>>>> it can call a thread/function on a dedicated core (like the first
>>>>> boot processor) and wait :) This may prevent leakage of pte change
>>>>> to other cores.
>>>> 
>>>> I implemented per-cpu fixmap, but I think that it makes more sense to take
>>>> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
>>>> requires to pre-populate various levels in the page-table hierarchy, or
>>>> conditionally synchronize whenever module memory is allocated, since they
>>>> can share the same PGD, PUD & PMD. While usually the synchronization is not
>>>> needed, the possibility that synchronization is needed complicates locking.
>>> 
>>> Could you point which PeterZ approach you said? I guess it will be
>>> make a clone of PGD and use it for local page mapping (as new mm).
>>> If so, yes it sounds perfectly fine to me.
>> 
>> The thread is too long. What I think is best is having a mapping in the PGD
>> level. I’ll try to give it a shot, and see what I get.
>> 
>>>> Anyhow, having fixed addresses for the fixmap can be used to circumvent
>>>> KASLR.
>>> 
>>> I think text_poke doesn't mind using random address :)
>>> 
>>>> I don’t think a dedicated core is needed. Anyhow there is a lock
>>>> (text_mutex), so use_mm() can be used after acquiring the mutex.
>>> 
>>> Hmm, use_mm() said;
>>> 
>>> /*
>>> * use_mm
>>> *      Makes the calling kernel thread take on the specified
>>> *      mm context.
>>> *      (Note: this routine is intended to be called only
>>> *      from a kernel thread context)
>>> */
>>> 
>>> So maybe we need a dedicated kernel thread for safeness?
>> 
>> Yes, it says so. But I am not sure it cannot be changed, at least for this
>> specific use-case. Switching kernel threads just for patching seems to me as
>> an overkill.
>> 
>> Let me see if I can get something half-reasonable doing so...
> 
> I don't understand at all how a kernel thread helps.  The useful bit
> is to have a dedicated mm, which would involve setting up an mm_struct
> and mapping the kernel and module text, EFI-style, in the user portion
> of the mm.  But, to do the text_poke(), we'd just use the mm *without
> calling use_mm*.
> 
> In other words, the following sequence should be (almost) just fine:
> 
> typedef struct {
>  struct mm_struct *prev;
> } temporary_mm_state_t;
> 
> temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
> {
>    temporary_mm_state_t state;
> 
>    lockdep_assert_irqs_disabled();
>    state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>    switch_mm_irqs_off(NULL, mm, current);
> }
> 
> void unuse_temporary_mm(temporary_mm_state_t prev)
> {
>    lockdep_assert_irqs_disabled();
>    switch_mm_irqs_off(NULL, prev.prev, current);
> }
> 
> The only thing wrong with this that I can see is that it interacts
> poorly with perf.  But perf is *already* busted in this regard.  The
> following (whitespace damaged, sorry) should fix it:
> 
> commit b62bff5a8406d252de752cfe75068d0b73b9cdf0
> Author: Andy Lutomirski <luto@kernel.org>
> Date:   Mon Aug 27 11:41:55 2018 -0700
> 
>    x86/nmi: Fix some races in NMI uaccess
> 
>    In NMI context, we might be in the middle of context switching or in
>    the middle of switch_mm_irqs_off().  In either case, CR3 might not
>    match current->mm, which could cause copy_from_user_nmi() and
>    friends to read the wrong memory.
> 
>    Fix it by adding a new nmi_uaccess_okay() helper and checking it in
>    copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> 
>    Signed-off-by: Andy Lutomirski <luto@kernel.org>
> 
> [...]
> 
> What do you all think?

I agree in general. But I think that current->mm would need to be loaded, as
otherwise I am afraid it would break switch_mm_irqs_off().
Andy Lutomirski Aug. 27, 2018, 6:58 p.m. UTC | #68
On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> What do you all think?
>
> I agree in general. But I think that current->mm would need to be loaded, as
> otherwise I am afraid it would break switch_mm_irqs_off().
>

What breaks?
Nadav Amit Aug. 27, 2018, 7:10 p.m. UTC | #69
at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> What do you all think?
>> 
>> I agree in general. But I think that current->mm would need to be loaded, as
>> otherwise I am afraid it would break switch_mm_irqs_off().
> 
> What breaks?

Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
matter.
Nadav Amit Aug. 27, 2018, 7:43 p.m. UTC | #70
at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:

> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
> 
>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>> What do you all think?
>>> 
>>> I agree in general. But I think that current->mm would need to be loaded, as
>>> otherwise I am afraid it would break switch_mm_irqs_off().
>> 
>> What breaks?
> 
> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
> matter.

So here is what I got. It certainly needs some cleanup, but it boots.

Let me know how crappy you find it...


diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bbc796eb0a3b..336779650a41 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
 	return cr3;
 }
 
+typedef struct {
+	struct mm_struct *prev;
+} temporary_mm_state_t;
+
+static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
+{
+	temporary_mm_state_t state;
+
+	lockdep_assert_irqs_disabled();
+	state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	switch_mm_irqs_off(NULL, mm, current);
+	return state;
+}
+
+static inline void unuse_temporary_mm(temporary_mm_state_t prev)
+{
+	lockdep_assert_irqs_disabled();
+	switch_mm_irqs_off(NULL, prev.prev, current);
+}
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5715647fc4fe..ef62af9a0ef7 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
 	/* Default trampoline pgd value */
 	trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
 }
+
+void __init patching_mm_init(void);
+#define patching_mm_init patching_mm_init
+
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
 # else
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 054765ab2da2..9f44262abde0 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
 #define LDT_PGD_ENTRY		(pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
 #define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 
+#define TEXT_POKE_PGD_ENTRY	-5UL
+#define TEXT_POKE_ADDR		(TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
+
 #define __VMALLOC_BASE_L4	0xffffc90000000000UL
 #define __VMALLOC_BASE_L5 	0xffa0000000000000UL
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 99fff853c944..840c72ec8c4f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
+struct mm_struct;
+void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
+
 #ifdef CONFIG_X86_32
 extern void native_pagetable_init(void);
 #else
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 2ecd34e2d46c..cb364ea5b19d 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
 
+extern struct mm_struct *patching_mm;
+
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a481763a3776..fd8a950b0d62 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -11,6 +11,7 @@
 #include <linux/stop_machine.h>
 #include <linux/slab.h>
 #include <linux/kdebug.h>
+#include <linux/mmu_context.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
 		WARN_ON(!PageReserved(pages[0]));
 		pages[1] = virt_to_page(addr + PAGE_SIZE);
 	}
-	BUG_ON(!pages[0]);
+
 	local_irq_save(flags);
+	BUG_ON(!pages[0]);
+
+	/*
+	 * During initial boot, it is hard to initialize patching_mm due to
+	 * dependencies in boot order.
+	 */
+	if (patching_mm) {
+		pte_t pte;
+		temporary_mm_state_t prev;
+
+		prev = use_temporary_mm(patching_mm);
+		pte = mk_pte(pages[0], PAGE_KERNEL);
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
+		pte = mk_pte(pages[1], PAGE_KERNEL);
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
+
+		memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
+		       opcode, len);
+
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
+		local_flush_tlb();
+		sync_core();
+
+		unuse_temporary_mm(prev);
+		goto out;
+	}
+
 	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
 	if (pages[1])
 		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
@@ -715,6 +744,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
 	sync_core();
 	/* Could also do a CLFLUSH here to speed up CPU recovery; but
 	   that causes hangs on some VIA CPUs. */
+out:
 	for (i = 0; i < len; i++)
 		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
 	local_irq_restore(flags);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a688617c727e..bd0d629e3831 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,7 @@
 #include <asm/init.h>
 #include <asm/uv/uv.h>
 #include <asm/setup.h>
+#include <asm/text-patching.h>
 
 #include "mm_internal.h"
 
@@ -285,14 +286,14 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__set_pte_vaddr(pud, vaddr, new_pte);
 }
 
-void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
 	p4d_t *p4d_page;
 
 	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
 
-	pgd = pgd_offset_k(vaddr);
+	pgd = pgd_offset(mm, vaddr);
 	if (pgd_none(*pgd)) {
 		printk(KERN_ERR
 			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
@@ -303,6 +304,11 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
 }
 
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+	set_mm_pte_vaddr(&init_mm, vaddr, pteval);
+}
+
 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	pgd_t *pgd;
@@ -1399,6 +1405,17 @@ unsigned long memory_block_size_bytes(void)
 	return memory_block_size_probed;
 }
 
+struct mm_struct *patching_mm;
+EXPORT_SYMBOL(patching_mm);
+
+void __init patching_mm_init(void)
+{
+	populate_extra_pte(TEXT_POKE_ADDR);
+	populate_extra_pte(TEXT_POKE_ADDR + PAGE_SIZE);
+
+	patching_mm = copy_init_mm();
+}
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f59639afaa39..c95d2240c23a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1083,6 +1083,10 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 static inline void init_espfix_bsp(void) { }
 #endif
 
+#ifndef patching_mm_init
+static inline void patching_mm_init(void) { }
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #ifndef io_remap_pfn_range
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 108ede99e533..ac0a675678f5 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -74,6 +74,7 @@ extern void exit_itimers(struct signal_struct *);
 extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
+struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
 
diff --git a/init/main.c b/init/main.c
index 3b4ada11ed52..9a313efc80a6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -724,6 +724,7 @@ asmlinkage __visible void __init start_kernel(void)
 	taskstats_init_early();
 	delayacct_init();
 
+	patching_mm_init();
 	check_bugs();
 
 	acpi_subsystem_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 1b27babc4c78..325d1a5ca903 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1249,9 +1249,9 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
  * Allocate a new mm structure and copy contents from the
  * mm structure of the passed in task structure.
  */
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk, struct mm_struct *oldmm)
 {
-	struct mm_struct *mm, *oldmm = current->mm;
+	struct mm_struct *mm;
 	int err;
 
 	mm = allocate_mm();
@@ -1317,7 +1317,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	}
 
 	retval = -ENOMEM;
-	mm = dup_mm(tsk);
+	mm = dup_mm(tsk, current->mm);
 	if (!mm)
 		goto fail_nomem;
 
@@ -2082,6 +2082,11 @@ struct task_struct *fork_idle(int cpu)
 	return task;
 }
 
+struct mm_struct *copy_init_mm(void)
+{
+	return dup_mm(NULL, &init_mm);
+}
+
 /*
  *  Ok, this is the main fork-routine.
  *
Andy Lutomirski Aug. 27, 2018, 7:58 p.m. UTC | #71
On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>
>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>
>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> What do you all think?
>>>>
>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>
>>> What breaks?
>>
>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>> matter.
>
> So here is what I got. It certainly needs some cleanup, but it boots.
>
> Let me know how crappy you find it...
>
>
> [ earlier hunks of the quoted patch trimmed; see the full patch in the previous mail. The text_poke() change under discussion follows. ]
> +       /*
> +        * During initial boot, it is hard to initialize patching_mm due to
> +        * dependencies in boot order.
> +        */
> +       if (patching_mm) {
> +               pte_t pte;
> +               temporary_mm_state_t prev;
> +
> +               prev = use_temporary_mm(patching_mm);
> +               pte = mk_pte(pages[0], PAGE_KERNEL);
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
> +               pte = mk_pte(pages[1], PAGE_KERNEL);
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
> +
> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
> +                      opcode, len);
> +
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
> +               local_flush_tlb();

Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
needed.  How about getting rid of all the weird TLB flushing stuff and
instead putting the mapping at vaddr - __START_KERNEL_map or whatever
it is?  You *might* need to flush_tlb_mm_range() on module unload, but
that's it.
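
A rough sketch of that fixed-offset scheme, for illustration only: the helper
name is made up, the alias PTEs for the kernel/module text range are assumed
to be pre-populated in patching_mm, and only use_temporary_mm(),
unuse_temporary_mm() and patching_mm come from the patch above.

/*
 * Sketch only: every kernel text address gets a fixed alias in patching_mm
 * at (vaddr - __START_KERNEL_map).  Because the alias for a given page never
 * changes, a stale TLB entry can only point at the page being written anyway,
 * so no per-poke flush is needed; only reuse of a text address (e.g. module
 * unload) would need flush_tlb_mm_range().  Caller holds text_mutex and has
 * IRQs disabled, as in text_poke().
 */
static void poke_via_fixed_alias(void *addr, const void *opcode, size_t len)
{
	unsigned long alias = (unsigned long)addr - __START_KERNEL_map;
	temporary_mm_state_t prev;

	prev = use_temporary_mm(patching_mm);
	memcpy((void *)alias, opcode, len);
	unuse_temporary_mm(prev);
}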

> +               sync_core();

I can't think of any case where sync_core() is needed.  The mm switch
serializes.

Also, is there any circumstance in which any of this is used before at
least jump table init?  All the early stuff is text_poke_early(),
right?
Nadav Amit Aug. 27, 2018, 8:16 p.m. UTC | #72
at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> 
>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>> What do you all think?
>>>>> 
>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>> 
>>>> What breaks?
>>> 
>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>> matter.
>> 
>> So here is what I got. It certainly needs some cleanup, but it boots.
>> 
>> Let me know how crappy you find it...
>> 
>> 
>> [ quoted patch trimmed; see the full patch earlier in the thread ]
> 
> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
> needed.  How about getting rid of all the weird TLB flushing stuff and
> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
> that's it.

I don’t see what’s wrong in SMP, since this entire piece of code should be
running under text_mutex.

I don’t quite understand your proposal. I really don’t want to have any
chance in which the page-tables for the poked address are not preallocated.

It is more complicated than needed, and there are redundant TLB flushes. The
reason I preferred to do it this way, is in order not to use other functions
that take locks during the software page-walk and not to duplicate existing
code. Yet, duplication might be the way to go.

>> +               sync_core();
> 
> I can't think of any case where sync_core() is needed.  The mm switch
> serializes.

Good point!

> 
> Also, is there any circumstance in which any of this is used before at
> least jump table init?  All the early stuff is text_poke_early(),
> right?

Not before jump_label_init. However, I did not manage to get rid of the two
code paths in text_poke(), since text_poke is used relatively early by
x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
to duplicate init_mm - still fails.
Nadav Amit Aug. 27, 2018, 9:55 p.m. UTC | #73
at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:

> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> 
>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> 
>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>> What do you all think?
>>>>>> 
>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>> 
>>>>> What breaks?
>>>> 
>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>> matter.
>>> 
>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>> 
>>> Let me know how crappy you find it...
>>> 
>>> 
>>> [ quoted patch trimmed; see the full patch earlier in the thread ]
>> 
>> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
>> needed.  How about getting rid of all the weird TLB flushing stuff and
>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>> that's it.
> 
> I don’t see what’s wrong in SMP, since this entire piece of code should be
> running under text_mutex.
> 
> I don’t quite understand your proposal. I really don’t want to have any
> chance in which the page-tables for the poked address are not preallocated.
> 
> It is more complicated than needed, and there are redundant TLB flushes. The
> reason I preferred to do it this way, is in order not to use other functions
> that take locks during the software page-walk and not to duplicate existing
> code. Yet, duplication might be the way to go.
> 
>>> +               sync_core();
>> 
>> I can't think of any case where sync_core() is needed.  The mm switch
>> serializes.
> 
> Good point!
> 
>> Also, is there any circumstance in which any of this is used before at
>> least jump table init?  All the early stuff is text_poke_early(),
>> right?
> 
> Not before jump_label_init. However, I did not manage to get rid of the two
> code paths in text_poke(), since text_poke is used relatively early by
> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
> to duplicate init_mm - still fails.

Another correction: the populate_extra_pte() is not needed.

Anyhow, if you want to do this whole thing differently, I obviously will not
object, but I think it will end up more complicated.

I think I finally understood your comment about "vaddr -
__START_KERNEL_map”. I did something like that before, and it is not
super-simple. You need not only to conditionally flush the TLB, but also
to synchronize the PUD/PMD on changes. Don’t forget that module memory
is installed even when BPF programs are installed.

Let me know if you want me to submit cleaner patches or you want to carry on
yourself.

Thanks,
Nadav
Benjamin Herrenschmidt Aug. 27, 2018, 10:13 p.m. UTC | #74
On Mon, 2018-08-27 at 19:02 +1000, Nicholas Piggin wrote:
> > More tlbies ? With the cost of the broadcasts on the fabric ? I don't
> > think so.. or I'm not understanding your point...
> 
> More tlbies are no good, but there will be some places where it works
> out much better (and fewer tlbies). Worst possible case for current code
> is a big unmap with lots of scattered page sizes. We _should_ get that
> with just a single PID flush at the end, but what we will get today is
> a bunch of PID and VA flushes.
> 
> I don't propose doing that though; I'd rather be explicit about
> tracking start and end range of each page size. Still not "optimal"
> but neither is existing single range for sparse mappings... anyway it
> will need to be profiled, but my point is we don't really fit exactly
> what x86/arm want.

If we have an arch specific part, we could just remember up to N
"large" pages there without actually flushing, and if that overflows,
upgrade to a full flush.

Cheers,
Ben.
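
A minimal sketch of the batching idea Ben describes above, under the
assumption of a made-up arch-private structure (none of these names exist in
the tree; only the "remember up to N large pages, else upgrade to a full
flush" policy comes from the mail above).

#define MAX_DEFERRED_LARGE	8

struct arch_tlb_batch {
	unsigned long	large_vaddr[MAX_DEFERRED_LARGE];
	unsigned int	nr_large;
	bool		flush_all;	/* overflowed: flush the whole PID */
};

static void batch_large_page(struct arch_tlb_batch *b, unsigned long addr)
{
	if (b->flush_all)
		return;			/* already committed to a full flush */
	if (b->nr_large == MAX_DEFERRED_LARGE) {
		b->flush_all = true;	/* too many entries: upgrade */
		return;
	}
	b->large_vaddr[b->nr_large++] = addr;
}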
Andy Lutomirski Aug. 27, 2018, 10:32 p.m. UTC | #75
On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>
>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>
>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>
>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>
>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> What do you all think?
>>>>>>>
>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>
>>>>>> What breaks?
>>>>>
>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>> matter.
>>>>
>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>
>>>> Let me know how crappy you find it...
>>>>
>>>>
>>>> [ quoted patch trimmed; see the full patch earlier in the thread ]
>>>
>>> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>> that's it.
>>
>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>> running under text_mutex.
>>
>> I don’t quite understand your proposal. I really don’t want to have any
>> chance in which the page-tables for the poked address are not preallocated.
>>
>> It is more complicated than needed, and there are redundant TLB flushes. The
>> reason I preferred to do it this way, is in order not to use other functions
>> that take locks during the software page-walk and not to duplicate existing
>> code. Yet, duplication might be the way to go.
>>
>>>> +               sync_core();
>>>
>>> I can't think of any case where sync_core() is needed.  The mm switch
>>> serializes.
>>
>> Good point!
>>
>>> Also, is there any circumstance in which any of this is used before at
>>> least jump table init?  All the early stuff is text_poke_early(),
>>> right?
>>
>> Not before jump_label_init. However, I did not manage to get rid of the two
>> code paths in text_poke(), since text_poke is used relatively early by
>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>> to duplicate init_mm - still fails.
>
> Another correction: the populate_extra_pte() is not needed.
>
> Anyhow, if you want to do this whole thing differently, I obviously will not
> object, but I think it will end up more complicated.
>
> I think I finally understood your comment about "vaddr -
> __START_KERNEL_map”. I did something like that before, and it is not
> super-simple. You need not only to conditionally flush the TLB, but also
> to synchronize the PUD/PMD on changes. Don’t forget that module memory
> is installed even when BPF programs are installed.
>
> Let me know if you want me to submit cleaner patches or you want to carry on
> yourself.
>

I think your approach is a good start and should be good enough (with
cleanups) as a fix for the bug.  But I think your code has the same
bug that we have now!  You're reusing the same address on multiple
CPUs without flushing.  You can easily fix it by forcing a flush
before loading the mm, which should be as simple as adding
flush_tlb_mm() before you load the mm.  (It won't actually flush
anything by itself, since the mm isn't loaded, but it will update the
bookkeeping so that switch_mm_irqs_off() flushes the mm.)
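
A minimal sketch of that fix, reusing the names from Nadav's patch quoted
earlier; the PTE setup and copy are elided, and the "bookkeeping only"
behaviour of flush_tlb_mm() here is Andy's description above, not verified
code.

	/* inside text_poke(), with IRQs disabled and text_mutex held */
	if (patching_mm) {
		temporary_mm_state_t prev;

		/*
		 * Does not flush anything immediately (patching_mm is not
		 * loaded on this CPU yet); it only updates the per-mm flush
		 * bookkeeping so that switch_mm_irqs_off() invalidates any
		 * stale poking-address entries when the mm is loaded below.
		 */
		flush_tlb_mm(patching_mm);

		prev = use_temporary_mm(patching_mm);
		/* ... install the two PTEs, memcpy() the opcode, clear the PTEs ... */
		unuse_temporary_mm(prev);
	}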

Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
do the vaddr - __START_KERNEL_map thing, then at least pick an address
in the low half of the address space such as 0 :)  Ideally you'd only
use this thing late enough that you could even use the normal
insert_pfn (or similar) API for it, but that doesn't really matter.
Nadav Amit Aug. 27, 2018, 10:54 p.m. UTC | #76
at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> 
>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> 
>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> 
>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>> What do you all think?
>>>>>>>> 
>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>> 
>>>>>>> What breaks?
>>>>>> 
>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>>> matter.
>>>>> 
>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>> 
>>>>> Let me know how crappy you find it...
>>>>> 
>>>>> 
>>>>> [ quoted patch trimmed; see the full patch earlier in the thread ]
>>>> 
>>>> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>>> that's it.
>>> 
>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>>> running under text_mutex.
>>> 
>>> I don’t quite understand your proposal. I really don’t want to have any
>>> chance in which the page-tables for the poked address are not preallocated.
>>> 
>>> It is more complicated than needed, and there are redundant TLB flushes. The
>>> reason I preferred to do it this way, is in order not to use other functions
>>> that take locks during the software page-walk and not to duplicate existing
>>> code. Yet, duplication might be the way to go.
>>> 
>>>>> +               sync_core();
>>>> 
>>>> I can't think of any case where sync_core() is needed.  The mm switch
>>>> serializes.
>>> 
>>> Good point!
>>> 
>>>> Also, is there any circumstance in which any of this is used before at
>>>> least jump table init?  All the early stuff is text_poke_early(),
>>>> right?
>>> 
>>> Not before jump_label_init. However, I did not manage to get rid of the two
>>> code paths in text_poke(), since text_poke is used relatively early by
>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>>> to duplicate init_mm - still fails.
>> 
>> Another correction: the populate_extra_pte() is not needed.
>> 
>> Anyhow, if you want to do this whole thing differently, I obviously will not
>> object, but I think it will end up more complicated.
>> 
>> I think I finally understood your comment about "vaddr -
>> __START_KERNEL_map”. I did something like that before, and it is not
>> super-simple. You need not only to conditionally flush the TLB, but also
>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
>> is installed even when BPF programs are installed.
>> 
>> Let me know if you want me to submit cleaner patches or you want to carry on
>> yourself.
> 
> I think your approach is a good start and should be good enough (with
> cleanups) as a fix for the bug.  But I think your code has the same
> bug that we have now!  You're reusing the same address on multiple
> CPUs without flushing.  You can easily fix it by forcing a flush
> before loading the mm, which should be as simple as adding
> flush_tlb_mm() before you load the mm.  (It won't actually flush
> anything by itself, since the mm isn't loaded, but it will update the
> bookkeeping so that switch_mm_irqs_off() flushes the mm.)

What am I missing?

We have a lock (text_mutex) which prevents the use of the new page-table
hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
a local TLB flush before the lock is released and before IRQs are enabled.
So how can the PTE be cached on multiple CPUs?

Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
redundant TLB flushes, but these flushes are already there.

> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
> do the vaddr - __START_KERNEL_map thing, then at least pick an address
> in the low half of the address space such as 0 :)  Ideally you'd only
> use this thing late enough that you could even use the normal
> insert_pfn (or similar) API for it, but that doesn't really matter.

Perhaps the vaddr - __START_KERNEL_map actually makes sense. I was
misunderstanding it (again) before, thinking you want me to use vaddr (and
not the delta). I’ll give it a try.

Thanks,
Nadav
Andy Lutomirski Aug. 27, 2018, 11:01 p.m. UTC | #77
On Mon, Aug 27, 2018 at 3:54 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
>
>> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>
>>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>
>>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>
>>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>>
>>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>> What do you all think?
>>>>>>>>>
>>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>>>
>>>>>>>> What breaks?
>>>>>>>
>>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>>>> matter.
>>>>>>
>>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>>>
>>>>>> Let me know how crappy you find it...
>>>>>>
>>>>>>
>>>>>> [ quoted patch trimmed; see the full patch earlier in the thread ]
>>>>>
>>>>> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
>>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>>>> that's it.
>>>>
>>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>>>> running under text_mutex.
>>>>
>>>> I don’t quite understand your proposal. I really don’t want to have any
>>>> chance in which the page-tables for the poked address are not preallocated.
>>>>
>>>> It is more complicated than needed, and there are redundant TLB flushes. The
>>>> reason I preferred to do it this way, is in order not to use other functions
>>>> that take locks during the software page-walk and not to duplicate existing
>>>> code. Yet, duplication might be the way to go.
>>>>
>>>>>> +               sync_core();
>>>>>
>>>>> I can't think of any case where sync_core() is needed.  The mm switch
>>>>> serializes.
>>>>
>>>> Good point!
>>>>
>>>>> Also, is there any circumstance in which any of this is used before at
>>>>> least jump table init?  All the early stuff is text_poke_early(),
>>>>> right?
>>>>
>>>> Not before jump_label_init. However, I did not manage to get rid of the two
>>>> code paths in text_poke(), since text_poke is used relatively early by
>>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>>>> to duplicate init_mm - still fails.
>>>
>>> Another correction: the populate_extra_pte() is not needed.
>>>
>>> Anyhow, if you want to do this whole thing differently, I obviously will not
>>> object, but I think it will end up more complicated.
>>>
>>> I think I finally understood your comment about "vaddr -
>>> __START_KERNEL_map”. I did something like that before, and it is not
>>> super-simple. You need not only to conditionally flush the TLB, but also
>>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
>>> is installed even when BPF programs are installed.
>>>
>>> Let me know if you want me to submit cleaner patches or you want to carry on
>>> yourself.
>>
>> I think your approach is a good start and should be good enough (with
>> cleanups) as a fix for the bug.  But I think your code has the same
>> bug that we have now!  You're reusing the same address on multiple
>> CPUs without flushing.  You can easily fix it by forcing a flush
>> before loading the mm, which should be as simple as adding
>> flush_tlb_mm() before you load the mm.  (It won't actually flush
>> anything by itself, since the mm isn't loaded, but it will update the
>> bookkeeping so that switch_mm_irqs_off() flushes the mm.)
>
> What am I missing?
>
> We have a lock (text_mutex) which prevents the use of the new page-table
> hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
> a local TLB flush before the lock is released and before IRQs are enabled.
> So how can the PTE be cached on multiple CPUs?
>
> Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
> redundant TLB flushes, but these flushes are already there.

I missed that set_pte_vaddr() contained a flush.  It's probably nicer
to use one of the APIs that doesn't imply a flush.  If you end up with
vm_insert_pfn() or similar, you can use copy_to_user() if
needed :)

>
>> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
>> do the vaddr - __START_KERNEL_map thing, then at least pick an address
>> in the low half of the address space such as 0 :)  Ideally you'd only
>> use this thing late enough that you could even use the normal
>> insert_pfn (or similar) API for it, but that doesn't really matter.
>
> Perhaps the vaddr - __START_KERNEL_map actually makes sense. I was
> misunderstanding it (again) before, thinking you want me to use vaddr (and
> not the delta). I’ll give it a try.

It does prevent easy preallocation of the intermediate tables.
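
With a fixed poking address, on the other hand, the intermediate levels can be
set up once at init time.  A minimal sketch, purely illustrative and not from
any posted patch (the function name is made up, error handling trimmed):

static void __init preallocate_poking_pagetables(struct mm_struct *mm,
						 unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Allocate every intermediate level once, at init time. */
	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return;

	/* Populate the PTE page; the PTEs themselves are written by text_poke(). */
	pte = pte_alloc_map(mm, pmd, addr);
	if (pte)
		pte_unmap(pte);
}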

And do we really allow text_poke() / text_poke_bp() on BPF programs?  Hmm.
Masami Hiramatsu (Google) Aug. 28, 2018, 8:49 a.m. UTC | #78
On Mon, 27 Aug 2018 16:01:32 -0700
Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 3:54 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> > at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
> >
> >> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>
> >>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
> >>>>
> >>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>>
> >>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
> >>>>>>>
> >>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>>>>>> What do you all think?
> >>>>>>>>>
> >>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
> >>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
> >>>>>>>>
> >>>>>>>> What breaks?
> >>>>>>>
> >>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
> >>>>>>> matter.
> >>>>>>
> >>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
> >>>>>>
> >>>>>> Let me know how crappy you find it...
> >>>>>>
> >>>>>>
> >>>>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> >>>>>> index bbc796eb0a3b..336779650a41 100644
> >>>>>> --- a/arch/x86/include/asm/mmu_context.h
> >>>>>> +++ b/arch/x86/include/asm/mmu_context.h
> >>>>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
> >>>>>>      return cr3;
> >>>>>> }
> >>>>>>
> >>>>>> +typedef struct {
> >>>>>> +       struct mm_struct *prev;
> >>>>>> +} temporary_mm_state_t;
> >>>>>> +
> >>>>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
> >>>>>> +{
> >>>>>> +       temporary_mm_state_t state;
> >>>>>> +
> >>>>>> +       lockdep_assert_irqs_disabled();
> >>>>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> >>>>>> +       switch_mm_irqs_off(NULL, mm, current);
> >>>>>> +       return state;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
> >>>>>> +{
> >>>>>> +       lockdep_assert_irqs_disabled();
> >>>>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
> >>>>>> +}
> >>>>>> +
> >>>>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
> >>>>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> >>>>>> index 5715647fc4fe..ef62af9a0ef7 100644
> >>>>>> --- a/arch/x86/include/asm/pgtable.h
> >>>>>> +++ b/arch/x86/include/asm/pgtable.h
> >>>>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
> >>>>>>      /* Default trampoline pgd value */
> >>>>>>      trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
> >>>>>> }
> >>>>>> +
> >>>>>> +void __init patching_mm_init(void);
> >>>>>> +#define patching_mm_init patching_mm_init
> >>>>>> +
> >>>>>> # ifdef CONFIG_RANDOMIZE_MEMORY
> >>>>>> void __meminit init_trampoline(void);
> >>>>>> # else
> >>>>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> >>>>>> index 054765ab2da2..9f44262abde0 100644
> >>>>>> --- a/arch/x86/include/asm/pgtable_64_types.h
> >>>>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
> >>>>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
> >>>>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
> >>>>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
> >>>>>>
> >>>>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
> >>>>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
> >>>>>> +
> >>>>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
> >>>>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
> >>>>>>
> >>>>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> >>>>>> index 99fff853c944..840c72ec8c4f 100644
> >>>>>> --- a/arch/x86/include/asm/pgtable_types.h
> >>>>>> +++ b/arch/x86/include/asm/pgtable_types.h
> >>>>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
> >>>>>> /* Install a pte for a particular vaddr in kernel space. */
> >>>>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
> >>>>>>
> >>>>>> +struct mm_struct;
> >>>>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
> >>>>>> +
> >>>>>> #ifdef CONFIG_X86_32
> >>>>>> extern void native_pagetable_init(void);
> >>>>>> #else
> >>>>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
> >>>>>> index 2ecd34e2d46c..cb364ea5b19d 100644
> >>>>>> --- a/arch/x86/include/asm/text-patching.h
> >>>>>> +++ b/arch/x86/include/asm/text-patching.h
> >>>>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
> >>>>>> extern int poke_int3_handler(struct pt_regs *regs);
> >>>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
> >>>>>>
> >>>>>> +extern struct mm_struct *patching_mm;
> >>>>>> +
> >>>>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
> >>>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> >>>>>> index a481763a3776..fd8a950b0d62 100644
> >>>>>> --- a/arch/x86/kernel/alternative.c
> >>>>>> +++ b/arch/x86/kernel/alternative.c
> >>>>>> @@ -11,6 +11,7 @@
> >>>>>> #include <linux/stop_machine.h>
> >>>>>> #include <linux/slab.h>
> >>>>>> #include <linux/kdebug.h>
> >>>>>> +#include <linux/mmu_context.h>
> >>>>>> #include <asm/text-patching.h>
> >>>>>> #include <asm/alternative.h>
> >>>>>> #include <asm/sections.h>
> >>>>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
> >>>>>>              WARN_ON(!PageReserved(pages[0]));
> >>>>>>              pages[1] = virt_to_page(addr + PAGE_SIZE);
> >>>>>>      }
> >>>>>> -       BUG_ON(!pages[0]);
> >>>>>> +
> >>>>>>      local_irq_save(flags);
> >>>>>> +       BUG_ON(!pages[0]);
> >>>>>> +
> >>>>>> +       /*
> >>>>>> +        * During initial boot, it is hard to initialize patching_mm due to
> >>>>>> +        * dependencies in boot order.
> >>>>>> +        */
> >>>>>> +       if (patching_mm) {
> >>>>>> +               pte_t pte;
> >>>>>> +               temporary_mm_state_t prev;
> >>>>>> +
> >>>>>> +               prev = use_temporary_mm(patching_mm);
> >>>>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
> >>>>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
> >>>>>> +
> >>>>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
> >>>>>> +                      opcode, len);
> >>>>>> +
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
> >>>>>> +               local_flush_tlb();
> >>>>>
> >>>>> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
> >>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
> >>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
> >>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
> >>>>> that's it.
> >>>>
> >>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
> >>>> running under text_mutex.
> >>>>
> >>>> I don’t quite understand your proposal. I really don’t want to have any
> >>>>>> chance in which the page-tables for the poked address are not preallocated.
> >>>>
> >>>> It is more complicated than needed, and there are redundant TLB flushes. The
> >>>>>> reason I preferred to do it this way is in order not to use other functions
> >>>> that take locks during the software page-walk and not to duplicate existing
> >>>> code. Yet, duplication might be the way to go.
> >>>>
> >>>>>> +               sync_core();
> >>>>>
> >>>>> I can't think of any case where sync_core() is needed.  The mm switch
> >>>>> serializes.
> >>>>
> >>>> Good point!
> >>>>
> >>>>> Also, is there any circumstance in which any of this is used before at
> >>>>> least jump table init?  All the early stuff is text_poke_early(),
> >>>>> right?
> >>>>
> >>>> Not before jump_label_init. However, I did not manage to get rid of the two
> >>>> code-patches in text_poke(), since text_poke is used relatively early by
> >>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
> >>>> to duplicate init_mm - still fails.
> >>>
> >>> Another correction: the populate_extra_pte() is not needed.
> >>>
> >>> Anyhow, if you want to do this whole thing differently, I obviously will not
> >>> object, but I think it will end up more complicated.
> >>>
> >>> I think I finally understood your comment about "vaddr -
> >>> __START_KERNEL_map”. I did something like that before, and it is not
> >>> super-simple. You need not only to conditionally flush the TLB, but also
> >>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
> >>> is installed even when BPF programs are installed.
> >>>
> >>> Let me know if you want me to submit cleaner patches or you want to carry on
> >>> yourself.
> >>
> >> I think your approach is a good start and should be good enough (with
> >> cleanups) as a fix for the bug.  But I think your code has the same
> >> bug that we have now!  You're reusing the same address on multiple
> >> CPUs without flushing.  You can easily fix it by forcing a flush
> >> before loading the mm, which should be as simple as adding
> >> flush_tlb_mm() before you load the mm.  (It won't actually flush
> >> anything by itself, since the mm isn't loaded, but it will update the
> >> bookkeeping so that switch_mm_irqs_off() flushes the mm.)
> >
> > What am I missing?
> >
> > We have a lock (text_mutex) which prevents the use of the new page-table
> > hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
> > a local TLB flush before the lock is released and before IRQs are enabled.
> > So how can the PTE be cached on multiple CPUs?
> >
> > Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
> > redundant TLB flushes, but these flushes are already there.
> 
> I missed that set_pte_vaddr() contained a flush.  It's probably nicer
> to use one of the APIs that doesn't imply a flush.  If you end up with
>> vm_insert_pfn() or similar, you can use copy_to_user() if
> needed :)
> 
> >
> >> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
> >> do the vaddr - __START_KERNEL_map thing, then at least pick an address
> >> in the low half of the address space such as 0 :)  Ideally you'd only
> >> use this thing late enough that you could even use the normal
> >> insert_pfn (or similar) API for it, but that doesn't really matter.
> >
> > Perhaps the vaddr - __START_KERNEL_map actually makes sense. I was
> > misunderstanding it (again) before, thinking you want me to use vaddr (and
> > not the delta). I’ll give it a try.
> 
> It does prevent easy preallocation of the intermediate tables.

Would we have to change the mapping address? If so, how about using a hash
of vaddr, which allows us to map it in one pte?

> And do we really allow text_poke() / text_poke_bp() on BPF programs?  Hmm.

I don't think so. Allowing text_poke from a BPF program is a security hazard.
(or did you mean the BPF JIT?)

Thanks,
Nadav Amit Aug. 28, 2018, 5:33 p.m. UTC | #79
at 1:49 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Mon, 27 Aug 2018 16:01:32 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
> 
>> On Mon, Aug 27, 2018 at 3:54 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>>> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> 
>>>>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> 
>>>>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> 
>>>>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>>>> 
>>>>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>>>> What do you all think?
>>>>>>>>>>> 
>>>>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>>>>> 
>>>>>>>>>> What breaks?
>>>>>>>>> 
>>>>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>>>>>> matter.
>>>>>>>> 
>>>>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>>>>> 
>>>>>>>> Let me know how crappy you find it...
>>>>>>>> 
>>>>>>>> 
>>>>>>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>>>>>>>> index bbc796eb0a3b..336779650a41 100644
>>>>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>>>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>>>>>>>>     return cr3;
>>>>>>>> }
>>>>>>>> 
>>>>>>>> +typedef struct {
>>>>>>>> +       struct mm_struct *prev;
>>>>>>>> +} temporary_mm_state_t;
>>>>>>>> +
>>>>>>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
>>>>>>>> +{
>>>>>>>> +       temporary_mm_state_t state;
>>>>>>>> +
>>>>>>>> +       lockdep_assert_irqs_disabled();
>>>>>>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>>>>>>>> +       switch_mm_irqs_off(NULL, mm, current);
>>>>>>>> +       return state;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
>>>>>>>> +{
>>>>>>>> +       lockdep_assert_irqs_disabled();
>>>>>>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
>>>>>>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>>>>>>>> index 5715647fc4fe..ef62af9a0ef7 100644
>>>>>>>> --- a/arch/x86/include/asm/pgtable.h
>>>>>>>> +++ b/arch/x86/include/asm/pgtable.h
>>>>>>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>>>>>>>>     /* Default trampoline pgd value */
>>>>>>>>     trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>>>>>>>> }
>>>>>>>> +
>>>>>>>> +void __init patching_mm_init(void);
>>>>>>>> +#define patching_mm_init patching_mm_init
>>>>>>>> +
>>>>>>>> # ifdef CONFIG_RANDOMIZE_MEMORY
>>>>>>>> void __meminit init_trampoline(void);
>>>>>>>> # else
>>>>>>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>>>>>>>> index 054765ab2da2..9f44262abde0 100644
>>>>>>>> --- a/arch/x86/include/asm/pgtable_64_types.h
>>>>>>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>>>>>>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>>>>>>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>>>>>>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>>>>>>>> 
>>>>>>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
>>>>>>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
>>>>>>>> +
>>>>>>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>>>>>>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>>>>>>>> 
>>>>>>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>>>>>>> index 99fff853c944..840c72ec8c4f 100644
>>>>>>>> --- a/arch/x86/include/asm/pgtable_types.h
>>>>>>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>>>>>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>>>>>>> /* Install a pte for a particular vaddr in kernel space. */
>>>>>>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>>>>>>>> 
>>>>>>>> +struct mm_struct;
>>>>>>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
>>>>>>>> +
>>>>>>>> #ifdef CONFIG_X86_32
>>>>>>>> extern void native_pagetable_init(void);
>>>>>>>> #else
>>>>>>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
>>>>>>>> index 2ecd34e2d46c..cb364ea5b19d 100644
>>>>>>>> --- a/arch/x86/include/asm/text-patching.h
>>>>>>>> +++ b/arch/x86/include/asm/text-patching.h
>>>>>>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>>>>>>>> extern int poke_int3_handler(struct pt_regs *regs);
>>>>>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>>>>>>>> 
>>>>>>>> +extern struct mm_struct *patching_mm;
>>>>>>>> +
>>>>>>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
>>>>>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>>>>>>>> index a481763a3776..fd8a950b0d62 100644
>>>>>>>> --- a/arch/x86/kernel/alternative.c
>>>>>>>> +++ b/arch/x86/kernel/alternative.c
>>>>>>>> @@ -11,6 +11,7 @@
>>>>>>>> #include <linux/stop_machine.h>
>>>>>>>> #include <linux/slab.h>
>>>>>>>> #include <linux/kdebug.h>
>>>>>>>> +#include <linux/mmu_context.h>
>>>>>>>> #include <asm/text-patching.h>
>>>>>>>> #include <asm/alternative.h>
>>>>>>>> #include <asm/sections.h>
>>>>>>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>>>>>>>>             WARN_ON(!PageReserved(pages[0]));
>>>>>>>>             pages[1] = virt_to_page(addr + PAGE_SIZE);
>>>>>>>>     }
>>>>>>>> -       BUG_ON(!pages[0]);
>>>>>>>> +
>>>>>>>>     local_irq_save(flags);
>>>>>>>> +       BUG_ON(!pages[0]);
>>>>>>>> +
>>>>>>>> +       /*
>>>>>>>> +        * During initial boot, it is hard to initialize patching_mm due to
>>>>>>>> +        * dependencies in boot order.
>>>>>>>> +        */
>>>>>>>> +       if (patching_mm) {
>>>>>>>> +               pte_t pte;
>>>>>>>> +               temporary_mm_state_t prev;
>>>>>>>> +
>>>>>>>> +               prev = use_temporary_mm(patching_mm);
>>>>>>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
>>>>>>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
>>>>>>>> +
>>>>>>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
>>>>>>>> +                      opcode, len);
>>>>>>>> +
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
>>>>>>>> +               local_flush_tlb();
>>>>>>> 
>>>>>>> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
>>>>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>>>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>>>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>>>>>> that's it.
>>>>>> 
>>>>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>>>>>> running under text_mutex.
>>>>>> 
>>>>>> I don’t quite understand your proposal. I really don’t want to have any
>>>>>> chance in which the page-tables for the poked address are not preallocated.
>>>>>> 
>>>>>> It is more complicated than needed, and there are redundant TLB flushes. The
>>>>>> reason I preferred to do it this way is in order not to use other functions
>>>>>> that take locks during the software page-walk and not to duplicate existing
>>>>>> code. Yet, duplication might be the way to go.
>>>>>> 
>>>>>>>> +               sync_core();
>>>>>>> 
>>>>>>> I can't think of any case where sync_core() is needed.  The mm switch
>>>>>>> serializes.
>>>>>> 
>>>>>> Good point!
>>>>>> 
>>>>>>> Also, is there any circumstance in which any of this is used before at
>>>>>>> least jump table init?  All the early stuff is text_poke_early(),
>>>>>>> right?
>>>>>> 
>>>>>> Not before jump_label_init. However, I did not manage to get rid of the two
>>>>>> code-patches in text_poke(), since text_poke is used relatively early by
>>>>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>>>>>> to duplicate init_mm - still fails.
>>>>> 
>>>>> Another correction: the populate_extra_pte() is not needed.
>>>>> 
>>>>> Anyhow, if you want to do this whole thing differently, I obviously will not
>>>>> object, but I think it will end up more complicated.
>>>>> 
>>>>> I think I finally understood your comment about "vaddr -
>>>>> __START_KERNEL_map”. I did something like that before, and it is not
>>>>> super-simple. You need not only to conditionally flush the TLB, but also
>>>>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
>>>>> is installed even when BPF programs are installed.
>>>>> 
>>>>> Let me know if you want me to submit cleaner patches or you want to carry on
>>>>> yourself.
>>>> 
>>>> I think your approach is a good start and should be good enough (with
>>>> cleanups) as a fix for the bug.  But I think your code has the same
>>>> bug that we have now!  You're reusing the same address on multiple
>>>> CPUs without flushing.  You can easily fix it by forcing a flush
>>>> before loading the mm, which should be as simple as adding
>>>> flush_tlb_mm() before you load the mm.  (It won't actually flush
>>>> anything by itself, since the mm isn't loaded, but it will update the
>>>> bookkeeping so that switch_mm_irqs_off() flushes the mm.)
>>> 
>>> What am I missing?
>>> 
>>> We have a lock (text_mutex) which prevents the use of the new page-table
>>> hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
>>> a local TLB flush before the lock is released and before IRQs are enabled.
>>> So how can the PTE be cached on multiple CPUs?
>>> 
>>> Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
>>> redundant TLB flushes, but these flushes are already there.
>> 
>> I missed that set_pte_vaddr() contained a flush.  It's probably nicer
>> to use one of the APIs that doesn't imply a flush.  If you end up with
>> vm_insert_pfn() or similar, you can use copy_to_user() if
>> needed :)
>> 
>>>> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
>>>> do the vaddr - __START_KERNEL_map thing, then at least pick an address
>>>> in the low half of the address space such as 0 :)  Ideally you'd only
>>>> use this thing late enough that you could even use the normal
>>>> insert_pfn (or similar) API for it, but that doesn't really matter.
>>> 
>>> Perhaps the vaddr - __START_KERNEL_map actually makes sense. I was
>>> misunderstanding it (again) before, thinking you want me to use vaddr (and
>>> not the delta). I’ll give it a try.
>> 
>> It does prevent easy preallocation of the intermediate tables.
> 
> Would we have to change the mapping address? If so, how about using a hash
> of vaddr, which allows us to map it in one pte?

We do need two PTEs, and I will reuse the same two PTEs in order not to
waste memory on page-tables. I will try to randomize their address to be on
the safe side.
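
A rough sketch of what that randomization could look like (illustrative only;
the helper name and the exact range are assumptions, not the final patch):

static unsigned long __init pick_poking_addr(void)
{
	/*
	 * Page-aligned slot in the lower half of the address space, leaving
	 * room for the two consecutive PTEs used while patching.
	 */
	unsigned long slots = (TASK_SIZE_MAX / PAGE_SIZE) - 2;

	return (get_random_long() % slots) * PAGE_SIZE;
}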
Vineet Gupta Aug. 30, 2018, 12:13 a.m. UTC | #80
On 08/27/2018 04:00 AM, Peter Zijlstra wrote:
>
> The one obvious thing SH and ARM want is a sensible default for
> tlb_start_vma(). (also: https://lkml.org/lkml/2004/1/15/6 )
>
> The below makes tlb_start_vma() default to flush_cache_range(), which
> should be right and sufficient. The only exceptions that I found were
> (oddly):
>
>   - m68k-mmu
>   - sparc64
>   - unicore
>
> Those architectures appear to have a non-NOP flush_cache_range(), but
> their current tlb_start_vma() does not call it.

So indeed we follow DaveM's insight from 2004 about tlb_{start,end}_vma() and
those are No-ops for ARC for the general case. For the historic VIPT aliasing
dcache they are what they should be per the 2004 link above - I presume that is all
hunky dory with you?

> Furthermore, I think tlb_flush() is broken on arc and parisc; in
> particular they don't appear to have any TLB invalidate for the
> shift_arg_pages() case, where we do not call tlb_*_vma() and fullmm=0.

Care to explain this issue a bit more ?
And that is independent of the current discussion.

> Possibly shift_arg_pages() should be fixed instead.
>
> Some archs (nds32,sparc32) avoid this by having an unconditional
> flush_tlb_mm() in tlb_flush(), which seems somewhat suboptimal if you
> have flush_tlb_range().  TLB_FLUSH_VMA() might be an option, however
> hideous it is.
>
> ---
>
> diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h
> index a9db5f62aaf3..7af2b373ebe7 100644
> --- a/arch/arc/include/asm/tlb.h
> +++ b/arch/arc/include/asm/tlb.h
> @@ -23,15 +23,6 @@ do {						\
>   *
>   * Note, read http://lkml.org/lkml/2004/1/15/6
>   */
> -#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING
> -#define tlb_start_vma(tlb, vma)
> -#else
> -#define tlb_start_vma(tlb, vma)						\
> -do {									\
> -	if (!tlb->fullmm)						\
> -		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> -} while(0)
> -#endif
>  
>  #define tlb_end_vma(tlb, vma)						\
>  do {									\

[snip..]

> 				      \
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index e811ef7b8350..1d037fd5bb7a 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -181,19 +181,21 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
>   * the vmas are adjusted to only cover the region to be torn down.
>   */
>  #ifndef tlb_start_vma
> -#define tlb_start_vma(tlb, vma) do { } while (0)
> +#define tlb_start_vma(tlb, vma)						\
> +do {									\
> +	if (!tlb->fullmm)						\
> +		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> +} while (0)
>  #endif

So for non-aliasing arches to not be affected, this relies on flush_cache_range()
being a no-op?

> -#define __tlb_end_vma(tlb, vma)					\
> -	do {							\
> -		if (!tlb->fullmm && tlb->end) {			\
> -			tlb_flush(tlb);				\
> -			__tlb_reset_range(tlb);			\
> -		}						\
> -	} while (0)
> -
>  #ifndef tlb_end_vma
> -#define tlb_end_vma	__tlb_end_vma
> +#define tlb_end_vma(tlb, vma)						\
> +	do {								\
> +		if (!tlb->fullmm && tlb->end) {				\
> +			tlb_flush(tlb);					\
> +			__tlb_reset_range(tlb);				\
> +		}							\
> +	} while (0)
>  #endif
>  
>  #ifndef __tlb_remove_tlb_entry

And this one is for shift_arg_pages(), but it will also cause extraneous flushes for
other cases - which is not happening currently!

-Vineet
Peter Zijlstra Aug. 30, 2018, 10:23 a.m. UTC | #81
On Thu, Aug 30, 2018 at 12:13:50AM +0000, Vineet Gupta wrote:
> On 08/27/2018 04:00 AM, Peter Zijlstra wrote:
> >
> > The one obvious thing SH and ARM want is a sensible default for
> > tlb_start_vma(). (also: https://lkml.org/lkml/2004/1/15/6 )
> >
> > The below makes tlb_start_vma() default to flush_cache_range(), which
> > should be right and sufficient. The only exceptions that I found were
> > (oddly):
> >
> >   - m68k-mmu
> >   - sparc64
> >   - unicore
> >
> > Those architectures appear to have a non-NOP flush_cache_range(), but
> > their current tlb_start_vma() does not call it.
> 
> So indeed we follow DaveM's insight from 2004 about tlb_{start,end}_vma() and
> those are No-ops for ARC for the general case. For the historic VIPT aliasing
> dcache they are what they should be per the 2004 link above - I presume that is all
> hunky dory with you?

Yep, I was just confused about those 3 architectures having
flush_cache_range() but not calling it from tlb_start_vma(). The regular
VIPT aliasing thing is all good. And not having them is also fine.

> > Furthermore, I think tlb_flush() is broken on arc and parisc; in
> > particular they don't appear to have any TLB invalidate for the
> > shift_arg_pages() case, where we do not call tlb_*_vma() and fullmm=0.
> 
> Care to explain this issue a bit more ?
> And that is independent of the current discussion.
> 
> > Possibly shift_arg_pages() should be fixed instead.

So the problem is that shift_arg_pages() does not call tlb_start_vma() /
tlb_end_vma(). It also has fullmm=0. Therefore, on ARC, there will not
be a TLB invalidate at all when freeing the page-tables.

And while looking at this more, move_page_tables() also looks dodgy; I
think it always needs a TLB flush of the entire 'old' range.
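
For illustration, a tlb_flush() that still covers such no-vma cases might look
roughly like the below (not taken from any tree; the on-stack vma is only there
because flush_tlb_range() wants one):

static inline void tlb_flush(struct mmu_gather *tlb)
{
	if (tlb->fullmm) {
		flush_tlb_mm(tlb->mm);
	} else if (tlb->end) {
		/*
		 * No tlb_start_vma()/tlb_end_vma() ran (as in
		 * shift_arg_pages()), so flush the gathered range directly.
		 */
		struct vm_area_struct vma = { .vm_mm = tlb->mm };

		flush_tlb_range(&vma, tlb->start, tlb->end);
	}
}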

> > diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h
> > index a9db5f62aaf3..7af2b373ebe7 100644
> > --- a/arch/arc/include/asm/tlb.h
> > +++ b/arch/arc/include/asm/tlb.h
> > @@ -23,15 +23,6 @@ do {						\
> >   *
> >   * Note, read http://lkml.org/lkml/2004/1/15/6
> >   */
> > -#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING
> > -#define tlb_start_vma(tlb, vma)
> > -#else
> > -#define tlb_start_vma(tlb, vma)						\
> > -do {									\
> > -	if (!tlb->fullmm)						\
> > -		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> > -} while(0)
> > -#endif
> >  
> >  #define tlb_end_vma(tlb, vma)						\
> >  do {									\
> 
> [snip..]
> 
> > 				      \
> > diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> > index e811ef7b8350..1d037fd5bb7a 100644
> > --- a/include/asm-generic/tlb.h
> > +++ b/include/asm-generic/tlb.h
> > @@ -181,19 +181,21 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
> >   * the vmas are adjusted to only cover the region to be torn down.
> >   */
> >  #ifndef tlb_start_vma
> > -#define tlb_start_vma(tlb, vma) do { } while (0)
> > +#define tlb_start_vma(tlb, vma)						\
> > +do {									\
> > +	if (!tlb->fullmm)						\
> > +		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> > +} while (0)
> >  #endif
> 
> So for non-aliasing arches to not be affected, this relies on flush_cache_range()
> being a no-op?

Yes; a cursory inspection shows this to be so, with the 'obvious' exception
of the above 3 architectures.

> > -#define __tlb_end_vma(tlb, vma)					\
> > -	do {							\
> > -		if (!tlb->fullmm && tlb->end) {			\
> > -			tlb_flush(tlb);				\
> > -			__tlb_reset_range(tlb);			\
> > -		}						\
> > -	} while (0)
> > -
> >  #ifndef tlb_end_vma
> > -#define tlb_end_vma	__tlb_end_vma
> > +#define tlb_end_vma(tlb, vma)						\
> > +	do {								\
> > +		if (!tlb->fullmm && tlb->end) {				\
> > +			tlb_flush(tlb);					\
> > +			__tlb_reset_range(tlb);				\
> > +		}							\
> > +	} while (0)
> >  #endif
> >  
> >  #ifndef __tlb_remove_tlb_entry
> 
> And this one is for shift_arg_pages(), but it will also cause extraneous flushes for
> other cases - which is not happening currently!

No, shift_arg_pages() does not in fact call tlb_end_vma().

The reason for the flush in tlb_end_vma() is (as far as we can remember)
to 'better' deal with large gaps between adjacent VMAs.  Without the
flush here, the range would be extended to cover the (potential) dead
space between the VMAs, resulting in more expensive range flushes.
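
The range bookkeeping behind that is roughly the following (paraphrasing
__tlb_adjust_range() from include/asm-generic/tlb.h); each unmap only ever
widens the pending range:

static inline void __tlb_adjust_range(struct mmu_gather *tlb,
				      unsigned long address,
				      unsigned int range_size)
{
	tlb->start = min(tlb->start, address);
	tlb->end   = max(tlb->end, address + range_size);
}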
diff mbox series

Patch

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -362,6 +362,9 @@  config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
 	bool
 
+config HAVE_RCU_TABLE_INVALIDATE
+	bool
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -180,6 +180,7 @@  config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE
+	select HAVE_RCU_TABLE_INVALIDATE	if HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
 	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,17 +238,22 @@  void arch_tlb_gather_mmu(struct mmu_gath
 	__tlb_reset_range(tlb);
 }
 
-static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+static void __tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
 	if (!tlb->end)
 		return;
 
 	tlb_flush(tlb);
 	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
+	__tlb_reset_range(tlb);
+}
+
+static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+{
+	__tlb_flush_mmu_tlbonly(tlb);
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb_table_flush(tlb);
 #endif
-	__tlb_reset_range(tlb);
 }
 
 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
@@ -330,6 +335,21 @@  bool __tlb_remove_page_size(struct mmu_g
  * See the comment near struct mmu_table_batch.
  */
 
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+	/*
+	 * Invalidate page-table caches used by hardware walkers. Then we still
+	 * need to RCU-sched wait while freeing the pages because software
+	 * walkers can still be in-flight.
+	 */
+	__tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
 static void tlb_remove_table_smp_sync(void *arg)
 {
 	/* Simply deliver the interrupt */
@@ -366,6 +386,7 @@  void tlb_table_flush(struct mmu_gather *
 	struct mmu_table_batch **batch = &tlb->batch;
 
 	if (*batch) {
+		tlb_table_invalidate(tlb);
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
 	}
@@ -378,11 +399,13 @@  void tlb_remove_table(struct mmu_gather
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
+			tlb_table_invalidate(tlb);
 			tlb_remove_table_one(table);
 			return;
 		}
 		(*batch)->nr = 0;
 	}
+
 	(*batch)->tables[(*batch)->nr++] = table;
 	if ((*batch)->nr == MAX_TABLE_BATCH)
 		tlb_table_flush(tlb);