diff mbox series

[V4,3/8] x86: Support kmap_local() forced debugging

Message ID 20201118204007.169209557@linutronix.de (mailing list archive)
State New, archived
Headers show
Series mm/highmem: Preemptible variant of kmap_atomic & friends | expand

Commit Message

Thomas Gleixner Nov. 18, 2020, 7:48 p.m. UTC
kmap_local() and related interfaces are NOOPs on 64bit and only create
temporary fixmaps for highmem pages on 32bit. That means the test coverage
for this code is pretty small.

CONFIG_KMAP_LOCAL can be enabled independently of CONFIG_HIGHMEM, which
allows providing support for enforced kmap_local() debugging even on
64bit.

For 32bit the support is unconditional, for 64bit it's only supported when
CONFIG_NR_CPUS <= 4096 as supporting it for 8192 CPUs would require to set
up yet another fixmap PGT.

If CONFIG_KMAP_LOCAL_FORCE_DEBUG is enabled then kmap_local()/kmap_atomic()
will use the temporary fixmap mapping path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V4: New patch
---
 arch/x86/Kconfig                        |    1 +
 arch/x86/include/asm/fixmap.h           |   12 +++++++++---
 arch/x86/include/asm/pgtable_64_types.h |    6 +++++-
 3 files changed, 15 insertions(+), 4 deletions(-)

Comments

Steven Rostedt Jan. 6, 2021, 11:01 p.m. UTC | #1
On Wed, 18 Nov 2020 20:48:41 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

> kmap_local() and related interfaces are NOOPs on 64bit and only create
> temporary fixmaps for highmem pages on 32bit. That means the test coverage
> for this code is pretty small.
> 
> CONFIG_KMAP_LOCAL can be enabled independently of CONFIG_HIGHMEM, which
> allows providing support for enforced kmap_local() debugging even on
> 64bit.
> 
> For 32bit the support is unconditional, for 64bit it's only supported when
> CONFIG_NR_CPUS <= 4096 as supporting it for 8192 CPUs would require to set
> up yet another fixmap PGT.
> 
> If CONFIG_KMAP_LOCAL_FORCE_DEBUG is enabled then kmap_local()/kmap_atomic()
> will use the temporary fixmap mapping path.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
> V4: New patch
> ---
>  arch/x86/Kconfig                        |    1 +
>  arch/x86/include/asm/fixmap.h           |   12 +++++++++---
>  arch/x86/include/asm/pgtable_64_types.h |    6 +++++-
>  3 files changed, 15 insertions(+), 4 deletions(-)
> 
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -93,6 +93,7 @@ config X86
>  	select ARCH_SUPPORTS_ACPI
>  	select ARCH_SUPPORTS_ATOMIC_RMW
>  	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64

I triggered the following crash on x86_32 by simply doing a:

(ssh'ing into the box)

  # head -100 /tmp/output-file

Where the /tmp/output-file was the output of a trace-cmd report.
Even after rebooting and not running the tracing code, simply doing the
head command still crashed.

BUG: unable to handle page fault for address: fff58000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
*pdpt = 0000000006de9001 *pde = 0000000001968063 *pte = 0000000000000000 
Oops: 0000 [#1] SMP PTI
CPU: 3 PID: 3935 Comm: sshd Not tainted 5.11.0-rc2-test+ #2
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
EIP: skb_copy_bits+0x10c/0x1b9
Code: 3b 5d e8 0f 47 5d e8 c7 45 e0 00 00 00 00 8b 7d e0 39 7d e8 76 3a 8b 45 d4 e8 a4 e4 ff ff 8b 55 e4 03 55 e0 89 d9 01 c6 89 d7 <f3> a4 e8 c9 e4 ff ff 01 5d e0 8b 5d e8 b8 00 10 00 00 2b 5d e0 83
EAX: fff57000 EBX: 000005a8 ECX: 000000f8 EDX: c77b9900
ESI: fff58000 EDI: c77b9db0 EBP: c6de39ec ESP: c6de39c0
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210286
CR0: 80050033 CR2: fff58000 CR3: 06de6000 CR4: 001506f0
Call Trace:
 skb_segment+0x4a3/0x828
 ? __tcp_mtu_to_mss+0x2d/0x6b
 tcp_gso_segment+0xf6/0x336
 ? list_add+0x26/0x26
 tcp4_gso_segment+0x77/0x7c
 ? tcp_gso_segment+0x336/0x336
 inet_gso_segment+0x1a1/0x2df
 ? inet_unregister_protosw+0x5e/0x5e
 skb_mac_gso_segment+0xb9/0x107
 __skb_gso_segment+0xdf/0x10f
 ? netif_skb_features+0x1ca/0x24a
 ? __qdisc_run+0x1e4/0x418
 validate_xmit_skb.constprop.0+0x10f/0x1ad
 validate_xmit_skb_list+0x25/0x45
 sch_direct_xmit+0x5c/0x19d
 __qdisc_run+0x3e3/0x418
 ? qdisc_run_begin+0x53/0x5d
 qdisc_run+0x26/0x30
 __dev_queue_xmit+0x2bd/0x524
 ? mark_held_locks+0x40/0x51
 dev_queue_xmit+0xf/0x11
 ip_finish_output2+0x378/0x3d7
 __ip_finish_output+0xd6/0xe2
 ip_output+0x8c/0xbb
 ? ip_mc_output+0x18d/0x18d
 dst_output+0x27/0x2d
 ip_local_out+0x2b/0x30
 __ip_queue_xmit+0x32e/0x38e
 ? __copy_skb_header+0x4b/0x98
 ? __ip_queue_xmit+0x38e/0x38e
 ip_queue_xmit+0x16/0x1b
 __tcp_transmit_skb+0x731/0x794
 tcp_transmit_skb+0x16/0x18
 tcp_write_xmit+0x7b4/0xa90
 __tcp_push_pending_frames+0x2c/0x6b
 tcp_push+0x8c/0xf1
 tcp_sendmsg_locked+0x74a/0x7f2
 ? tcp_sendmsg_locked+0x7f2/0x7f2
 tcp_sendmsg+0x27/0x38
 ? tcp_sendmsg_locked+0x7f2/0x7f2
 inet_sendmsg+0x3c/0x5f
 ? inet_send_prepare+0x3b/0x3b
 sock_sendmsg_nosec+0x1a/0x2d
 sock_sendmsg+0x25/0x29
 sock_write_iter+0x84/0xa7
 vfs_write+0xf5/0x19b
 ksys_write+0x68/0xaa
 __ia32_sys_write+0x15/0x17
 __do_fast_syscall_32+0x66/0x76
 do_fast_syscall_32+0x29/0x5b
 do_SYSENTER_32+0x15/0x17
 entry_SYSENTER_32+0x9f/0xf2
EIP: 0xb7ee3545
Code: c4 01 10 03 03 74 c0 01 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76
EAX: ffffffda EBX: 00000003 ECX: 01d12448 EDX: 00002028
ESI: 00002028 EDI: 01d12448 EBP: bff4e388 ESP: bff4e328
DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00200246
 ? asm_exc_nmi+0xc5/0x2ab
Modules linked in: ppdev parport_pc parport
CR2: 00000000fff58000
---[ end trace 3d4582614c9c2a0e ]---
EIP: skb_copy_bits+0x10c/0x1b9
Code: 3b 5d e8 0f 47 5d e8 c7 45 e0 00 00 00 00 8b 7d e0 39 7d e8 76 3a 8b 45 d4 e8 a4 e4 ff ff 8b 55 e4 03 55 e0 89 d9 01 c6 89 d7 <f3> a4 e8 c9 e4 ff ff 01 5d e0 8b 5d e8 b8 00 10 00 00 2b 5d e0 83
EAX: fff57000 EBX: 000005a8 ECX: 000000f8 EDX: c77b9900
ESI: fff58000 EDI: c77b9db0 EBP: c6de39ec ESP: c6de39c0
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210286
CR0: 80050033 CR2: fff58000 CR3: 06de6000 CR4: 001506f0
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: disabled
---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

This was against 5.11-rc2.

I bisected it down to the commit that added this patch.

> +	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096

If I remove the above line, it works fine.

Attached is the config file.

-- Steve

>  	select ARCH_USE_BUILTIN_BSWAP
>  	select ARCH_USE_QUEUED_RWLOCKS
>  	select ARCH_USE_QUEUED_SPINLOCKS
> --- a/arch/x86/include/asm/fixmap.h
> +++ b/arch/x86/include/asm/fixmap.h
> @@ -14,13 +14,20 @@
>  #ifndef _ASM_X86_FIXMAP_H
>  #define _ASM_X86_FIXMAP_H
>  
> +#include <asm/kmap_size.h>
> +
>  /*
>   * Exposed to assembly code for setting up initial page tables. Cannot be
>   * calculated in assembly code (fixmap entries are an enum), but is sanity
>   * checked in the actual fixmap C code to make sure that the fixmap is
>   * covered fully.
>   */
> -#define FIXMAP_PMD_NUM	2
> +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
> +# define FIXMAP_PMD_NUM	2
> +#else
> +# define KM_PMDS	(KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512))
> +# define FIXMAP_PMD_NUM (KM_PMDS + 2)
> +#endif
>  /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
>  #define FIXMAP_PMD_TOP	507
>  
> @@ -31,7 +38,6 @@
>  #include <asm/pgtable_types.h>
>  #ifdef CONFIG_X86_32
>  #include <linux/threads.h>
> -#include <asm/kmap_size.h>
>  #else
>  #include <uapi/asm/vsyscall.h>
>  #endif
> @@ -92,7 +98,7 @@ enum fixed_addresses {
>  	FIX_IO_APIC_BASE_0,
>  	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
>  #endif
> -#ifdef CONFIG_X86_32
> +#ifdef CONFIG_KMAP_LOCAL
>  	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
>  	FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
>  #ifdef CONFIG_PCI_MMCONFIG
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d;
>  
>  #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
>  /* The module sections ends with the start of the fixmap */
> -#define MODULES_END		_AC(0xffffffffff000000, UL)
> +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
> +# define MODULES_END		_AC(0xffffffffff000000, UL)
> +#else
> +# define MODULES_END		_AC(0xfffffffffe000000, UL)
> +#endif
>  #define MODULES_LEN		(MODULES_END - MODULES_VADDR)
>  
>  #define ESPFIX_PGD_ENTRY	_AC(-2, UL)
> 
>
Linus Torvalds Jan. 7, 2021, 1:03 a.m. UTC | #2
On Wed, Jan 6, 2021 at 3:01 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> I triggered the following crash on x86_32 by simply doing a:
>
> (ssh'ing into the box)
>
>   # head -100 /tmp/output-file
>
> Where the /tmp/output-file was the output of a trace-cmd report.
> Even after rebooting and not running the tracing code, simply doing the
> head command still crashed.

The code decodes to

   0:   3b 5d e8                cmp    -0x18(%ebp),%ebx
   3:   0f 47 5d e8             cmova  -0x18(%ebp),%ebx
   7:   c7 45 e0 00 00 00 00    movl   $0x0,-0x20(%ebp)
   e:   8b 7d e0                mov    -0x20(%ebp),%edi
  11:   39 7d e8                cmp    %edi,-0x18(%ebp)
  14:   76 3a                   jbe    0x50
  16:   8b 45 d4                mov    -0x2c(%ebp),%eax
  19:   e8 a4 e4 ff ff          call   0xffffe4c2
  1e:   8b 55 e4                mov    -0x1c(%ebp),%edx
  21:   03 55 e0                add    -0x20(%ebp),%edx
  24:   89 d9                   mov    %ebx,%ecx
  26:   01 c6                   add    %eax,%esi
  28:   89 d7                   mov    %edx,%edi
  2a:*  f3 a4                   rep movsb %ds:(%esi),%es:(%edi)
 <-- trapping instruction
  2c:   e8 c9 e4 ff ff          call   0xffffe4fa
  31:   01 5d e0                add    %ebx,-0x20(%ebp)
  34:   8b 5d e8                mov    -0x18(%ebp),%ebx
  37:   b8 00 10 00 00          mov    $0x1000,%eax
  3c:   2b 5d e0                sub    -0x20(%ebp),%ebx

and while it would be good to see the output of
scripts/decode_stacktrace.sh, I strongly suspect that the above is

                                vaddr = kmap_atomic(p);
                                memcpy(to + copied, vaddr + p_off, p_len);
                                kunmap_atomic(vaddr);

(although I wonder how/why the heck you've enabled
CC_OPTIMIZE_FOR_SIZE=y, which is what causes "memcpy()" to be done as
that "rep movsb". I thought we disabled it because it's so bad on most
cpus).

So that first "call" instruction is the kmap_atomic(), the "rep movs"
is the memcpy(), and the "call" instruction immediately after is the
kunmap_atomic().

Anyway, you can see vaddr in register state:

        EAX: fff57000

so we've kmapped that one page at fff57000, but we're accessing past
it into the next page:

> BUG: unable to handle page fault for address: fff58000

with the current source address being (ESI: fff58000) and we still
have 248 bytes to go (ECX: 000000f8) even though we've already
overflowed into the next page.

You can see the original count still (EBX: 000005a8), so it really
looks like that skb_frag_foreach_page() logic

                        skb_frag_foreach_page(f,
                                              skb_frag_off(f) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(to + copied, vaddr + p_off, p_len);
                                kunmap_atomic(vaddr);
                        }

must be wrong, and doesn't handle the "each page" part properly. It
must have started in the middle of the page, and p_len (that 0x5a8)
was wrong.

IOW, it really looks like p_off + p_len had the value 0x10f8, which is
larger than one page. And looking at the code, in
skb_frag_foreach_page(), I see:

             p_off = (f_off) & (PAGE_SIZE - 1),                         \
             p_len = skb_frag_must_loop(p) ?                            \
             min_t(u32, f_len, PAGE_SIZE - p_off) : f_len,              \

where that "min_t(u32, f_len, PAGE_SIZE - p_off)" looks correct, but
then presumably skb_frag_must_loop() must be wrong.

Oh, and when I look at that, I see

    static inline bool skb_frag_must_loop(struct page *p)
    {
    #if defined(CONFIG_HIGHMEM)
            if (PageHighMem(p))
                    return true;
    #endif
            return false;
    }

and that is no longer true. With the kmap debugging, even non-highmem
pages need that "do one page at a time" code, because even non-highmem
pages get remapped by kmap().

IOW, I think the patch to fix this might be something like the attached.

I wonder whether there is other code that "knows" about kmap() only
affecting PageHighmem() pages thing that is no longer true.

Looking at some other code, skb_gro_reset_offset() looks suspiciously
like it also thinks highmem pages are special.

Adding the networking people involved in this area to the cc too.

               Linus
Steven Rostedt Jan. 7, 2021, 1:16 a.m. UTC | #3
On Wed, 6 Jan 2021 17:03:48 -0800
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> (although I wonder how/why the heck you've enabled
> CC_OPTIMIZE_FOR_SIZE=y, which is what causes "memcpy()" to be done as
> that "rep movsb". I thought we disabled it because it's so bad on most
> cpus).

Why?

Because to test x86_32, I have a Fedora Core 13 (yes 13!) partition
(baremetal) that I use. And the .config I use for it hasn't changed
since that time ;-) (except to add new features that I want to test on
x86_32).

Anyway, I'll test out your patch. Thanks for investigating this.

-- Steve
Steven Rostedt Jan. 7, 2021, 1:49 a.m. UTC | #4
On Wed, 6 Jan 2021 17:03:48 -0800
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -366,7 +366,7 @@ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
>  static inline bool skb_frag_must_loop(struct page *p)
>  {
>  #if defined(CONFIG_HIGHMEM)
> -	if (PageHighMem(p))
> +	if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p))
>  		return true;
>  #endif
>  	return false;

I applied this and I haven't been able to crash it again.

Thanks,

-- Steve
Jakub Kicinski Jan. 7, 2021, 1:49 a.m. UTC | #5
On Wed, 6 Jan 2021 17:03:48 -0800 Linus Torvalds wrote:
> I wonder whether there is other code that "knows" about kmap() only
> affecting PageHighmem() pages thing that is no longer true.
> 
> Looking at some other code, skb_gro_reset_offset() looks suspiciously
> like it also thinks highmem pages are special.
> 
> Adding the networking people involved in this area to the cc too.

Thanks for the detailed analysis! skb_gro_reset_offset() checks whether
the kernel can read data in the fragments directly, as an optimization
for the case where the entire header is in a fragment.

IIUC DEBUG_KMAP_LOCAL_FORCE_MAP only affects the mappings from
explicit kmap calls, which GRO won't make - it will fall back to
pulling the header out of the fragment and end up in skb_copy_bits(),
i.e. the loop you fixed. So GRO should be good. I think..
Willem de Bruijn Jan. 7, 2021, 2:11 a.m. UTC | #6
On Wed, Jan 6, 2021 at 8:49 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 6 Jan 2021 17:03:48 -0800 Linus Torvalds wrote:
> > I wonder whether there is other code that "knows" about kmap() only
> > affecting PageHighmem() pages thing that is no longer true.
> >
> > Looking at some other code, skb_gro_reset_offset() looks suspiciously
> > like it also thinks highmem pages are special.
> >
> > Adding the networking people involved in this area to the cc too.
>
> Thanks for the detailed analysis! skb_gro_reset_offset() checks if
> kernel can read data in the fragments directly as an optimization,
> in case the entire header is in a fragment.
>
> IIUC DEBUG_KMAP_LOCAL_FORCE_MAP only affects the mappings from
> explicit kmap calls, which GRO won't make - it will fall back to
> pulling the header out of the fragment and end up in skb_copy_bits(),
> i.e. the loop you fixed. So GRO should be good. I think..

Agreed. That code in skb_gro_reset_offset skips the GRO frag0
optimization in various cases, including if the first fragment is in
high mem.

That specific check goes back to the introduction of the frag0
optimization in commit 86911732d399 ("gro: Avoid copying headers of
unmerged packets"), at the time in helper skb_gro_header().

Very glad to hear that the fix addresses the crash in
skb_frag_foreach_page. Thanks!
Willem de Bruijn Jan. 7, 2021, 4:44 a.m. UTC | #7
On Wed, Jan 6, 2021 at 9:11 PM Willem de Bruijn <willemb@google.com> wrote:
>
> On Wed, Jan 6, 2021 at 8:49 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Wed, 6 Jan 2021 17:03:48 -0800 Linus Torvalds wrote:
> > > I wonder whether there is other code that "knows" about kmap() only
> > > affecting PageHighmem() pages thing that is no longer true.
> > >
> > > Looking at some other code, skb_gro_reset_offset() looks suspiciously
> > > like it also thinks highmem pages are special.
> > >
> > > Adding the networking people involved in this area to the cc too.

But there are three other kmap_atomic callers under net/ that do not
loop at all, so assume non-compound pages. In esp_output_head,
esp6_output_head and skb_seq_read. The first two directly use
skb_page_frag_refill, which can allocate compound (but not
__GFP_HIGHMEM) pages, and the third can be inserted with
netfilter xt_string in the path of tcp transmit skbs, which can also
have compound pages. I think that these could similarly access
data beyond the end of the kmap_atomic mapped page. I'll take
a closer look.
Linus Torvalds Jan. 7, 2021, 7:47 p.m. UTC | #8
On Wed, Jan 6, 2021 at 8:45 PM Willem de Bruijn <willemb@google.com> wrote:
>
> But there are three other kmap_atomic callers under net/ that do not
> loop at all, so assume non-compound pages. In esp_output_head,
> esp6_output_head and skb_seq_read. The first two directly use
> skb_page_frag_refill, which can allocate compound (but not
> __GFP_HIGHMEM) pages, and the third can be inserted with
> netfilter xt_string in the path of tcp transmit skbs, which can also
> have compound pages. I think that these could similarly access
> data beyond the end of the kmap_atomic mapped page. I'll take
> a closer look.

Thanks.

Note that I have flushed my random one-liner patch from my system, and
expect to get a proper fix through the normal networking pulls.

And _if_ the networking people feel that my one-liner was the proper
fix, you can use it and add my sign-off if you want to, but it really
was more of a "this is the quick ugly fix for testing" rather than
anything else.

          Linus
Steven Rostedt Jan. 7, 2021, 8:52 p.m. UTC | #9
On Thu, 7 Jan 2021 11:47:02 -0800
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Jan 6, 2021 at 8:45 PM Willem de Bruijn <willemb@google.com> wrote:
> >
> > But there are three other kmap_atomic callers under net/ that do not
> > loop at all, so assume non-compound pages. In esp_output_head,
> > esp6_output_head and skb_seq_read. The first two directly use
> > skb_page_frag_refill, which can allocate compound (but not
> > __GFP_HIGHMEM) pages, and the third can be inserted with
> > netfilter xt_string in the path of tcp transmit skbs, which can also
> > have compound pages. I think that these could similarly access
> > data beyond the end of the kmap_atomic mapped page. I'll take
> > a closer look.  
> 
> Thanks.
> 
> Note that I have flushed my random one-liner patch from my system, and
> expect to get a proper fix through the normal networking pulls.
> 
> And _if_ the networking people feel that my one-liner was the proper
> fix, you can use it and add my sign-off if you want to, but it really
> was more of a "this is the quick ugly fix for testing" rather than
> anything else.
> 

Please add:

  Link: https://lore.kernel.org/linux-mm/20210106180132.41dc249d@gandalf.local.home/
  Reported-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

And if you take Linus's patch, please add my:

  Tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

and if you come up with another patch, please send it to me for testing.

Thanks!

-- Steve
Willem de Bruijn Jan. 7, 2021, 9:07 p.m. UTC | #10
On Thu, Jan 7, 2021 at 3:53 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 7 Jan 2021 11:47:02 -0800
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> > On Wed, Jan 6, 2021 at 8:45 PM Willem de Bruijn <willemb@google.com> wrote:
> > >
> > > But there are three other kmap_atomic callers under net/ that do not
> > > loop at all, so assume non-compound pages. In esp_output_head,
> > > esp6_output_head and skb_seq_read. The first two directly use
> > > skb_page_frag_refill, which can allocate compound (but not
> > > __GFP_HIGHMEM) pages, and the third can be inserted with
> > > netfilter xt_string in the path of tcp transmit skbs, which can also
> > > have compound pages. I think that these could similarly access
> > > data beyond the end of the kmap_atomic mapped page. I'll take
> > > a closer look.
> >
> > Thanks.
> >
> > Note that I have flushed my random one-liner patch from my system, and
> > expect to get a proper fix through the normal networking pulls.
> >
> > And _if_ the networking people feel that my one-liner was the proper
> > fix, you can use it and add my sign-off if you want to, but it really
> > was more of a "this is the quick ugly fix for testing" rather than
> > anything else.

I do think it is the proper fix as is. If no one else has comments, I
can submit it through the net tree.

It won't address the other issues that became apparent only as a
result of this. I'm preparing separate patches for those.

> Please add:
>
>   Link: https://lore.kernel.org/linux-mm/20210106180132.41dc249d@gandalf.local.home/
>   Reported-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
>
> And if you take Linus's patch, please add my:
>
>   Tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
>
> and if you come up with another patch, please send it to me for testing.
>
> Thanks!

Will do, thanks.
diff mbox series

Patch

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@  config X86
 	select ARCH_SUPPORTS_ACPI
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
+	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,13 +14,20 @@ 
 #ifndef _ASM_X86_FIXMAP_H
 #define _ASM_X86_FIXMAP_H
 
+#include <asm/kmap_size.h>
+
 /*
  * Exposed to assembly code for setting up initial page tables. Cannot be
  * calculated in assembly code (fixmap entries are an enum), but is sanity
  * checked in the actual fixmap C code to make sure that the fixmap is
  * covered fully.
  */
-#define FIXMAP_PMD_NUM	2
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define FIXMAP_PMD_NUM	2
+#else
+# define KM_PMDS	(KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512))
+# define FIXMAP_PMD_NUM (KM_PMDS + 2)
+#endif
 /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
 #define FIXMAP_PMD_TOP	507
 
@@ -31,7 +38,6 @@ 
 #include <asm/pgtable_types.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
-#include <asm/kmap_size.h>
 #else
 #include <uapi/asm/vsyscall.h>
 #endif
@@ -92,7 +98,7 @@  enum fixed_addresses {
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
 #endif
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_KMAP_LOCAL
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
 #ifdef CONFIG_PCI_MMCONFIG
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -143,7 +143,11 @@  extern unsigned int ptrs_per_p4d;
 
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
-#define MODULES_END		_AC(0xffffffffff000000, UL)
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define MODULES_END		_AC(0xffffffffff000000, UL)
+#else
+# define MODULES_END		_AC(0xfffffffffe000000, UL)
+#endif
 #define MODULES_LEN		(MODULES_END - MODULES_VADDR)
 
 #define ESPFIX_PGD_ENTRY	_AC(-2, UL)