diff mbox series

[3/3] s390/mm: fix mis-accounting of pgtable_bytes

Message ID 1539621759-5967-4-git-send-email-schwidefsky@de.ibm.com (mailing list archive)
State New, archived
Headers show
Series pgtable bytes mis-accounting v2 | expand

Commit Message

Martin Schwidefsky Oct. 15, 2018, 4:42 p.m. UTC
In case a fork or a clone system fails in copy_process and the error
handling does the mmput() at the bad_fork_cleanup_mm label, the
following warning messages will appear on the console:

  BUG: non-zero pgtables_bytes on freeing mm: 16384

The reason for that is the tricks we play with mm_inc_nr_puds() and
mm_inc_nr_pmds() in init_new_context().

A normal 64-bit process has 3 levels of page table, the p4d level and
the pud level are folded. On process termination the free_pud_range()
function in mm/memory.c will subtract 16KB from pgtable_bytes with a
mm_dec_nr_puds() call, but there actually is not really a pud table.

One issue with this is the fact that pgtable_bytes is usually off
by a few kilobytes, but the more severe problem is that for a failed
fork or clone the free_pgtables() function is not called. In this case
there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with
the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context().
The pgtable_bytes will be off by 16384 or 32768 bytes and we get the
BUG message. The message itself is purely cosmetic, but annoying.

To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded
function to check for the true size of the address space.

Reported-by: Li Wang <liwang@redhat.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/mmu_context.h |  5 -----
 arch/s390/include/asm/pgalloc.h     |  6 +++---
 arch/s390/include/asm/pgtable.h     | 18 ++++++++++++++++++
 arch/s390/include/asm/tlb.h         |  6 +++---
 4 files changed, 24 insertions(+), 11 deletions(-)

Comments

Li Wang Oct. 31, 2018, 6:18 a.m. UTC | #1
On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky <schwidefsky@de.ibm.com
> wrote:

> In case a fork or a clone system fails in copy_process and the error
> handling does the mmput() at the bad_fork_cleanup_mm label, the
> following warning messages will appear on the console:
>
>   BUG: non-zero pgtables_bytes on freeing mm: 16384
>
> The reason for that is the tricks we play with mm_inc_nr_puds() and
> mm_inc_nr_pmds() in init_new_context().
>
> A normal 64-bit process has 3 levels of page table, the p4d level and
> the pud level are folded. On process termination the free_pud_range()
> function in mm/memory.c will subtract 16KB from pgtable_bytes with a
> mm_dec_nr_puds() call, but there actually is not really a pud table.
>
> One issue with this is the fact that pgtable_bytes is usually off
> by a few kilobytes, but the more severe problem is that for a failed
> fork or clone the free_pgtables() function is not called. In this case
> there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with
> the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context().
> The pgtable_bytes will be off by 16384 or 32768 bytes and we get the
> BUG message. The message itself is purely cosmetic, but annoying.
>
> To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded
> function to check for the true size of the address space.
>

I can confirm that it works to the problem, the warning message is gone
after applying this patch on s390x. And I also done ltp syscalls/cve test
for the patch set on x86_64 arch, there has no new regression.

Tested-by: Li Wang <liwang@redhat.com>


> Reported-by: Li Wang <liwang@redhat.com>
> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
> ---
>  arch/s390/include/asm/mmu_context.h |  5 -----
>  arch/s390/include/asm/pgalloc.h     |  6 +++---
>  arch/s390/include/asm/pgtable.h     | 18 ++++++++++++++++++
>  arch/s390/include/asm/tlb.h         |  6 +++---
>  4 files changed, 24 insertions(+), 11 deletions(-)
>
> diff --git a/arch/s390/include/asm/mmu_context.h
> b/arch/s390/include/asm/mmu_context.h
> index 0717ee76885d..f1ab9420ccfb 100644
> --- a/arch/s390/include/asm/mmu_context.h
> +++ b/arch/s390/include/asm/mmu_context.h
> @@ -45,8 +45,6 @@ static inline int init_new_context(struct task_struct
> *tsk,
>                 mm->context.asce_limit = STACK_TOP_MAX;
>                 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
>                                    _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
> -               /* pgd_alloc() did not account this pud */
> -               mm_inc_nr_puds(mm);
>                 break;
>         case -PAGE_SIZE:
>                 /* forked 5-level task, set new asce with new_mm->pgd */
> @@ -62,9 +60,6 @@ static inline int init_new_context(struct task_struct
> *tsk,
>                 /* forked 2-level compat task, set new asce with new
> mm->pgd */
>                 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
>                                    _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
> -               /* pgd_alloc() did not account this pmd */
> -               mm_inc_nr_pmds(mm);
> -               mm_inc_nr_puds(mm);
>         }
>         crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
>         return 0;
> diff --git a/arch/s390/include/asm/pgalloc.h
> b/arch/s390/include/asm/pgalloc.h
> index f0f9bcf94c03..5ee733720a57 100644
> --- a/arch/s390/include/asm/pgalloc.h
> +++ b/arch/s390/include/asm/pgalloc.h
> @@ -36,11 +36,11 @@ static inline void crst_table_init(unsigned long
> *crst, unsigned long entry)
>
>  static inline unsigned long pgd_entry_type(struct mm_struct *mm)
>  {
> -       if (mm->context.asce_limit <= _REGION3_SIZE)
> +       if (mm_pmd_folded(mm))
>                 return _SEGMENT_ENTRY_EMPTY;
> -       if (mm->context.asce_limit <= _REGION2_SIZE)
> +       if (mm_pud_folded(mm))
>                 return _REGION3_ENTRY_EMPTY;
> -       if (mm->context.asce_limit <= _REGION1_SIZE)
> +       if (mm_p4d_folded(mm))
>                 return _REGION2_ENTRY_EMPTY;
>         return _REGION1_ENTRY_EMPTY;
>  }
> diff --git a/arch/s390/include/asm/pgtable.h
> b/arch/s390/include/asm/pgtable.h
> index 0e7cb0dc9c33..de05466ce50c 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -485,6 +485,24 @@ static inline int is_module_addr(void *addr)
>                                    _REGION_ENTRY_PROTECT | \
>                                    _REGION_ENTRY_NOEXEC)
>
> +static inline bool mm_p4d_folded(struct mm_struct *mm)
> +{
> +       return mm->context.asce_limit <= _REGION1_SIZE;
> +}
> +#define mm_p4d_folded(mm) mm_p4d_folded(mm)
> +
> +static inline bool mm_pud_folded(struct mm_struct *mm)
> +{
> +       return mm->context.asce_limit <= _REGION2_SIZE;
> +}
> +#define mm_pud_folded(mm) mm_pud_folded(mm)
> +
> +static inline bool mm_pmd_folded(struct mm_struct *mm)
> +{
> +       return mm->context.asce_limit <= _REGION3_SIZE;
> +}
> +#define mm_pmd_folded(mm) mm_pmd_folded(mm)
> +
>  static inline int mm_has_pgste(struct mm_struct *mm)
>  {
>  #ifdef CONFIG_PGSTE
> diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
> index 457b7ba0fbb6..b31c779cf581 100644
> --- a/arch/s390/include/asm/tlb.h
> +++ b/arch/s390/include/asm/tlb.h
> @@ -136,7 +136,7 @@ static inline void pte_free_tlb(struct mmu_gather
> *tlb, pgtable_t pte,
>  static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
>                                 unsigned long address)
>  {
> -       if (tlb->mm->context.asce_limit <= _REGION3_SIZE)
> +       if (mm_pmd_folded(tlb->mm))
>                 return;
>         pgtable_pmd_page_dtor(virt_to_page(pmd));
>         tlb_remove_table(tlb, pmd);
> @@ -152,7 +152,7 @@ static inline void pmd_free_tlb(struct mmu_gather
> *tlb, pmd_t *pmd,
>  static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
>                                 unsigned long address)
>  {
> -       if (tlb->mm->context.asce_limit <= _REGION1_SIZE)
> +       if (mm_p4d_folded(tlb->mm))
>                 return;
>         tlb_remove_table(tlb, p4d);
>  }
> @@ -167,7 +167,7 @@ static inline void p4d_free_tlb(struct mmu_gather
> *tlb, p4d_t *p4d,
>  static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
>                                 unsigned long address)
>  {
> -       if (tlb->mm->context.asce_limit <= _REGION2_SIZE)
> +       if (mm_pud_folded(tlb->mm))
>                 return;
>         tlb_remove_table(tlb, pud);
>  }
> --
> 2.16.4
>
>
Martin Schwidefsky Oct. 31, 2018, 6:31 a.m. UTC | #2
On Wed, 31 Oct 2018 14:18:33 +0800
Li Wang <liwang@redhat.com> wrote:

> On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky <schwidefsky@de.ibm.com
> > wrote:  
> 
> > In case a fork or a clone system fails in copy_process and the error
> > handling does the mmput() at the bad_fork_cleanup_mm label, the
> > following warning messages will appear on the console:
> >
> >   BUG: non-zero pgtables_bytes on freeing mm: 16384
> >
> > The reason for that is the tricks we play with mm_inc_nr_puds() and
> > mm_inc_nr_pmds() in init_new_context().
> >
> > A normal 64-bit process has 3 levels of page table, the p4d level and
> > the pud level are folded. On process termination the free_pud_range()
> > function in mm/memory.c will subtract 16KB from pgtable_bytes with a
> > mm_dec_nr_puds() call, but there actually is not really a pud table.
> >
> > One issue with this is the fact that pgtable_bytes is usually off
> > by a few kilobytes, but the more severe problem is that for a failed
> > fork or clone the free_pgtables() function is not called. In this case
> > there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with
> > the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context().
> > The pgtable_bytes will be off by 16384 or 32768 bytes and we get the
> > BUG message. The message itself is purely cosmetic, but annoying.
> >
> > To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded
> > function to check for the true size of the address space.
> >  
> 
> I can confirm that it works to the problem, the warning message is gone
> after applying this patch on s390x. And I also done ltp syscalls/cve test
> for the patch set on x86_64 arch, there has no new regression.
> 
> Tested-by: Li Wang <liwang@redhat.com>

Thanks for testing. Unfortunately Heiko reported another issue yesterday
with the patch applied. This time the other way around:

BUG: non-zero pgtables_bytes on freeing mm: -16384

I am trying to understand how this can happen. For now I would like to
keep the patch on hold in case they need another change.
Li Wang Oct. 31, 2018, 6:43 a.m. UTC | #3
On Wed, Oct 31, 2018 at 2:31 PM, Martin Schwidefsky <schwidefsky@de.ibm.com>
wrote:

> On Wed, 31 Oct 2018 14:18:33 +0800
> Li Wang <liwang@redhat.com> wrote:
>
> > On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky <
> schwidefsky@de.ibm.com
> > > wrote:
> >
> > > In case a fork or a clone system fails in copy_process and the error
> > > handling does the mmput() at the bad_fork_cleanup_mm label, the
> > > following warning messages will appear on the console:
> > >
> > >   BUG: non-zero pgtables_bytes on freeing mm: 16384
> > >
> > > The reason for that is the tricks we play with mm_inc_nr_puds() and
> > > mm_inc_nr_pmds() in init_new_context().
> > >
> > > A normal 64-bit process has 3 levels of page table, the p4d level and
> > > the pud level are folded. On process termination the free_pud_range()
> > > function in mm/memory.c will subtract 16KB from pgtable_bytes with a
> > > mm_dec_nr_puds() call, but there actually is not really a pud table.
> > >
> > > One issue with this is the fact that pgtable_bytes is usually off
> > > by a few kilobytes, but the more severe problem is that for a failed
> > > fork or clone the free_pgtables() function is not called. In this case
> > > there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with
> > > the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context().
> > > The pgtable_bytes will be off by 16384 or 32768 bytes and we get the
> > > BUG message. The message itself is purely cosmetic, but annoying.
> > >
> > > To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded
> > > function to check for the true size of the address space.
> > >
> >
> > I can confirm that it works to the problem, the warning message is gone
> > after applying this patch on s390x. And I also done ltp syscalls/cve test
> > for the patch set on x86_64 arch, there has no new regression.
> >
> > Tested-by: Li Wang <liwang@redhat.com>
>
> Thanks for testing. Unfortunately Heiko reported another issue yesterday
> with the patch applied. This time the other way around:
>
> BUG: non-zero pgtables_bytes on freeing mm: -16384
>

Okay, the problem is still triggered by LTP/cve-2017-17052.c?
I tried this patch on my platform and it works! My test environment as:

# lscpu
Architecture:          s390x
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Big Endian
CPU(s):                2
On-line CPU(s) list:   0,1
Thread(s) per core:    1
Core(s) per socket:    1
Socket(s) per book:    1
Book(s) per drawer:    1
Drawer(s):             2
Vendor ID:             IBM/S390
Machine type:          2827
CPU dynamic MHz:       5504
CPU static MHz:        5504
BogoMIPS:              2913.00
Hypervisor vendor:     vertical
Virtualization type:   full
Dispatching mode:      horizontal
L1d cache:             96K
L1i cache:             64K
L2d cache:             1024K
L2i cache:             1024K
L3 cache:              49152K
L4 cache:              393216K
Flags:                 esan3 zarch stfle msa ldisp eimm dfp edat etf3eh
highgprs te sie


> I am trying to understand how this can happen. For now I would like to
> keep the patch on hold in case they need another change.
>

Sure.

>
> --
> blue skies,
>    Martin.
>
> "Reality continues to ruin my life." - Calvin.
>
>
Martin Schwidefsky Oct. 31, 2018, 6:46 a.m. UTC | #4
On Wed, 31 Oct 2018 14:43:38 +0800
Li Wang <liwang@redhat.com> wrote:

> On Wed, Oct 31, 2018 at 2:31 PM, Martin Schwidefsky <schwidefsky@de.ibm.com>
> wrote:
> 
> > On Wed, 31 Oct 2018 14:18:33 +0800
> > Li Wang <liwang@redhat.com> wrote:
> >  
> > > On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky <  
> > schwidefsky@de.ibm.com  
> > > > wrote:  
> > >  
> > > > In case a fork or a clone system fails in copy_process and the error
> > > > handling does the mmput() at the bad_fork_cleanup_mm label, the
> > > > following warning messages will appear on the console:
> > > >
> > > >   BUG: non-zero pgtables_bytes on freeing mm: 16384
> > > >
> > > > The reason for that is the tricks we play with mm_inc_nr_puds() and
> > > > mm_inc_nr_pmds() in init_new_context().
> > > >
> > > > A normal 64-bit process has 3 levels of page table, the p4d level and
> > > > the pud level are folded. On process termination the free_pud_range()
> > > > function in mm/memory.c will subtract 16KB from pgtable_bytes with a
> > > > mm_dec_nr_puds() call, but there actually is not really a pud table.
> > > >
> > > > One issue with this is the fact that pgtable_bytes is usually off
> > > > by a few kilobytes, but the more severe problem is that for a failed
> > > > fork or clone the free_pgtables() function is not called. In this case
> > > > there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with
> > > > the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context().
> > > > The pgtable_bytes will be off by 16384 or 32768 bytes and we get the
> > > > BUG message. The message itself is purely cosmetic, but annoying.
> > > >
> > > > To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded
> > > > function to check for the true size of the address space.
> > > >  
> > >
> > > I can confirm that it works to the problem, the warning message is gone
> > > after applying this patch on s390x. And I also done ltp syscalls/cve test
> > > for the patch set on x86_64 arch, there has no new regression.
> > >
> > > Tested-by: Li Wang <liwang@redhat.com>  
> >
> > Thanks for testing. Unfortunately Heiko reported another issue yesterday
> > with the patch applied. This time the other way around:
> >
> > BUG: non-zero pgtables_bytes on freeing mm: -16384
> >  
> 
> Okay, the problem is still triggered by LTP/cve-2017-17052.c?

No, unfortunately we do not have a simple testcase to trigger this new bug.
It happened once with one of our test kernels, the path that leads to this
is completely unclear.
Martin Schwidefsky Oct. 31, 2018, 9:39 a.m. UTC | #5
On Wed, 31 Oct 2018 07:46:47 +0100
Martin Schwidefsky <schwidefsky@de.ibm.com> wrote:

> On Wed, 31 Oct 2018 14:43:38 +0800
> Li Wang <liwang@redhat.com> wrote:
> 
> > On Wed, Oct 31, 2018 at 2:31 PM, Martin Schwidefsky <schwidefsky@de.ibm.com>
> > wrote:
> >   
> > > BUG: non-zero pgtables_bytes on freeing mm: -16384
> > >    
> > 
> > Okay, the problem is still triggered by LTP/cve-2017-17052.c?  
> 
> No, unfortunately we do not have a simple testcase to trigger this new bug.
> It happened once with one of our test kernels, the path that leads to this
> is completely unclear.
 
Ok, got it. There is a mm_inc_nr_puds(mm) missing in the s390 code:

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 76d89ee8b428..814f26520aa2 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -101,6 +101,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
                        mm->context.asce_limit = _REGION1_SIZE;
                        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                                _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+                       mm_inc_nr_puds(mm);
                } else {
                        crst_table_init(table, _REGION1_ENTRY_EMPTY);
                        pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);

One of our test-cases did an upgrade of a 3-level page table.
I'll update the patch and send a v3.
Heiko Carstens Oct. 31, 2018, 10:09 a.m. UTC | #6
On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote:
> Thanks for testing. Unfortunately Heiko reported another issue yesterday
> with the patch applied. This time the other way around:
> 
> BUG: non-zero pgtables_bytes on freeing mm: -16384
> 
> I am trying to understand how this can happen. For now I would like to
> keep the patch on hold in case they need another change.

FWIW, Kirill: is there a reason why this "BUG:" output is done with
pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants?

That would to get more information with DEBUG_VM and / or
panic_on_warn=1 set. At least for automated testing it would be nice
to have such triggers.
Kirill A. Shutemov Oct. 31, 2018, 10:36 a.m. UTC | #7
On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote:
> On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote:
> > Thanks for testing. Unfortunately Heiko reported another issue yesterday
> > with the patch applied. This time the other way around:
> > 
> > BUG: non-zero pgtables_bytes on freeing mm: -16384
> > 
> > I am trying to understand how this can happen. For now I would like to
> > keep the patch on hold in case they need another change.
> 
> FWIW, Kirill: is there a reason why this "BUG:" output is done with
> pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants?
> 
> That would to get more information with DEBUG_VM and / or
> panic_on_warn=1 set. At least for automated testing it would be nice
> to have such triggers.

Stack trace is not helpful there. It will always show the exit path which
is useless.
Heiko Carstens Nov. 27, 2018, 7:34 a.m. UTC | #8
On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote:
> On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote:
> > On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote:
> > > Thanks for testing. Unfortunately Heiko reported another issue yesterday
> > > with the patch applied. This time the other way around:
> > > 
> > > BUG: non-zero pgtables_bytes on freeing mm: -16384
> > > 
> > > I am trying to understand how this can happen. For now I would like to
> > > keep the patch on hold in case they need another change.
> > 
> > FWIW, Kirill: is there a reason why this "BUG:" output is done with
> > pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants?
> > 
> > That would to get more information with DEBUG_VM and / or
> > panic_on_warn=1 set. At least for automated testing it would be nice
> > to have such triggers.
> 
> Stack trace is not helpful there. It will always show the exit path which
> is useless.

So, even with the updated version of these patches I can flood dmesg
and the console with

BUG: non-zero pgtables_bytes on freeing mm: 16384

messages with this complex reproducer on s390:

echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat

Besides that this needs to be fixed, I'd really like to see this
changed to either a printk_once() or a WARN_ON_ONCE() within
check_mm() so that an arbitrary user cannot flood the console.

E.g. something like the below. If there aren't any objections, I will
provide a proper patch with changelog, etc.

diff --git a/kernel/fork.c b/kernel/fork.c
index 07cddff89c7b..d7aeec03c57f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm)
 	}
 
 	if (mm_pgtables_bytes(mm))
-		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
-				mm_pgtables_bytes(mm));
+		printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+			    mm_pgtables_bytes(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
Kirill A. Shutemov Nov. 27, 2018, 8:05 a.m. UTC | #9
On Tue, Nov 27, 2018 at 08:34:12AM +0100, Heiko Carstens wrote:
> On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote:
> > On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote:
> > > On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote:
> > > > Thanks for testing. Unfortunately Heiko reported another issue yesterday
> > > > with the patch applied. This time the other way around:
> > > > 
> > > > BUG: non-zero pgtables_bytes on freeing mm: -16384
> > > > 
> > > > I am trying to understand how this can happen. For now I would like to
> > > > keep the patch on hold in case they need another change.
> > > 
> > > FWIW, Kirill: is there a reason why this "BUG:" output is done with
> > > pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants?
> > > 
> > > That would to get more information with DEBUG_VM and / or
> > > panic_on_warn=1 set. At least for automated testing it would be nice
> > > to have such triggers.
> > 
> > Stack trace is not helpful there. It will always show the exit path which
> > is useless.
> 
> So, even with the updated version of these patches I can flood dmesg
> and the console with
> 
> BUG: non-zero pgtables_bytes on freeing mm: 16384
> 
> messages with this complex reproducer on s390:
> 
> echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat
> 
> Besides that this needs to be fixed, I'd really like to see this
> changed to either a printk_once() or a WARN_ON_ONCE() within
> check_mm() so that an arbitrary user cannot flood the console.
> 
> E.g. something like the below. If there aren't any objections, I will
> provide a proper patch with changelog, etc.
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 07cddff89c7b..d7aeec03c57f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm)
>  	}
>  
>  	if (mm_pgtables_bytes(mm))
> -		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> -				mm_pgtables_bytes(mm));
> +		printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> +			    mm_pgtables_bytes(mm));

You can be the first user of pr_alert_once(). Don't miss a chance! ;)
Heiko Carstens Nov. 27, 2018, 8:13 a.m. UTC | #10
On Tue, Nov 27, 2018 at 11:05:15AM +0300, Kirill A. Shutemov wrote:
> > E.g. something like the below. If there aren't any objections, I will
> > provide a proper patch with changelog, etc.
> > 
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 07cddff89c7b..d7aeec03c57f 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm)
> >  	}
> >  
> >  	if (mm_pgtables_bytes(mm))
> > -		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> > -				mm_pgtables_bytes(mm));
> > +		printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> > +			    mm_pgtables_bytes(mm));
> 
> You can be the first user of pr_alert_once(). Don't miss a chance! ;)

I didn't expect that that one exists. ;) Will do.
Guenter Roeck Nov. 27, 2018, 11:47 a.m. UTC | #11
On 11/26/18 11:34 PM, Heiko Carstens wrote:
> On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote:
>> On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote:
>>> On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote:
>>>> Thanks for testing. Unfortunately Heiko reported another issue yesterday
>>>> with the patch applied. This time the other way around:
>>>>
>>>> BUG: non-zero pgtables_bytes on freeing mm: -16384
>>>>
>>>> I am trying to understand how this can happen. For now I would like to
>>>> keep the patch on hold in case they need another change.
>>>
>>> FWIW, Kirill: is there a reason why this "BUG:" output is done with
>>> pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants?
>>>
>>> That would to get more information with DEBUG_VM and / or
>>> panic_on_warn=1 set. At least for automated testing it would be nice
>>> to have such triggers.
>>
>> Stack trace is not helpful there. It will always show the exit path which
>> is useless.
> 
> So, even with the updated version of these patches I can flood dmesg
> and the console with
> 
> BUG: non-zero pgtables_bytes on freeing mm: 16384
> 
> messages with this complex reproducer on s390:
> 
> echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat
> 
> Besides that this needs to be fixed, I'd really like to see this
> changed to either a printk_once() or a WARN_ON_ONCE() within
> check_mm() so that an arbitrary user cannot flood the console.
> 
> E.g. something like the below. If there aren't any objections, I will
> provide a proper patch with changelog, etc.
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 07cddff89c7b..d7aeec03c57f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm)
>   	}
>   
>   	if (mm_pgtables_bytes(mm))
> -		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> -				mm_pgtables_bytes(mm));
> +		printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> +			    mm_pgtables_bytes(mm));
>   

pr_alert_once ?

Guenter

>   #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
>   	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
> 
>
Heiko Carstens Nov. 27, 2018, 11:52 a.m. UTC | #12
On Tue, Nov 27, 2018 at 03:47:13AM -0800, Guenter Roeck wrote:
> >E.g. something like the below. If there aren't any objections, I will
> >provide a proper patch with changelog, etc.
> >
> >diff --git a/kernel/fork.c b/kernel/fork.c
> >index 07cddff89c7b..d7aeec03c57f 100644
> >--- a/kernel/fork.c
> >+++ b/kernel/fork.c
> >@@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm)
> >  	}
> >  	if (mm_pgtables_bytes(mm))
> >-		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> >-				mm_pgtables_bytes(mm));
> >+		printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
> >+			    mm_pgtables_bytes(mm));
> 
> pr_alert_once ?

Already changed and posted:

https://lore.kernel.org/lkml/20181127083603.39041-1-heiko.carstens@de.ibm.com/
Martin Schwidefsky Nov. 27, 2018, 2:31 p.m. UTC | #13
On Tue, 27 Nov 2018 08:34:12 +0100
Heiko Carstens <heiko.carstens@de.ibm.com> wrote:

> On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote:
> > On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote:  
> > > On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote:  
> > > > Thanks for testing. Unfortunately Heiko reported another issue yesterday
> > > > with the patch applied. This time the other way around:
> > > > 
> > > > BUG: non-zero pgtables_bytes on freeing mm: -16384
> > > > 
> > > > I am trying to understand how this can happen. For now I would like to
> > > > keep the patch on hold in case they need another change.  
> > > 
> > > FWIW, Kirill: is there a reason why this "BUG:" output is done with
> > > pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants?
> > > 
> > > That would to get more information with DEBUG_VM and / or
> > > panic_on_warn=1 set. At least for automated testing it would be nice
> > > to have such triggers.  
> > 
> > Stack trace is not helpful there. It will always show the exit path which
> > is useless.  
> 
> So, even with the updated version of these patches I can flood dmesg
> and the console with
> 
> BUG: non-zero pgtables_bytes on freeing mm: 16384
> 
> messages with this complex reproducer on s390:
> 
> echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat

Forgot a hunk in the fix.. I claim not enough coffee :-/
Patch is queued and I will send a please pull by the end of the week.
--
From c0499f2aa853939984ecaf0d393012486e56c7ce Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 27 Nov 2018 14:04:04 +0100
Subject: [PATCH] s390/mm: correct pgtable_bytes on page table downgrade

The downgrade of a page table from 3 levels to 2 levels for a 31-bit compat
process removes a pmd table which has to be counted against pgtable_bytes.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/pgalloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 814f26520aa2..6791562779ee 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -131,6 +131,7 @@ void crst_table_downgrade(struct mm_struct *mm)
 	}
 
 	pgd = mm->pgd;
+	mm_dec_nr_pmds(mm);
 	mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
 	mm->context.asce_limit = _REGION3_SIZE;
 	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
diff mbox series

Patch

diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 0717ee76885d..f1ab9420ccfb 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -45,8 +45,6 @@  static inline int init_new_context(struct task_struct *tsk,
 		mm->context.asce_limit = STACK_TOP_MAX;
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
-		/* pgd_alloc() did not account this pud */
-		mm_inc_nr_puds(mm);
 		break;
 	case -PAGE_SIZE:
 		/* forked 5-level task, set new asce with new_mm->pgd */
@@ -62,9 +60,6 @@  static inline int init_new_context(struct task_struct *tsk,
 		/* forked 2-level compat task, set new asce with new mm->pgd */
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
-		/* pgd_alloc() did not account this pmd */
-		mm_inc_nr_pmds(mm);
-		mm_inc_nr_puds(mm);
 	}
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
 	return 0;
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index f0f9bcf94c03..5ee733720a57 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -36,11 +36,11 @@  static inline void crst_table_init(unsigned long *crst, unsigned long entry)
 
 static inline unsigned long pgd_entry_type(struct mm_struct *mm)
 {
-	if (mm->context.asce_limit <= _REGION3_SIZE)
+	if (mm_pmd_folded(mm))
 		return _SEGMENT_ENTRY_EMPTY;
-	if (mm->context.asce_limit <= _REGION2_SIZE)
+	if (mm_pud_folded(mm))
 		return _REGION3_ENTRY_EMPTY;
-	if (mm->context.asce_limit <= _REGION1_SIZE)
+	if (mm_p4d_folded(mm))
 		return _REGION2_ENTRY_EMPTY;
 	return _REGION1_ENTRY_EMPTY;
 }
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0e7cb0dc9c33..de05466ce50c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -485,6 +485,24 @@  static inline int is_module_addr(void *addr)
 				   _REGION_ENTRY_PROTECT | \
 				   _REGION_ENTRY_NOEXEC)
 
+static inline bool mm_p4d_folded(struct mm_struct *mm)
+{
+	return mm->context.asce_limit <= _REGION1_SIZE;
+}
+#define mm_p4d_folded(mm) mm_p4d_folded(mm)
+
+static inline bool mm_pud_folded(struct mm_struct *mm)
+{
+	return mm->context.asce_limit <= _REGION2_SIZE;
+}
+#define mm_pud_folded(mm) mm_pud_folded(mm)
+
+static inline bool mm_pmd_folded(struct mm_struct *mm)
+{
+	return mm->context.asce_limit <= _REGION3_SIZE;
+}
+#define mm_pmd_folded(mm) mm_pmd_folded(mm)
+
 static inline int mm_has_pgste(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 457b7ba0fbb6..b31c779cf581 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -136,7 +136,7 @@  static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
 				unsigned long address)
 {
-	if (tlb->mm->context.asce_limit <= _REGION3_SIZE)
+	if (mm_pmd_folded(tlb->mm))
 		return;
 	pgtable_pmd_page_dtor(virt_to_page(pmd));
 	tlb_remove_table(tlb, pmd);
@@ -152,7 +152,7 @@  static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
 static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
 				unsigned long address)
 {
-	if (tlb->mm->context.asce_limit <= _REGION1_SIZE)
+	if (mm_p4d_folded(tlb->mm))
 		return;
 	tlb_remove_table(tlb, p4d);
 }
@@ -167,7 +167,7 @@  static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
 static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 				unsigned long address)
 {
-	if (tlb->mm->context.asce_limit <= _REGION2_SIZE)
+	if (mm_pud_folded(tlb->mm))
 		return;
 	tlb_remove_table(tlb, pud);
 }