Message ID | 1539621759-5967-4-git-send-email-schwidefsky@de.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | pgtable bytes mis-accounting v2 | expand |
On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky <schwidefsky@de.ibm.com > wrote: > In case a fork or a clone system fails in copy_process and the error > handling does the mmput() at the bad_fork_cleanup_mm label, the > following warning messages will appear on the console: > > BUG: non-zero pgtables_bytes on freeing mm: 16384 > > The reason for that is the tricks we play with mm_inc_nr_puds() and > mm_inc_nr_pmds() in init_new_context(). > > A normal 64-bit process has 3 levels of page table, the p4d level and > the pud level are folded. On process termination the free_pud_range() > function in mm/memory.c will subtract 16KB from pgtable_bytes with a > mm_dec_nr_puds() call, but there actually is not really a pud table. > > One issue with this is the fact that pgtable_bytes is usually off > by a few kilobytes, but the more severe problem is that for a failed > fork or clone the free_pgtables() function is not called. In this case > there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with > the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context(). > The pgtable_bytes will be off by 16384 or 32768 bytes and we get the > BUG message. The message itself is purely cosmetic, but annoying. > > To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded > function to check for the true size of the address space. > I can confirm that it works to the problem, the warning message is gone after applying this patch on s390x. And I also done ltp syscalls/cve test for the patch set on x86_64 arch, there has no new regression. Tested-by: Li Wang <liwang@redhat.com> > Reported-by: Li Wang <liwang@redhat.com> > Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> > --- > arch/s390/include/asm/mmu_context.h | 5 ----- > arch/s390/include/asm/pgalloc.h | 6 +++--- > arch/s390/include/asm/pgtable.h | 18 ++++++++++++++++++ > arch/s390/include/asm/tlb.h | 6 +++--- > 4 files changed, 24 insertions(+), 11 deletions(-) > > diff --git a/arch/s390/include/asm/mmu_context.h > b/arch/s390/include/asm/mmu_context.h > index 0717ee76885d..f1ab9420ccfb 100644 > --- a/arch/s390/include/asm/mmu_context.h > +++ b/arch/s390/include/asm/mmu_context.h > @@ -45,8 +45,6 @@ static inline int init_new_context(struct task_struct > *tsk, > mm->context.asce_limit = STACK_TOP_MAX; > mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | > _ASCE_USER_BITS | _ASCE_TYPE_REGION3; > - /* pgd_alloc() did not account this pud */ > - mm_inc_nr_puds(mm); > break; > case -PAGE_SIZE: > /* forked 5-level task, set new asce with new_mm->pgd */ > @@ -62,9 +60,6 @@ static inline int init_new_context(struct task_struct > *tsk, > /* forked 2-level compat task, set new asce with new > mm->pgd */ > mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | > _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; > - /* pgd_alloc() did not account this pmd */ > - mm_inc_nr_pmds(mm); > - mm_inc_nr_puds(mm); > } > crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); > return 0; > diff --git a/arch/s390/include/asm/pgalloc.h > b/arch/s390/include/asm/pgalloc.h > index f0f9bcf94c03..5ee733720a57 100644 > --- a/arch/s390/include/asm/pgalloc.h > +++ b/arch/s390/include/asm/pgalloc.h > @@ -36,11 +36,11 @@ static inline void crst_table_init(unsigned long > *crst, unsigned long entry) > > static inline unsigned long pgd_entry_type(struct mm_struct *mm) > { > - if (mm->context.asce_limit <= _REGION3_SIZE) > + if (mm_pmd_folded(mm)) > return _SEGMENT_ENTRY_EMPTY; > - if (mm->context.asce_limit <= _REGION2_SIZE) > + if (mm_pud_folded(mm)) > return _REGION3_ENTRY_EMPTY; > - if (mm->context.asce_limit <= _REGION1_SIZE) > + if (mm_p4d_folded(mm)) > return _REGION2_ENTRY_EMPTY; > return _REGION1_ENTRY_EMPTY; > } > diff --git a/arch/s390/include/asm/pgtable.h > b/arch/s390/include/asm/pgtable.h > index 0e7cb0dc9c33..de05466ce50c 100644 > --- a/arch/s390/include/asm/pgtable.h > +++ b/arch/s390/include/asm/pgtable.h > @@ -485,6 +485,24 @@ static inline int is_module_addr(void *addr) > _REGION_ENTRY_PROTECT | \ > _REGION_ENTRY_NOEXEC) > > +static inline bool mm_p4d_folded(struct mm_struct *mm) > +{ > + return mm->context.asce_limit <= _REGION1_SIZE; > +} > +#define mm_p4d_folded(mm) mm_p4d_folded(mm) > + > +static inline bool mm_pud_folded(struct mm_struct *mm) > +{ > + return mm->context.asce_limit <= _REGION2_SIZE; > +} > +#define mm_pud_folded(mm) mm_pud_folded(mm) > + > +static inline bool mm_pmd_folded(struct mm_struct *mm) > +{ > + return mm->context.asce_limit <= _REGION3_SIZE; > +} > +#define mm_pmd_folded(mm) mm_pmd_folded(mm) > + > static inline int mm_has_pgste(struct mm_struct *mm) > { > #ifdef CONFIG_PGSTE > diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h > index 457b7ba0fbb6..b31c779cf581 100644 > --- a/arch/s390/include/asm/tlb.h > +++ b/arch/s390/include/asm/tlb.h > @@ -136,7 +136,7 @@ static inline void pte_free_tlb(struct mmu_gather > *tlb, pgtable_t pte, > static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, > unsigned long address) > { > - if (tlb->mm->context.asce_limit <= _REGION3_SIZE) > + if (mm_pmd_folded(tlb->mm)) > return; > pgtable_pmd_page_dtor(virt_to_page(pmd)); > tlb_remove_table(tlb, pmd); > @@ -152,7 +152,7 @@ static inline void pmd_free_tlb(struct mmu_gather > *tlb, pmd_t *pmd, > static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, > unsigned long address) > { > - if (tlb->mm->context.asce_limit <= _REGION1_SIZE) > + if (mm_p4d_folded(tlb->mm)) > return; > tlb_remove_table(tlb, p4d); > } > @@ -167,7 +167,7 @@ static inline void p4d_free_tlb(struct mmu_gather > *tlb, p4d_t *p4d, > static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, > unsigned long address) > { > - if (tlb->mm->context.asce_limit <= _REGION2_SIZE) > + if (mm_pud_folded(tlb->mm)) > return; > tlb_remove_table(tlb, pud); > } > -- > 2.16.4 > >
On Wed, 31 Oct 2018 14:18:33 +0800 Li Wang <liwang@redhat.com> wrote: > On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky <schwidefsky@de.ibm.com > > wrote: > > > In case a fork or a clone system fails in copy_process and the error > > handling does the mmput() at the bad_fork_cleanup_mm label, the > > following warning messages will appear on the console: > > > > BUG: non-zero pgtables_bytes on freeing mm: 16384 > > > > The reason for that is the tricks we play with mm_inc_nr_puds() and > > mm_inc_nr_pmds() in init_new_context(). > > > > A normal 64-bit process has 3 levels of page table, the p4d level and > > the pud level are folded. On process termination the free_pud_range() > > function in mm/memory.c will subtract 16KB from pgtable_bytes with a > > mm_dec_nr_puds() call, but there actually is not really a pud table. > > > > One issue with this is the fact that pgtable_bytes is usually off > > by a few kilobytes, but the more severe problem is that for a failed > > fork or clone the free_pgtables() function is not called. In this case > > there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with > > the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context(). > > The pgtable_bytes will be off by 16384 or 32768 bytes and we get the > > BUG message. The message itself is purely cosmetic, but annoying. > > > > To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded > > function to check for the true size of the address space. > > > > I can confirm that it works to the problem, the warning message is gone > after applying this patch on s390x. And I also done ltp syscalls/cve test > for the patch set on x86_64 arch, there has no new regression. > > Tested-by: Li Wang <liwang@redhat.com> Thanks for testing. Unfortunately Heiko reported another issue yesterday with the patch applied. This time the other way around: BUG: non-zero pgtables_bytes on freeing mm: -16384 I am trying to understand how this can happen. For now I would like to keep the patch on hold in case they need another change.
On Wed, Oct 31, 2018 at 2:31 PM, Martin Schwidefsky <schwidefsky@de.ibm.com> wrote: > On Wed, 31 Oct 2018 14:18:33 +0800 > Li Wang <liwang@redhat.com> wrote: > > > On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky < > schwidefsky@de.ibm.com > > > wrote: > > > > > In case a fork or a clone system fails in copy_process and the error > > > handling does the mmput() at the bad_fork_cleanup_mm label, the > > > following warning messages will appear on the console: > > > > > > BUG: non-zero pgtables_bytes on freeing mm: 16384 > > > > > > The reason for that is the tricks we play with mm_inc_nr_puds() and > > > mm_inc_nr_pmds() in init_new_context(). > > > > > > A normal 64-bit process has 3 levels of page table, the p4d level and > > > the pud level are folded. On process termination the free_pud_range() > > > function in mm/memory.c will subtract 16KB from pgtable_bytes with a > > > mm_dec_nr_puds() call, but there actually is not really a pud table. > > > > > > One issue with this is the fact that pgtable_bytes is usually off > > > by a few kilobytes, but the more severe problem is that for a failed > > > fork or clone the free_pgtables() function is not called. In this case > > > there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with > > > the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context(). > > > The pgtable_bytes will be off by 16384 or 32768 bytes and we get the > > > BUG message. The message itself is purely cosmetic, but annoying. > > > > > > To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded > > > function to check for the true size of the address space. > > > > > > > I can confirm that it works to the problem, the warning message is gone > > after applying this patch on s390x. And I also done ltp syscalls/cve test > > for the patch set on x86_64 arch, there has no new regression. > > > > Tested-by: Li Wang <liwang@redhat.com> > > Thanks for testing. Unfortunately Heiko reported another issue yesterday > with the patch applied. This time the other way around: > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > Okay, the problem is still triggered by LTP/cve-2017-17052.c? I tried this patch on my platform and it works! My test environment as: # lscpu Architecture: s390x CPU op-mode(s): 32-bit, 64-bit Byte Order: Big Endian CPU(s): 2 On-line CPU(s) list: 0,1 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s) per book: 1 Book(s) per drawer: 1 Drawer(s): 2 Vendor ID: IBM/S390 Machine type: 2827 CPU dynamic MHz: 5504 CPU static MHz: 5504 BogoMIPS: 2913.00 Hypervisor vendor: vertical Virtualization type: full Dispatching mode: horizontal L1d cache: 96K L1i cache: 64K L2d cache: 1024K L2i cache: 1024K L3 cache: 49152K L4 cache: 393216K Flags: esan3 zarch stfle msa ldisp eimm dfp edat etf3eh highgprs te sie > I am trying to understand how this can happen. For now I would like to > keep the patch on hold in case they need another change. > Sure. > > -- > blue skies, > Martin. > > "Reality continues to ruin my life." - Calvin. > >
On Wed, 31 Oct 2018 14:43:38 +0800 Li Wang <liwang@redhat.com> wrote: > On Wed, Oct 31, 2018 at 2:31 PM, Martin Schwidefsky <schwidefsky@de.ibm.com> > wrote: > > > On Wed, 31 Oct 2018 14:18:33 +0800 > > Li Wang <liwang@redhat.com> wrote: > > > > > On Tue, Oct 16, 2018 at 12:42 AM, Martin Schwidefsky < > > schwidefsky@de.ibm.com > > > > wrote: > > > > > > > In case a fork or a clone system fails in copy_process and the error > > > > handling does the mmput() at the bad_fork_cleanup_mm label, the > > > > following warning messages will appear on the console: > > > > > > > > BUG: non-zero pgtables_bytes on freeing mm: 16384 > > > > > > > > The reason for that is the tricks we play with mm_inc_nr_puds() and > > > > mm_inc_nr_pmds() in init_new_context(). > > > > > > > > A normal 64-bit process has 3 levels of page table, the p4d level and > > > > the pud level are folded. On process termination the free_pud_range() > > > > function in mm/memory.c will subtract 16KB from pgtable_bytes with a > > > > mm_dec_nr_puds() call, but there actually is not really a pud table. > > > > > > > > One issue with this is the fact that pgtable_bytes is usually off > > > > by a few kilobytes, but the more severe problem is that for a failed > > > > fork or clone the free_pgtables() function is not called. In this case > > > > there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with > > > > the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context(). > > > > The pgtable_bytes will be off by 16384 or 32768 bytes and we get the > > > > BUG message. The message itself is purely cosmetic, but annoying. > > > > > > > > To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded > > > > function to check for the true size of the address space. > > > > > > > > > > I can confirm that it works to the problem, the warning message is gone > > > after applying this patch on s390x. And I also done ltp syscalls/cve test > > > for the patch set on x86_64 arch, there has no new regression. > > > > > > Tested-by: Li Wang <liwang@redhat.com> > > > > Thanks for testing. Unfortunately Heiko reported another issue yesterday > > with the patch applied. This time the other way around: > > > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > > > > Okay, the problem is still triggered by LTP/cve-2017-17052.c? No, unfortunately we do not have a simple testcase to trigger this new bug. It happened once with one of our test kernels, the path that leads to this is completely unclear.
On Wed, 31 Oct 2018 07:46:47 +0100 Martin Schwidefsky <schwidefsky@de.ibm.com> wrote: > On Wed, 31 Oct 2018 14:43:38 +0800 > Li Wang <liwang@redhat.com> wrote: > > > On Wed, Oct 31, 2018 at 2:31 PM, Martin Schwidefsky <schwidefsky@de.ibm.com> > > wrote: > > > > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > > > > > > > Okay, the problem is still triggered by LTP/cve-2017-17052.c? > > No, unfortunately we do not have a simple testcase to trigger this new bug. > It happened once with one of our test kernels, the path that leads to this > is completely unclear. Ok, got it. There is a mm_inc_nr_puds(mm) missing in the s390 code: diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 76d89ee8b428..814f26520aa2 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -101,6 +101,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) mm->context.asce_limit = _REGION1_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); } else { crst_table_init(table, _REGION1_ENTRY_EMPTY); pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); One of our test-cases did an upgrade of a 3-level page table. I'll update the patch and send a v3.
On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote: > Thanks for testing. Unfortunately Heiko reported another issue yesterday > with the patch applied. This time the other way around: > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > > I am trying to understand how this can happen. For now I would like to > keep the patch on hold in case they need another change. FWIW, Kirill: is there a reason why this "BUG:" output is done with pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants? That would to get more information with DEBUG_VM and / or panic_on_warn=1 set. At least for automated testing it would be nice to have such triggers.
On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote: > On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote: > > Thanks for testing. Unfortunately Heiko reported another issue yesterday > > with the patch applied. This time the other way around: > > > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > > > > I am trying to understand how this can happen. For now I would like to > > keep the patch on hold in case they need another change. > > FWIW, Kirill: is there a reason why this "BUG:" output is done with > pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants? > > That would to get more information with DEBUG_VM and / or > panic_on_warn=1 set. At least for automated testing it would be nice > to have such triggers. Stack trace is not helpful there. It will always show the exit path which is useless.
On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote: > On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote: > > On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote: > > > Thanks for testing. Unfortunately Heiko reported another issue yesterday > > > with the patch applied. This time the other way around: > > > > > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > > > > > > I am trying to understand how this can happen. For now I would like to > > > keep the patch on hold in case they need another change. > > > > FWIW, Kirill: is there a reason why this "BUG:" output is done with > > pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants? > > > > That would to get more information with DEBUG_VM and / or > > panic_on_warn=1 set. At least for automated testing it would be nice > > to have such triggers. > > Stack trace is not helpful there. It will always show the exit path which > is useless. So, even with the updated version of these patches I can flood dmesg and the console with BUG: non-zero pgtables_bytes on freeing mm: 16384 messages with this complex reproducer on s390: echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat Besides that this needs to be fixed, I'd really like to see this changed to either a printk_once() or a WARN_ON_ONCE() within check_mm() so that an arbitrary user cannot flood the console. E.g. something like the below. If there aren't any objections, I will provide a proper patch with changelog, etc. diff --git a/kernel/fork.c b/kernel/fork.c index 07cddff89c7b..d7aeec03c57f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm) } if (mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); + printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
On Tue, Nov 27, 2018 at 08:34:12AM +0100, Heiko Carstens wrote: > On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote: > > On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote: > > > On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote: > > > > Thanks for testing. Unfortunately Heiko reported another issue yesterday > > > > with the patch applied. This time the other way around: > > > > > > > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > > > > > > > > I am trying to understand how this can happen. For now I would like to > > > > keep the patch on hold in case they need another change. > > > > > > FWIW, Kirill: is there a reason why this "BUG:" output is done with > > > pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants? > > > > > > That would to get more information with DEBUG_VM and / or > > > panic_on_warn=1 set. At least for automated testing it would be nice > > > to have such triggers. > > > > Stack trace is not helpful there. It will always show the exit path which > > is useless. > > So, even with the updated version of these patches I can flood dmesg > and the console with > > BUG: non-zero pgtables_bytes on freeing mm: 16384 > > messages with this complex reproducer on s390: > > echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat > > Besides that this needs to be fixed, I'd really like to see this > changed to either a printk_once() or a WARN_ON_ONCE() within > check_mm() so that an arbitrary user cannot flood the console. > > E.g. something like the below. If there aren't any objections, I will > provide a proper patch with changelog, etc. > > diff --git a/kernel/fork.c b/kernel/fork.c > index 07cddff89c7b..d7aeec03c57f 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm) > } > > if (mm_pgtables_bytes(mm)) > - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > - mm_pgtables_bytes(mm)); > + printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > + mm_pgtables_bytes(mm)); You can be the first user of pr_alert_once(). Don't miss a chance! ;)
On Tue, Nov 27, 2018 at 11:05:15AM +0300, Kirill A. Shutemov wrote: > > E.g. something like the below. If there aren't any objections, I will > > provide a proper patch with changelog, etc. > > > > diff --git a/kernel/fork.c b/kernel/fork.c > > index 07cddff89c7b..d7aeec03c57f 100644 > > --- a/kernel/fork.c > > +++ b/kernel/fork.c > > @@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm) > > } > > > > if (mm_pgtables_bytes(mm)) > > - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > > - mm_pgtables_bytes(mm)); > > + printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > > + mm_pgtables_bytes(mm)); > > You can be the first user of pr_alert_once(). Don't miss a chance! ;) I didn't expect that that one exists. ;) Will do.
On 11/26/18 11:34 PM, Heiko Carstens wrote: > On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote: >> On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote: >>> On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote: >>>> Thanks for testing. Unfortunately Heiko reported another issue yesterday >>>> with the patch applied. This time the other way around: >>>> >>>> BUG: non-zero pgtables_bytes on freeing mm: -16384 >>>> >>>> I am trying to understand how this can happen. For now I would like to >>>> keep the patch on hold in case they need another change. >>> >>> FWIW, Kirill: is there a reason why this "BUG:" output is done with >>> pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants? >>> >>> That would to get more information with DEBUG_VM and / or >>> panic_on_warn=1 set. At least for automated testing it would be nice >>> to have such triggers. >> >> Stack trace is not helpful there. It will always show the exit path which >> is useless. > > So, even with the updated version of these patches I can flood dmesg > and the console with > > BUG: non-zero pgtables_bytes on freeing mm: 16384 > > messages with this complex reproducer on s390: > > echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat > > Besides that this needs to be fixed, I'd really like to see this > changed to either a printk_once() or a WARN_ON_ONCE() within > check_mm() so that an arbitrary user cannot flood the console. > > E.g. something like the below. If there aren't any objections, I will > provide a proper patch with changelog, etc. > > diff --git a/kernel/fork.c b/kernel/fork.c > index 07cddff89c7b..d7aeec03c57f 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm) > } > > if (mm_pgtables_bytes(mm)) > - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > - mm_pgtables_bytes(mm)); > + printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > + mm_pgtables_bytes(mm)); > pr_alert_once ? Guenter > #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS > VM_BUG_ON_MM(mm->pmd_huge_pte, mm); > >
On Tue, Nov 27, 2018 at 03:47:13AM -0800, Guenter Roeck wrote: > >E.g. something like the below. If there aren't any objections, I will > >provide a proper patch with changelog, etc. > > > >diff --git a/kernel/fork.c b/kernel/fork.c > >index 07cddff89c7b..d7aeec03c57f 100644 > >--- a/kernel/fork.c > >+++ b/kernel/fork.c > >@@ -647,8 +647,8 @@ static void check_mm(struct mm_struct *mm) > > } > > if (mm_pgtables_bytes(mm)) > >- pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > >- mm_pgtables_bytes(mm)); > >+ printk_once(KERN_ALERT "BUG: non-zero pgtables_bytes on freeing mm: %ld\n", > >+ mm_pgtables_bytes(mm)); > > pr_alert_once ? Already changed and posted: https://lore.kernel.org/lkml/20181127083603.39041-1-heiko.carstens@de.ibm.com/
On Tue, 27 Nov 2018 08:34:12 +0100 Heiko Carstens <heiko.carstens@de.ibm.com> wrote: > On Wed, Oct 31, 2018 at 01:36:23PM +0300, Kirill A. Shutemov wrote: > > On Wed, Oct 31, 2018 at 11:09:44AM +0100, Heiko Carstens wrote: > > > On Wed, Oct 31, 2018 at 07:31:49AM +0100, Martin Schwidefsky wrote: > > > > Thanks for testing. Unfortunately Heiko reported another issue yesterday > > > > with the patch applied. This time the other way around: > > > > > > > > BUG: non-zero pgtables_bytes on freeing mm: -16384 > > > > > > > > I am trying to understand how this can happen. For now I would like to > > > > keep the patch on hold in case they need another change. > > > > > > FWIW, Kirill: is there a reason why this "BUG:" output is done with > > > pr_alert() and not with VM_BUG_ON() or one of the WARN*() variants? > > > > > > That would to get more information with DEBUG_VM and / or > > > panic_on_warn=1 set. At least for automated testing it would be nice > > > to have such triggers. > > > > Stack trace is not helpful there. It will always show the exit path which > > is useless. > > So, even with the updated version of these patches I can flood dmesg > and the console with > > BUG: non-zero pgtables_bytes on freeing mm: 16384 > > messages with this complex reproducer on s390: > > echo "void main(void) {}" | gcc -m31 -xc -o compat - && ./compat Forgot a hunk in the fix.. I claim not enough coffee :-/ Patch is queued and I will send a please pull by the end of the week. -- From c0499f2aa853939984ecaf0d393012486e56c7ce Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky <schwidefsky@de.ibm.com> Date: Tue, 27 Nov 2018 14:04:04 +0100 Subject: [PATCH] s390/mm: correct pgtable_bytes on page table downgrade The downgrade of a page table from 3 levels to 2 levels for a 31-bit compat process removes a pmd table which has to be counted against pgtable_bytes. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- arch/s390/mm/pgalloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 814f26520aa2..6791562779ee 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -131,6 +131,7 @@ void crst_table_downgrade(struct mm_struct *mm) } pgd = mm->pgd; + mm_dec_nr_pmds(mm); mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); mm->context.asce_limit = _REGION3_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 0717ee76885d..f1ab9420ccfb 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -45,8 +45,6 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce_limit = STACK_TOP_MAX; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; - /* pgd_alloc() did not account this pud */ - mm_inc_nr_puds(mm); break; case -PAGE_SIZE: /* forked 5-level task, set new asce with new_mm->pgd */ @@ -62,9 +60,6 @@ static inline int init_new_context(struct task_struct *tsk, /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - /* pgd_alloc() did not account this pmd */ - mm_inc_nr_pmds(mm); - mm_inc_nr_puds(mm); } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); return 0; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index f0f9bcf94c03..5ee733720a57 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -36,11 +36,11 @@ static inline void crst_table_init(unsigned long *crst, unsigned long entry) static inline unsigned long pgd_entry_type(struct mm_struct *mm) { - if (mm->context.asce_limit <= _REGION3_SIZE) + if (mm_pmd_folded(mm)) return _SEGMENT_ENTRY_EMPTY; - if (mm->context.asce_limit <= _REGION2_SIZE) + if (mm_pud_folded(mm)) return _REGION3_ENTRY_EMPTY; - if (mm->context.asce_limit <= _REGION1_SIZE) + if (mm_p4d_folded(mm)) return _REGION2_ENTRY_EMPTY; return _REGION1_ENTRY_EMPTY; } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0e7cb0dc9c33..de05466ce50c 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -485,6 +485,24 @@ static inline int is_module_addr(void *addr) _REGION_ENTRY_PROTECT | \ _REGION_ENTRY_NOEXEC) +static inline bool mm_p4d_folded(struct mm_struct *mm) +{ + return mm->context.asce_limit <= _REGION1_SIZE; +} +#define mm_p4d_folded(mm) mm_p4d_folded(mm) + +static inline bool mm_pud_folded(struct mm_struct *mm) +{ + return mm->context.asce_limit <= _REGION2_SIZE; +} +#define mm_pud_folded(mm) mm_pud_folded(mm) + +static inline bool mm_pmd_folded(struct mm_struct *mm) +{ + return mm->context.asce_limit <= _REGION3_SIZE; +} +#define mm_pmd_folded(mm) mm_pmd_folded(mm) + static inline int mm_has_pgste(struct mm_struct *mm) { #ifdef CONFIG_PGSTE diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 457b7ba0fbb6..b31c779cf581 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -136,7 +136,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { - if (tlb->mm->context.asce_limit <= _REGION3_SIZE) + if (mm_pmd_folded(tlb->mm)) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); tlb_remove_table(tlb, pmd); @@ -152,7 +152,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - if (tlb->mm->context.asce_limit <= _REGION1_SIZE) + if (mm_p4d_folded(tlb->mm)) return; tlb_remove_table(tlb, p4d); } @@ -167,7 +167,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { - if (tlb->mm->context.asce_limit <= _REGION2_SIZE) + if (mm_pud_folded(tlb->mm)) return; tlb_remove_table(tlb, pud); }
In case a fork or a clone system fails in copy_process and the error handling does the mmput() at the bad_fork_cleanup_mm label, the following warning messages will appear on the console: BUG: non-zero pgtables_bytes on freeing mm: 16384 The reason for that is the tricks we play with mm_inc_nr_puds() and mm_inc_nr_pmds() in init_new_context(). A normal 64-bit process has 3 levels of page table, the p4d level and the pud level are folded. On process termination the free_pud_range() function in mm/memory.c will subtract 16KB from pgtable_bytes with a mm_dec_nr_puds() call, but there actually is not really a pud table. One issue with this is the fact that pgtable_bytes is usually off by a few kilobytes, but the more severe problem is that for a failed fork or clone the free_pgtables() function is not called. In this case there is no mm_dec_nr_puds() or mm_dec_nr_pmds() that go together with the mm_inc_nr_puds() and mm_inc_nr_pmds in init_new_context(). The pgtable_bytes will be off by 16384 or 32768 bytes and we get the BUG message. The message itself is purely cosmetic, but annoying. To fix this override the mm_pmd_folded, mm_pud_folded and mm_p4d_folded function to check for the true size of the address space. Reported-by: Li Wang <liwang@redhat.com> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- arch/s390/include/asm/mmu_context.h | 5 ----- arch/s390/include/asm/pgalloc.h | 6 +++--- arch/s390/include/asm/pgtable.h | 18 ++++++++++++++++++ arch/s390/include/asm/tlb.h | 6 +++--- 4 files changed, 24 insertions(+), 11 deletions(-)