diff mbox series

[v1] mm, pagemap: expose hwpoison entry

Message ID 20211004115001.1544259-1-naoya.horiguchi@linux.dev (mailing list archive)
State New
Headers show
Series [v1] mm, pagemap: expose hwpoison entry | expand

Commit Message

Naoya Horiguchi Oct. 4, 2021, 11:50 a.m. UTC
From: Naoya Horiguchi <naoya.horiguchi@nec.com>

A hwpoison entry is a non-present page table entry to report
memory error events to userspace. If we have an easy way to know
which processes have hwpoison entries, that might be useful for
user processes to take proper actions. But we don't have it now.
So make pagemap interface expose hwpoison entries to userspace.

Hwpoison entry for hugepage is also exposed by this patch. The below
example shows how pagemap is visible in the case where a memory error
hit a hugepage mapped to a process.

    $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
    voffset offset  len     flags
    700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
    700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
    700000200       12f800  1       __________B________X_______________f______w_
    700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
    700000202       12f802  1fe     __________B________X_______________f______w_

The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
flag) are considered as hwpoison entries.  So all pages in 2MB range
are inaccessible from the process.  We can get actual error location
by page-types in physical address mode.

    $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
    offset  len     flags
    12f800  1       __________B_________________________________
    12f801  1       ___________________X________________________
    12f802  1fe     __________B_________________________________

Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
---
 fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
 include/linux/swapops.h | 13 +++++++++++++
 tools/vm/page-types.c   |  7 ++++++-
 3 files changed, 51 insertions(+), 10 deletions(-)

Comments

David Hildenbrand Oct. 4, 2021, 11:55 a.m. UTC | #1
On 04.10.21 13:50, Naoya Horiguchi wrote:
> From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> 
> A hwpoison entry is a non-present page table entry to report
> memory error events to userspace. If we have an easy way to know
> which processes have hwpoison entries, that might be useful for
> user processes to take proper actions. But we don't have it now.
> So make pagemap interface expose hwpoison entries to userspace.

Noting that this is only a way to inspect hwpoison set for private 
anonymous memory. You cannot really identify anything related to shared 
memory.

Do you also handle private hugetlb pages?

> 
> Hwpoison entry for hugepage is also exposed by this patch. The below
> example shows how pagemap is visible in the case where a memory error
> hit a hugepage mapped to a process.
> 
>      $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
>      voffset offset  len     flags
>      700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
>      700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
>      700000200       12f800  1       __________B________X_______________f______w_
>      700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
>      700000202       12f802  1fe     __________B________X_______________f______w_
> 
> The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
> flag) are considered as hwpoison entries.  So all pages in 2MB range
> are inaccessible from the process.  We can get actual error location
> by page-types in physical address mode.
> 
>      $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
>      offset  len     flags
>      12f800  1       __________B_________________________________
>      12f801  1       ___________________X________________________
>      12f802  1fe     __________B_________________________________
> 
> Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> ---
>   fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
>   include/linux/swapops.h | 13 +++++++++++++
>   tools/vm/page-types.c   |  7 ++++++-
>   3 files changed, 51 insertions(+), 10 deletions(-)


Please also update the documentation located at

Documentation/admin-guide/mm/pagemap.rst
Naoya Horiguchi Oct. 4, 2021, 2:32 p.m. UTC | #2
On Mon, Oct 04, 2021 at 01:55:30PM +0200, David Hildenbrand wrote:
> On 04.10.21 13:50, Naoya Horiguchi wrote:
> > From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > 
> > A hwpoison entry is a non-present page table entry to report
> > memory error events to userspace. If we have an easy way to know
> > which processes have hwpoison entries, that might be useful for
> > user processes to take proper actions. But we don't have it now.
> > So make pagemap interface expose hwpoison entries to userspace.
> 
> Noting that this is only a way to inspect hwpoison set for private anonymous
> memory. You cannot really identify anything related to shared memory.
> 
> Do you also handle private hugetlb pages?

I think yes.  As long as hugepages are mmap()ed, we should be able to
identify them with hwpoison entry (even if used via private/shared mapping).

> 
> > 
> > Hwpoison entry for hugepage is also exposed by this patch. The below
> > example shows how pagemap is visible in the case where a memory error
> > hit a hugepage mapped to a process.
> > 
> >      $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
> >      voffset offset  len     flags
> >      700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
> >      700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
> >      700000200       12f800  1       __________B________X_______________f______w_
> >      700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
> >      700000202       12f802  1fe     __________B________X_______________f______w_
> > 
> > The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
> > flag) are considered as hwpoison entries.  So all pages in 2MB range
> > are inaccessible from the process.  We can get actual error location
> > by page-types in physical address mode.
> > 
> >      $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
> >      offset  len     flags
> >      12f800  1       __________B_________________________________
> >      12f801  1       ___________________X________________________
> >      12f802  1fe     __________B_________________________________
> > 
> > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > ---
> >   fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
> >   include/linux/swapops.h | 13 +++++++++++++
> >   tools/vm/page-types.c   |  7 ++++++-
> >   3 files changed, 51 insertions(+), 10 deletions(-)
> 
> 
> Please also update the documentation located at
> 
> Documentation/admin-guide/mm/pagemap.rst

I will do this in the next post.

Thanks,
Naoya Horigcuhi
kernel test robot Oct. 4, 2021, 8:17 p.m. UTC | #3
Hi Naoya,

I love your patch! Perhaps something to improve:

[auto build test WARNING on hnaz-mm/master]

url:    https://github.com/0day-ci/linux/commits/Naoya-Horiguchi/mm-pagemap-expose-hwpoison-entry/20211004-195200
base:   https://github.com/hnaz/linux-mm master
config: riscv-randconfig-r035-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project dec2257f354d39dbd8232e6bd1a417d91c4f14a2)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/0day-ci/linux/commit/f1735ae85d981e413e70ff0041fcb7f775525699
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Naoya-Horiguchi/mm-pagemap-expose-hwpoison-entry/20211004-195200
        git checkout f1735ae85d981e413e70ff0041fcb7f775525699
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> fs/proc/task_mmu.c:1561:4: warning: add explicit braces to avoid dangling else [-Wdangling-else]
                           else if (flags & PM_SWAP)
                           ^
   1 warning generated.


vim +1561 fs/proc/task_mmu.c

  1505	
  1506	#ifdef CONFIG_HUGETLB_PAGE
  1507	/* This function walks within one hugetlb entry in the single call */
  1508	static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
  1509					 unsigned long addr, unsigned long end,
  1510					 struct mm_walk *walk)
  1511	{
  1512		struct pagemapread *pm = walk->private;
  1513		struct vm_area_struct *vma = walk->vma;
  1514		u64 flags = 0, frame = 0;
  1515		int err = 0;
  1516		pte_t pte;
  1517		struct page *page = NULL;
  1518	
  1519		if (vma->vm_flags & VM_SOFTDIRTY)
  1520			flags |= PM_SOFT_DIRTY;
  1521	
  1522		pte = huge_ptep_get(ptep);
  1523		if (pte_present(pte)) {
  1524			page = pte_page(pte);
  1525	
  1526			flags |= PM_PRESENT;
  1527			if (pm->show_pfn)
  1528				frame = pte_pfn(pte) +
  1529					((addr & ~hmask) >> PAGE_SHIFT);
  1530		} else if (is_swap_pte(pte)) {
  1531			swp_entry_t entry = pte_to_swp_entry(pte);
  1532			unsigned long offset;
  1533	
  1534			if (pm->show_pfn) {
  1535				offset = swp_offset(entry) +
  1536					((addr & ~hmask) >> PAGE_SHIFT);
  1537				frame = swp_type(entry) |
  1538					(offset << MAX_SWAPFILES_SHIFT);
  1539			}
  1540			flags |= PM_SWAP;
  1541			if (is_migration_entry(entry))
  1542				page = compound_head(pfn_swap_entry_to_page(entry));
  1543			if (is_hwpoison_entry(entry))
  1544				flags |= PM_HWPOISON;
  1545		}
  1546	
  1547		if (page && !PageAnon(page))
  1548			flags |= PM_FILE;
  1549		if (page && page_mapcount(page) == 1)
  1550			flags |= PM_MMAP_EXCLUSIVE;
  1551	
  1552		for (; addr != end; addr += PAGE_SIZE) {
  1553			pagemap_entry_t pme = make_pme(frame, flags);
  1554	
  1555			err = add_to_pagemap(addr, &pme, pm);
  1556			if (err)
  1557				return err;
  1558			if (pm->show_pfn)
  1559				if (flags & PM_PRESENT)
  1560					frame++;
> 1561				else if (flags & PM_SWAP)
  1562					frame += (1 << MAX_SWAPFILES_SHIFT);
  1563		}
  1564	
  1565		cond_resched();
  1566	
  1567		return err;
  1568	}
  1569	#else
  1570	#define pagemap_hugetlb_range	NULL
  1571	#endif /* HUGETLB_PAGE */
  1572	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Oct. 5, 2021, 2:53 a.m. UTC | #4
Hi Naoya,

I love your patch! Perhaps something to improve:

[auto build test WARNING on hnaz-mm/master]

url:    https://github.com/0day-ci/linux/commits/Naoya-Horiguchi/mm-pagemap-expose-hwpoison-entry/20211004-195200
base:   https://github.com/hnaz/linux-mm master
config: ia64-defconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/f1735ae85d981e413e70ff0041fcb7f775525699
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Naoya-Horiguchi/mm-pagemap-expose-hwpoison-entry/20211004-195200
        git checkout f1735ae85d981e413e70ff0041fcb7f775525699
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=ia64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   fs/proc/task_mmu.c: In function 'pagemap_hugetlb_range':
>> fs/proc/task_mmu.c:1558:20: warning: suggest explicit braces to avoid ambiguous 'else' [-Wdangling-else]
    1558 |                 if (pm->show_pfn)
         |                    ^


vim +/else +1558 fs/proc/task_mmu.c

  1505	
  1506	#ifdef CONFIG_HUGETLB_PAGE
  1507	/* This function walks within one hugetlb entry in the single call */
  1508	static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
  1509					 unsigned long addr, unsigned long end,
  1510					 struct mm_walk *walk)
  1511	{
  1512		struct pagemapread *pm = walk->private;
  1513		struct vm_area_struct *vma = walk->vma;
  1514		u64 flags = 0, frame = 0;
  1515		int err = 0;
  1516		pte_t pte;
  1517		struct page *page = NULL;
  1518	
  1519		if (vma->vm_flags & VM_SOFTDIRTY)
  1520			flags |= PM_SOFT_DIRTY;
  1521	
  1522		pte = huge_ptep_get(ptep);
  1523		if (pte_present(pte)) {
  1524			page = pte_page(pte);
  1525	
  1526			flags |= PM_PRESENT;
  1527			if (pm->show_pfn)
  1528				frame = pte_pfn(pte) +
  1529					((addr & ~hmask) >> PAGE_SHIFT);
  1530		} else if (is_swap_pte(pte)) {
  1531			swp_entry_t entry = pte_to_swp_entry(pte);
  1532			unsigned long offset;
  1533	
  1534			if (pm->show_pfn) {
  1535				offset = swp_offset(entry) +
  1536					((addr & ~hmask) >> PAGE_SHIFT);
  1537				frame = swp_type(entry) |
  1538					(offset << MAX_SWAPFILES_SHIFT);
  1539			}
  1540			flags |= PM_SWAP;
  1541			if (is_migration_entry(entry))
  1542				page = compound_head(pfn_swap_entry_to_page(entry));
  1543			if (is_hwpoison_entry(entry))
  1544				flags |= PM_HWPOISON;
  1545		}
  1546	
  1547		if (page && !PageAnon(page))
  1548			flags |= PM_FILE;
  1549		if (page && page_mapcount(page) == 1)
  1550			flags |= PM_MMAP_EXCLUSIVE;
  1551	
  1552		for (; addr != end; addr += PAGE_SIZE) {
  1553			pagemap_entry_t pme = make_pme(frame, flags);
  1554	
  1555			err = add_to_pagemap(addr, &pme, pm);
  1556			if (err)
  1557				return err;
> 1558			if (pm->show_pfn)
  1559				if (flags & PM_PRESENT)
  1560					frame++;
  1561				else if (flags & PM_SWAP)
  1562					frame += (1 << MAX_SWAPFILES_SHIFT);
  1563		}
  1564	
  1565		cond_resched();
  1566	
  1567		return err;
  1568	}
  1569	#else
  1570	#define pagemap_hugetlb_range	NULL
  1571	#endif /* HUGETLB_PAGE */
  1572	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Peter Xu Oct. 13, 2021, 2:49 a.m. UTC | #5
Hi, Naoya,

On Mon, Oct 04, 2021 at 08:50:01PM +0900, Naoya Horiguchi wrote:
> +static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
> +{
> +	struct page *p = pfn_to_page(swp_offset(entry));
> +
> +	WARN_ON(!PageHWPoison(p));
> +	return p;
> +}

This is more a pure question..

I'm wondering whether that WARN_ON() could trigger.

IOW, what if we poison an anonymous page and then unpoison it?  Will there be a
hwpoison swap entry leftover in the ptes that it used to map?  Will it crash
the program when the page is accessed?

I had a feeling that when handling the page fault in do_swap_page before we
SIGBUS the program, we should double-check the PageHWPoison on the pfn page,
but I could be missing something..

Thanks,
Naoya Horiguchi Oct. 14, 2021, 1:36 p.m. UTC | #6
On Wed, Oct 13, 2021 at 10:49:04AM +0800, Peter Xu wrote:
> Hi, Naoya,
> 
> On Mon, Oct 04, 2021 at 08:50:01PM +0900, Naoya Horiguchi wrote:
> > +static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
> > +{
> > +	struct page *p = pfn_to_page(swp_offset(entry));
> > +
> > +	WARN_ON(!PageHWPoison(p));
> > +	return p;
> > +}
> 
> This is more a pure question..
> 
> I'm wondering whether that WARN_ON() could trigger.
> 
> IOW, what if we poison an anonymous page and then unpoison it?  

Thanks for the good question, this could trigger WARN for unpoisoned pages.
The impact is limited because the caller of unpoison should know that that
happens in testing workload, but maybe there's no good reason to prevent
from it. So I'll drop this WARN_ON().

> Will there be a
> hwpoison swap entry leftover in the ptes that it used to map?  

Yes it will, unpoison never affects exisiting hwpoison swap entries.

> Will it crash
> the program when the page is accessed?

Reading hwpoison_entry_to_page() via pagemap interface should not crash
because it just reads the page's metadata.
The process with the hwpoison swap entry still receives SIGBUS when doing
page fault (irrespective of doing unpoison or not) on the error address.

> 
> I had a feeling that when handling the page fault in do_swap_page before we
> SIGBUS the program, we should double-check the PageHWPoison on the pfn page,
> but I could be missing something..

The double-checking seems to allow processes to detect that the hwpoison page
is unpoisoned, some test programs could benefit from it. But maybe it could
be done independent of this patch.

Personally, I only use unpoison in cleanup phase of each test case,
and each test case newly starts test processes, so reusing error pages
with unpoison can be avoided.

Thanks,
Naoya Horiguchi
Peter Xu Oct. 14, 2021, 11:32 p.m. UTC | #7
On Thu, Oct 14, 2021 at 10:36:44PM +0900, Naoya Horiguchi wrote:
> Thanks for the good question, this could trigger WARN for unpoisoned pages.
> The impact is limited because the caller of unpoison should know that that
> happens in testing workload, but maybe there's no good reason to prevent
> from it. So I'll drop this WARN_ON().

Sounds good, thanks.

> > I had a feeling that when handling the page fault in do_swap_page before we
> > SIGBUS the program, we should double-check the PageHWPoison on the pfn page,
> > but I could be missing something..
> 
> The double-checking seems to allow processes to detect that the hwpoison page
> is unpoisoned, some test programs could benefit from it. But maybe it could
> be done independent of this patch.
> 
> Personally, I only use unpoison in cleanup phase of each test case,
> and each test case newly starts test processes, so reusing error pages
> with unpoison can be avoided.

I see.  We don't have anything like soft-online a page, do we?

I also don't know whether hwpoison can be used in any other form besides
testing the hwpoison code path.  E.g. logically it can be used to forbid
accessing a physical page for any reason then recover using unhwpoison?

Uffd has the SIGBUS mode that can do similar thing to the virtual address
space, so any access to some page will generate a SIGBUS. While hwpoison is
phsical address space based from that pov, and just like uffd it doesn't need
to change vma layout which should be a good property unlike mprotect().  But I
totally have no clue on all these... so it's just wild idea.

Anyway, agreed on above, even if it's applicable it should be a separate patch.

Thanks,
Naoya Horiguchi Oct. 26, 2021, 11:27 p.m. UTC | #8
On Mon, Oct 04, 2021 at 11:32:28PM +0900, Naoya Horiguchi wrote:
> On Mon, Oct 04, 2021 at 01:55:30PM +0200, David Hildenbrand wrote:
> > On 04.10.21 13:50, Naoya Horiguchi wrote:
...
> > >
> > > Hwpoison entry for hugepage is also exposed by this patch. The below
> > > example shows how pagemap is visible in the case where a memory error
> > > hit a hugepage mapped to a process.
> > >
> > >      $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
> > >      voffset offset  len     flags
> > >      700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
> > >      700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
> > >      700000200       12f800  1       __________B________X_______________f______w_
> > >      700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
> > >      700000202       12f802  1fe     __________B________X_______________f______w_
> > >
> > > The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
> > > flag) are considered as hwpoison entries.  So all pages in 2MB range
> > > are inaccessible from the process.  We can get actual error location
> > > by page-types in physical address mode.
> > >
> > >      $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
> > >      offset  len     flags
> > >      12f800  1       __________B_________________________________
> > >      12f801  1       ___________________X________________________
> > >      12f802  1fe     __________B_________________________________
> > >
> > > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > > ---
> > >   fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
> > >   include/linux/swapops.h | 13 +++++++++++++
> > >   tools/vm/page-types.c   |  7 ++++++-
> > >   3 files changed, 51 insertions(+), 10 deletions(-)
> >
> >
> > Please also update the documentation located at
> >
> > Documentation/admin-guide/mm/pagemap.rst
>
> I will do this in the next post.

Reading the document, I found that swap type is already exported so we
could identify hwpoison entry with it (without new PM_HWPOISON bit).
One problem is that the format of swap types (like SWP_HWPOISON) depends
on a few config macros like CONFIG_DEVICE_PRIVATE and CONFIG_MIGRATION,
so we also need to export how the swap type field is interpreted.

I thought of adding new interfaces for example under /sys/kernel/mm/swap/type_format/,
which shows info like below (assuming that all CONFIG_{DEVICE_PRIVATE,MIGRATION,MEMORY_FAILURE}
is enabled):

  $ ls /sys/kernel/mm/swap/type_format/
  hwpoison
  migration_read
  migration_write
  device_write
  device_read
  device_exclusive_write
  device_exclusive_read
  
  $ cat /sys/kernel/mm/swap/type_format/hwpoison
  25
  
  $ cat /sys/kernel/mm/swap/type_format/device_write
  28

Does it make sense or any better approach?

Thanks,
Naoya Horiguchi
Peter Xu Oct. 27, 2021, 2:09 a.m. UTC | #9
On Wed, Oct 27, 2021 at 08:27:36AM +0900, Naoya Horiguchi wrote:
> On Mon, Oct 04, 2021 at 11:32:28PM +0900, Naoya Horiguchi wrote:
> > On Mon, Oct 04, 2021 at 01:55:30PM +0200, David Hildenbrand wrote:
> > > On 04.10.21 13:50, Naoya Horiguchi wrote:
> ...
> > > >
> > > > Hwpoison entry for hugepage is also exposed by this patch. The below
> > > > example shows how pagemap is visible in the case where a memory error
> > > > hit a hugepage mapped to a process.
> > > >
> > > >      $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
> > > >      voffset offset  len     flags
> > > >      700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
> > > >      700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
> > > >      700000200       12f800  1       __________B________X_______________f______w_
> > > >      700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
> > > >      700000202       12f802  1fe     __________B________X_______________f______w_
> > > >
> > > > The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
> > > > flag) are considered as hwpoison entries.  So all pages in 2MB range
> > > > are inaccessible from the process.  We can get actual error location
> > > > by page-types in physical address mode.
> > > >
> > > >      $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
> > > >      offset  len     flags
> > > >      12f800  1       __________B_________________________________
> > > >      12f801  1       ___________________X________________________
> > > >      12f802  1fe     __________B_________________________________
> > > >
> > > > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > > > ---
> > > >   fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
> > > >   include/linux/swapops.h | 13 +++++++++++++
> > > >   tools/vm/page-types.c   |  7 ++++++-
> > > >   3 files changed, 51 insertions(+), 10 deletions(-)
> > >
> > >
> > > Please also update the documentation located at
> > >
> > > Documentation/admin-guide/mm/pagemap.rst
> >
> > I will do this in the next post.
> 
> Reading the document, I found that swap type is already exported so we
> could identify hwpoison entry with it (without new PM_HWPOISON bit).
> One problem is that the format of swap types (like SWP_HWPOISON) depends
> on a few config macros like CONFIG_DEVICE_PRIVATE and CONFIG_MIGRATION,
> so we also need to export how the swap type field is interpreted.

I had similar question before.. though it was more on the generic swap entries
not the special ones yet.

The thing is I don't know how the userspace could interpret normal swap device
indexes out of reading pagemap, say if we have two swap devices with "swapon
-s" then I've no idea how do we know which device has which swap type index
allocated.  That seems to be a similar question asked above on special swap
types - the interface seems to be incomplete, if not unused at all.

AFAIU the information on "this page is swapped out to device X on offset Y" is
not reliable too, because the pagein/pageout from kernel is transparent to the
userspace and not under control of userspace at all.  IOW, if the user reads
that swap entry, then reads data upon the disk of that offset out and put it
somewhere else, then it means the data read could already be old if kernel
paged in the page after userspace reading the pagemap but before it reading the
disk, and I don't see any way to make it right unless the userspace could stop
the kernel from page-in a swap entry.  That's why I really wonder whether we
should expose normal swap entry at all, as I don't know how it could be helpful
and used in the 100% right way.

Special swap entries seem a bit different - at least for is_pfn_swap_entry()
typed swap entries we can still expose the PFN which might be helpful, which I
can't tell.

I used to send an email to Matt Mackall <mpm@selenic.com> and Dave Hansen
<dave.hansen@linux.intel.com> asking about above but didn't get a reply. Ccing
again this time with the list copied.

> 
> I thought of adding new interfaces for example under /sys/kernel/mm/swap/type_format/,
> which shows info like below (assuming that all CONFIG_{DEVICE_PRIVATE,MIGRATION,MEMORY_FAILURE}
> is enabled):
> 
>   $ ls /sys/kernel/mm/swap/type_format/
>   hwpoison
>   migration_read
>   migration_write
>   device_write
>   device_read
>   device_exclusive_write
>   device_exclusive_read
>   
>   $ cat /sys/kernel/mm/swap/type_format/hwpoison
>   25
>   
>   $ cat /sys/kernel/mm/swap/type_format/device_write
>   28
> 
> Does it make sense or any better approach?

Then I'm wondering whether we care about the rest of the normal swap devices
too with pagemap so do we need to expose some information there too (only if
there's a real use case, though..)?  Or... should we just don't expose swap
entries at all, at least generic swap entries?  We can still expose things like
hwpoison via PM_* bits well defined in that case.

Thanks,
Naoya Horiguchi Oct. 27, 2021, 6:45 a.m. UTC | #10
On Wed, Oct 27, 2021 at 10:09:03AM +0800, Peter Xu wrote:
> On Wed, Oct 27, 2021 at 08:27:36AM +0900, Naoya Horiguchi wrote:
> > On Mon, Oct 04, 2021 at 11:32:28PM +0900, Naoya Horiguchi wrote:
> > > On Mon, Oct 04, 2021 at 01:55:30PM +0200, David Hildenbrand wrote:
> > > > On 04.10.21 13:50, Naoya Horiguchi wrote:
> > ...
> > > > >
> > > > > Hwpoison entry for hugepage is also exposed by this patch. The below
> > > > > example shows how pagemap is visible in the case where a memory error
> > > > > hit a hugepage mapped to a process.
> > > > >
> > > > >      $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
> > > > >      voffset offset  len     flags
> > > > >      700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
> > > > >      700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
> > > > >      700000200       12f800  1       __________B________X_______________f______w_
> > > > >      700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
> > > > >      700000202       12f802  1fe     __________B________X_______________f______w_
> > > > >
> > > > > The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
> > > > > flag) are considered as hwpoison entries.  So all pages in 2MB range
> > > > > are inaccessible from the process.  We can get actual error location
> > > > > by page-types in physical address mode.
> > > > >
> > > > >      $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
> > > > >      offset  len     flags
> > > > >      12f800  1       __________B_________________________________
> > > > >      12f801  1       ___________________X________________________
> > > > >      12f802  1fe     __________B_________________________________
> > > > >
> > > > > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > > > > ---
> > > > >   fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
> > > > >   include/linux/swapops.h | 13 +++++++++++++
> > > > >   tools/vm/page-types.c   |  7 ++++++-
> > > > >   3 files changed, 51 insertions(+), 10 deletions(-)
> > > >
> > > >
> > > > Please also update the documentation located at
> > > >
> > > > Documentation/admin-guide/mm/pagemap.rst
> > >
> > > I will do this in the next post.
> > 
> > Reading the document, I found that swap type is already exported so we
> > could identify hwpoison entry with it (without new PM_HWPOISON bit).
> > One problem is that the format of swap types (like SWP_HWPOISON) depends
> > on a few config macros like CONFIG_DEVICE_PRIVATE and CONFIG_MIGRATION,
> > so we also need to export how the swap type field is interpreted.
> 
> I had similar question before.. though it was more on the generic swap entries
> not the special ones yet.
> 
> The thing is I don't know how the userspace could interpret normal swap device
> indexes out of reading pagemap, say if we have two swap devices with "swapon
> -s" then I've no idea how do we know which device has which swap type index
> allocated.  That seems to be a similar question asked above on special swap
> types - the interface seems to be incomplete, if not unused at all.
> 
> AFAIU the information on "this page is swapped out to device X on offset Y" is
> not reliable too, because the pagein/pageout from kernel is transparent to the
> userspace and not under control of userspace at all.  IOW, if the user reads
> that swap entry, then reads data upon the disk of that offset out and put it
> somewhere else, then it means the data read could already be old if kernel
> paged in the page after userspace reading the pagemap but before it reading the
> disk, and I don't see any way to make it right unless the userspace could stop
> the kernel from page-in a swap entry.  That's why I really wonder whether we
> should expose normal swap entry at all, as I don't know how it could be helpful
> and used in the 100% right way.

Thank you for the feedback.

I think that a process interested in controlling swap-in/out behavior in its own
typically calls mincore() to get current status and madvise() to trigger swap-in/out.
That's not 100% solution for the same reason, but it mostly works well because
calling madvise(MADV_PAGEOUT) to already swapped out is not a big issue (although
some CPU/memory resource is wasted, but the amount of the waste is small if the
returned info is new enough). 
So my point is that the concern around information newness might be more generic
issue rather than just for pagemap.  If we need 100% accurate in-kernel info,
maybe it had better be done in kernel (or some cooler stuff like eBPF)?

> 
> Special swap entries seem a bit different - at least for is_pfn_swap_entry()
> typed swap entries we can still expose the PFN which might be helpful, which I
> can't tell.

I'm one who think it helpful for testing, although I know testing might not be
considered as a real usecase.

> 
> I used to send an email to Matt Mackall <mpm@selenic.com> and Dave Hansen
> <dave.hansen@linux.intel.com> asking about above but didn't get a reply. Ccing
> again this time with the list copied.
> 
> > 
> > I thought of adding new interfaces for example under /sys/kernel/mm/swap/type_format/,
> > which shows info like below (assuming that all CONFIG_{DEVICE_PRIVATE,MIGRATION,MEMORY_FAILURE}
> > is enabled):
> > 
> >   $ ls /sys/kernel/mm/swap/type_format/
> >   hwpoison
> >   migration_read
> >   migration_write
> >   device_write
> >   device_read
> >   device_exclusive_write
> >   device_exclusive_read
> >   
> >   $ cat /sys/kernel/mm/swap/type_format/hwpoison
> >   25
> >   
> >   $ cat /sys/kernel/mm/swap/type_format/device_write
> >   28
> > 
> > Does it make sense or any better approach?
> 
> Then I'm wondering whether we care about the rest of the normal swap devices
> too with pagemap so do we need to expose some information there too (only if
> there's a real use case, though..)?  Or... should we just don't expose swap
> entries at all, at least generic swap entries?  We can still expose things like
> hwpoison via PM_* bits well defined in that case.

I didn't think about normal swap devices for no reason. I'm OK to stop exposing
normal swap device part.  I don't have strong option yet about which approach
(in swaptype or PM_HWPOISON) I'll suggest next (so wait a little more for feedback).

Thanks,
Naoya Horiguchi
Peter Xu Oct. 27, 2021, 7:02 a.m. UTC | #11
On Wed, Oct 27, 2021 at 03:45:13PM +0900, Naoya Horiguchi wrote:
> On Wed, Oct 27, 2021 at 10:09:03AM +0800, Peter Xu wrote:
> > On Wed, Oct 27, 2021 at 08:27:36AM +0900, Naoya Horiguchi wrote:
> > > On Mon, Oct 04, 2021 at 11:32:28PM +0900, Naoya Horiguchi wrote:
> > > > On Mon, Oct 04, 2021 at 01:55:30PM +0200, David Hildenbrand wrote:
> > > > > On 04.10.21 13:50, Naoya Horiguchi wrote:
> > > ...
> > > > > >
> > > > > > Hwpoison entry for hugepage is also exposed by this patch. The below
> > > > > > example shows how pagemap is visible in the case where a memory error
> > > > > > hit a hugepage mapped to a process.
> > > > > >
> > > > > >      $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
> > > > > >      voffset offset  len     flags
> > > > > >      700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
> > > > > >      700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
> > > > > >      700000200       12f800  1       __________B________X_______________f______w_
> > > > > >      700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
> > > > > >      700000202       12f802  1fe     __________B________X_______________f______w_
> > > > > >
> > > > > > The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
> > > > > > flag) are considered as hwpoison entries.  So all pages in 2MB range
> > > > > > are inaccessible from the process.  We can get actual error location
> > > > > > by page-types in physical address mode.
> > > > > >
> > > > > >      $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
> > > > > >      offset  len     flags
> > > > > >      12f800  1       __________B_________________________________
> > > > > >      12f801  1       ___________________X________________________
> > > > > >      12f802  1fe     __________B_________________________________
> > > > > >
> > > > > > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > > > > > ---
> > > > > >   fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
> > > > > >   include/linux/swapops.h | 13 +++++++++++++
> > > > > >   tools/vm/page-types.c   |  7 ++++++-
> > > > > >   3 files changed, 51 insertions(+), 10 deletions(-)
> > > > >
> > > > >
> > > > > Please also update the documentation located at
> > > > >
> > > > > Documentation/admin-guide/mm/pagemap.rst
> > > >
> > > > I will do this in the next post.
> > > 
> > > Reading the document, I found that swap type is already exported so we
> > > could identify hwpoison entry with it (without new PM_HWPOISON bit).
> > > One problem is that the format of swap types (like SWP_HWPOISON) depends
> > > on a few config macros like CONFIG_DEVICE_PRIVATE and CONFIG_MIGRATION,
> > > so we also need to export how the swap type field is interpreted.
> > 
> > I had similar question before.. though it was more on the generic swap entries
> > not the special ones yet.
> > 
> > The thing is I don't know how the userspace could interpret normal swap device
> > indexes out of reading pagemap, say if we have two swap devices with "swapon
> > -s" then I've no idea how do we know which device has which swap type index
> > allocated.  That seems to be a similar question asked above on special swap
> > types - the interface seems to be incomplete, if not unused at all.
> > 
> > AFAIU the information on "this page is swapped out to device X on offset Y" is
> > not reliable too, because the pagein/pageout from kernel is transparent to the
> > userspace and not under control of userspace at all.  IOW, if the user reads
> > that swap entry, then reads data upon the disk of that offset out and put it
> > somewhere else, then it means the data read could already be old if kernel
> > paged in the page after userspace reading the pagemap but before it reading the
> > disk, and I don't see any way to make it right unless the userspace could stop
> > the kernel from page-in a swap entry.  That's why I really wonder whether we
> > should expose normal swap entry at all, as I don't know how it could be helpful
> > and used in the 100% right way.
> 
> Thank you for the feedback.
> 
> I think that a process interested in controlling swap-in/out behavior in its own
> typically calls mincore() to get current status and madvise() to trigger swap-in/out.
> That's not 100% solution for the same reason, but it mostly works well because
> calling madvise(MADV_PAGEOUT) to already swapped out is not a big issue (although
> some CPU/memory resource is wasted, but the amount of the waste is small if the
> returned info is new enough). 
> So my point is that the concern around information newness might be more generic
> issue rather than just for pagemap.  If we need 100% accurate in-kernel info,
> maybe it had better be done in kernel (or some cooler stuff like eBPF)?

I fully agree the solution you mentioned with mincore() and madvise(), that is
very sane and working approach.  Though IMHO the major thing I wanted to point
out is for generic swap devices we exposed (disk_index, disk_offset) tuple as
the swap entry (besides "whether this page is swapped out or not"; that's
PM_SWAP, and as you mentioned people'll need to rely on mincore() to make it
right for shmem), though to use it we need to either record the index/offset or
read/write data from it.  However none of them will make sense, IMHO..  So I
think exposing PM_SWAP makes sense, not the swap entries on swap devices.

> 
> > 
> > Special swap entries seem a bit different - at least for is_pfn_swap_entry()
> > typed swap entries we can still expose the PFN which might be helpful, which I
> > can't tell.
> 
> I'm one who think it helpful for testing, although I know testing might not be
> considered as a real usecase.

I think testing is valid use case too.

> 
> > 
> > I used to send an email to Matt Mackall <mpm@selenic.com> and Dave Hansen
> > <dave.hansen@linux.intel.com> asking about above but didn't get a reply. Ccing
> > again this time with the list copied.
> > 
> > > 
> > > I thought of adding new interfaces for example under /sys/kernel/mm/swap/type_format/,
> > > which shows info like below (assuming that all CONFIG_{DEVICE_PRIVATE,MIGRATION,MEMORY_FAILURE}
> > > is enabled):
> > > 
> > >   $ ls /sys/kernel/mm/swap/type_format/
> > >   hwpoison
> > >   migration_read
> > >   migration_write
> > >   device_write
> > >   device_read
> > >   device_exclusive_write
> > >   device_exclusive_read
> > >   
> > >   $ cat /sys/kernel/mm/swap/type_format/hwpoison
> > >   25
> > >   
> > >   $ cat /sys/kernel/mm/swap/type_format/device_write
> > >   28
> > > 
> > > Does it make sense or any better approach?
> > 
> > Then I'm wondering whether we care about the rest of the normal swap devices
> > too with pagemap so do we need to expose some information there too (only if
> > there's a real use case, though..)?  Or... should we just don't expose swap
> > entries at all, at least generic swap entries?  We can still expose things like
> > hwpoison via PM_* bits well defined in that case.
> 
> I didn't think about normal swap devices for no reason. I'm OK to stop exposing
> normal swap device part.  I don't have strong option yet about which approach
> (in swaptype or PM_HWPOISON) I'll suggest next (so wait a little more for feedback).

No strong opinion here too. It's just that the new interface proposed reminded
me that it's partially complete if considering we're also exposing swap entries
on swap devices, so the types didn't cover those entries.  However it's more
like a pure question because I never figured out how those entries will work
anyway.  I'd be willing to know whether Dave Hanson would comment on this.

While the PM_HWPOISON approach looks always sane to me.

Thanks,
David Hildenbrand Oct. 27, 2021, 7:15 a.m. UTC | #12
On 27.10.21 09:02, Peter Xu wrote:
> On Wed, Oct 27, 2021 at 03:45:13PM +0900, Naoya Horiguchi wrote:
>> On Wed, Oct 27, 2021 at 10:09:03AM +0800, Peter Xu wrote:
>>> On Wed, Oct 27, 2021 at 08:27:36AM +0900, Naoya Horiguchi wrote:
>>>> On Mon, Oct 04, 2021 at 11:32:28PM +0900, Naoya Horiguchi wrote:
>>>>> On Mon, Oct 04, 2021 at 01:55:30PM +0200, David Hildenbrand wrote:
>>>>>> On 04.10.21 13:50, Naoya Horiguchi wrote:
>>>> ...
>>>>>>>
>>>>>>> Hwpoison entry for hugepage is also exposed by this patch. The below
>>>>>>> example shows how pagemap is visible in the case where a memory error
>>>>>>> hit a hugepage mapped to a process.
>>>>>>>
>>>>>>>      $ ./page-types --no-summary --pid $PID --raw --list --addr 0x700000000+0x400
>>>>>>>      voffset offset  len     flags
>>>>>>>      700000000       12fa00  1       ___U_______Ma__H_G_________________f_______1
>>>>>>>      700000001       12fa01  1ff     ___________Ma___TG_________________f_______1
>>>>>>>      700000200       12f800  1       __________B________X_______________f______w_
>>>>>>>      700000201       12f801  1       ___________________X_______________f______w_   // memory failure hit this page
>>>>>>>      700000202       12f802  1fe     __________B________X_______________f______w_
>>>>>>>
>>>>>>> The entries with both of "X" flag (hwpoison flag) and "w" flag (swap
>>>>>>> flag) are considered as hwpoison entries.  So all pages in 2MB range
>>>>>>> are inaccessible from the process.  We can get actual error location
>>>>>>> by page-types in physical address mode.
>>>>>>>
>>>>>>>      $ ./page-types --no-summary --addr 0x12f800+0x200 --raw --list
>>>>>>>      offset  len     flags
>>>>>>>      12f800  1       __________B_________________________________
>>>>>>>      12f801  1       ___________________X________________________
>>>>>>>      12f802  1fe     __________B_________________________________
>>>>>>>
>>>>>>> Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>>>>> ---
>>>>>>>   fs/proc/task_mmu.c      | 41 ++++++++++++++++++++++++++++++++---------
>>>>>>>   include/linux/swapops.h | 13 +++++++++++++
>>>>>>>   tools/vm/page-types.c   |  7 ++++++-
>>>>>>>   3 files changed, 51 insertions(+), 10 deletions(-)
>>>>>>
>>>>>>
>>>>>> Please also update the documentation located at
>>>>>>
>>>>>> Documentation/admin-guide/mm/pagemap.rst
>>>>>
>>>>> I will do this in the next post.
>>>>
>>>> Reading the document, I found that swap type is already exported so we
>>>> could identify hwpoison entry with it (without new PM_HWPOISON bit).
>>>> One problem is that the format of swap types (like SWP_HWPOISON) depends
>>>> on a few config macros like CONFIG_DEVICE_PRIVATE and CONFIG_MIGRATION,
>>>> so we also need to export how the swap type field is interpreted.
>>>
>>> I had similar question before.. though it was more on the generic swap entries
>>> not the special ones yet.
>>>
>>> The thing is I don't know how the userspace could interpret normal swap device
>>> indexes out of reading pagemap, say if we have two swap devices with "swapon
>>> -s" then I've no idea how do we know which device has which swap type index
>>> allocated.  That seems to be a similar question asked above on special swap
>>> types - the interface seems to be incomplete, if not unused at all.
>>>
>>> AFAIU the information on "this page is swapped out to device X on offset Y" is
>>> not reliable too, because the pagein/pageout from kernel is transparent to the
>>> userspace and not under control of userspace at all.  IOW, if the user reads
>>> that swap entry, then reads data upon the disk of that offset out and put it
>>> somewhere else, then it means the data read could already be old if kernel
>>> paged in the page after userspace reading the pagemap but before it reading the
>>> disk, and I don't see any way to make it right unless the userspace could stop
>>> the kernel from page-in a swap entry.  That's why I really wonder whether we
>>> should expose normal swap entry at all, as I don't know how it could be helpful
>>> and used in the 100% right way.
>>
>> Thank you for the feedback.
>>
>> I think that a process interested in controlling swap-in/out behavior in its own
>> typically calls mincore() to get current status and madvise() to trigger swap-in/out.
>> That's not 100% solution for the same reason, but it mostly works well because
>> calling madvise(MADV_PAGEOUT) to already swapped out is not a big issue (although
>> some CPU/memory resource is wasted, but the amount of the waste is small if the
>> returned info is new enough). 
>> So my point is that the concern around information newness might be more generic
>> issue rather than just for pagemap.  If we need 100% accurate in-kernel info,
>> maybe it had better be done in kernel (or some cooler stuff like eBPF)?
> 
> I fully agree the solution you mentioned with mincore() and madvise(), that is
> very sane and working approach.  Though IMHO the major thing I wanted to point
> out is for generic swap devices we exposed (disk_index, disk_offset) tuple as
> the swap entry (besides "whether this page is swapped out or not"; that's
> PM_SWAP, and as you mentioned people'll need to rely on mincore() to make it
> right for shmem), though to use it we need to either record the index/offset or
> read/write data from it.  However none of them will make sense, IMHO..  So I
> think exposing PM_SWAP makes sense, not the swap entries on swap devices.
> 
>>
>>>
>>> Special swap entries seem a bit different - at least for is_pfn_swap_entry()
>>> typed swap entries we can still expose the PFN which might be helpful, which I
>>> can't tell.
>>
>> I'm one who think it helpful for testing, although I know testing might not be
>> considered as a real usecase.
> 
> I think testing is valid use case too.
> 
>>
>>>
>>> I used to send an email to Matt Mackall <mpm@selenic.com> and Dave Hansen
>>> <dave.hansen@linux.intel.com> asking about above but didn't get a reply. Ccing
>>> again this time with the list copied.
>>>
>>>>
>>>> I thought of adding new interfaces for example under /sys/kernel/mm/swap/type_format/,
>>>> which shows info like below (assuming that all CONFIG_{DEVICE_PRIVATE,MIGRATION,MEMORY_FAILURE}
>>>> is enabled):
>>>>
>>>>   $ ls /sys/kernel/mm/swap/type_format/
>>>>   hwpoison
>>>>   migration_read
>>>>   migration_write
>>>>   device_write
>>>>   device_read
>>>>   device_exclusive_write
>>>>   device_exclusive_read
>>>>   
>>>>   $ cat /sys/kernel/mm/swap/type_format/hwpoison
>>>>   25
>>>>   
>>>>   $ cat /sys/kernel/mm/swap/type_format/device_write
>>>>   28
>>>>
>>>> Does it make sense or any better approach?
>>>
>>> Then I'm wondering whether we care about the rest of the normal swap devices
>>> too with pagemap so do we need to expose some information there too (only if
>>> there's a real use case, though..)?  Or... should we just don't expose swap
>>> entries at all, at least generic swap entries?  We can still expose things like
>>> hwpoison via PM_* bits well defined in that case.
>>
>> I didn't think about normal swap devices for no reason. I'm OK to stop exposing
>> normal swap device part.  I don't have strong option yet about which approach
>> (in swaptype or PM_HWPOISON) I'll suggest next (so wait a little more for feedback).
> 
> No strong opinion here too. It's just that the new interface proposed reminded
> me that it's partially complete if considering we're also exposing swap entries
> on swap devices, so the types didn't cover those entries.  However it's more
> like a pure question because I never figured out how those entries will work
> anyway.  I'd be willing to know whether Dave Hanson would comment on this.
> 
> While the PM_HWPOISON approach looks always sane to me.

I consider that somehow cleaner, because how HWPOISON entries are
implemented ("fake swap entries") is somewhat an internal implementation
detail.

(I also agree that PM_SWAP makes sense, but maybe really only when we're
actually dealing with something that has been/is currently being swapped
out. Maybe we should just not expose fake swap entries via PM_SWAP and
instead use proper PM_ types for that. PM_MIGRATION, PM_HWPOISON, ...)
diff mbox series

Patch

diff --git v5.15-rc3/fs/proc/task_mmu.c v5.15-rc3_patched/fs/proc/task_mmu.c
index cf25be3e0321..bfc4772a58fb 100644
--- v5.15-rc3/fs/proc/task_mmu.c
+++ v5.15-rc3_patched/fs/proc/task_mmu.c
@@ -1298,6 +1298,7 @@  struct pagemapread {
 #define PM_SOFT_DIRTY		BIT_ULL(55)
 #define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
 #define PM_UFFD_WP		BIT_ULL(57)
+#define PM_HWPOISON		BIT_ULL(60)
 #define PM_FILE			BIT_ULL(61)
 #define PM_SWAP			BIT_ULL(62)
 #define PM_PRESENT		BIT_ULL(63)
@@ -1386,6 +1387,10 @@  static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		flags |= PM_SWAP;
 		if (is_pfn_swap_entry(entry))
 			page = pfn_swap_entry_to_page(entry);
+		if (is_hwpoison_entry(entry)) {
+			page = hwpoison_entry_to_page(entry);
+			flags |= PM_HWPOISON;
+		}
 	}
 
 	if (page && !PageAnon(page))
@@ -1505,34 +1510,52 @@  static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 	u64 flags = 0, frame = 0;
 	int err = 0;
 	pte_t pte;
+	struct page *page = NULL;
 
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		flags |= PM_SOFT_DIRTY;
 
 	pte = huge_ptep_get(ptep);
 	if (pte_present(pte)) {
-		struct page *page = pte_page(pte);
-
-		if (!PageAnon(page))
-			flags |= PM_FILE;
-
-		if (page_mapcount(page) == 1)
-			flags |= PM_MMAP_EXCLUSIVE;
+		page = pte_page(pte);
 
 		flags |= PM_PRESENT;
 		if (pm->show_pfn)
 			frame = pte_pfn(pte) +
 				((addr & ~hmask) >> PAGE_SHIFT);
+	} else if (is_swap_pte(pte)) {
+		swp_entry_t entry = pte_to_swp_entry(pte);
+		unsigned long offset;
+
+		if (pm->show_pfn) {
+			offset = swp_offset(entry) +
+				((addr & ~hmask) >> PAGE_SHIFT);
+			frame = swp_type(entry) |
+				(offset << MAX_SWAPFILES_SHIFT);
+		}
+		flags |= PM_SWAP;
+		if (is_migration_entry(entry))
+			page = compound_head(pfn_swap_entry_to_page(entry));
+		if (is_hwpoison_entry(entry))
+			flags |= PM_HWPOISON;
 	}
 
+	if (page && !PageAnon(page))
+		flags |= PM_FILE;
+	if (page && page_mapcount(page) == 1)
+		flags |= PM_MMAP_EXCLUSIVE;
+
 	for (; addr != end; addr += PAGE_SIZE) {
 		pagemap_entry_t pme = make_pme(frame, flags);
 
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
 			return err;
-		if (pm->show_pfn && (flags & PM_PRESENT))
-			frame++;
+		if (pm->show_pfn)
+			if (flags & PM_PRESENT)
+				frame++;
+			else if (flags & PM_SWAP)
+				frame += (1 << MAX_SWAPFILES_SHIFT);
 	}
 
 	cond_resched();
diff --git v5.15-rc3/include/linux/swapops.h v5.15-rc3_patched/include/linux/swapops.h
index d356ab4047f7..bb6141e5c069 100644
--- v5.15-rc3/include/linux/swapops.h
+++ v5.15-rc3_patched/include/linux/swapops.h
@@ -360,6 +360,14 @@  static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
 	return swp_offset(entry);
 }
 
+static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
+{
+	struct page *p = pfn_to_page(swp_offset(entry));
+
+	WARN_ON(!PageHWPoison(p));
+	return p;
+}
+
 static inline void num_poisoned_pages_inc(void)
 {
 	atomic_long_inc(&num_poisoned_pages);
@@ -382,6 +390,11 @@  static inline int is_hwpoison_entry(swp_entry_t swp)
 	return 0;
 }
 
+static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
+{
+	return NULL;
+}
+
 static inline void num_poisoned_pages_inc(void)
 {
 }
diff --git v5.15-rc3/tools/vm/page-types.c v5.15-rc3_patched/tools/vm/page-types.c
index b1ed76d9a979..483e417fda41 100644
--- v5.15-rc3/tools/vm/page-types.c
+++ v5.15-rc3_patched/tools/vm/page-types.c
@@ -53,6 +53,7 @@ 
 #define PM_SWAP_OFFSET(x)	(((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
 #define PM_SOFT_DIRTY		(1ULL << 55)
 #define PM_MMAP_EXCLUSIVE	(1ULL << 56)
+#define PM_HWPOISON		(1ULL << 60)
 #define PM_FILE			(1ULL << 61)
 #define PM_SWAP			(1ULL << 62)
 #define PM_PRESENT		(1ULL << 63)
@@ -311,6 +312,8 @@  static unsigned long pagemap_pfn(uint64_t val)
 
 	if (val & PM_PRESENT)
 		pfn = PM_PFRAME(val);
+	else if (val & PM_SWAP)
+		pfn = PM_SWAP_OFFSET(val);
 	else
 		pfn = 0;
 
@@ -492,6 +495,8 @@  static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
 		flags |= BIT(FILE);
 	if (pme & PM_SWAP)
 		flags |= BIT(SWAP);
+	if (pme & PM_HWPOISON)
+		flags |= BIT(HWPOISON);
 	if (pme & PM_MMAP_EXCLUSIVE)
 		flags |= BIT(MMAP_EXCLUSIVE);
 
@@ -742,7 +747,7 @@  static void walk_vma(unsigned long index, unsigned long count)
 			pfn = pagemap_pfn(buf[i]);
 			if (pfn)
 				walk_pfn(index + i, pfn, 1, buf[i]);
-			if (buf[i] & PM_SWAP)
+			else if (buf[i] & PM_SWAP)
 				walk_swap(index + i, buf[i]);
 		}