
[v6,2/3] fs/proc/task_mmu: Implement IOCTL to get and/or the clear info about PTEs

Message ID 20221109102303.851281-3-usama.anjum@collabora.com (mailing list archive)
State New, archived
Series Implement IOCTL to get and/or the clear info about PTEs

Commit Message

Muhammad Usama Anjum Nov. 9, 2022, 10:23 a.m. UTC
This IOCTL, PAGEMAP_SCAN, can be used to get and/or clear the info about
page table entries. The following operations are supported in this ioctl:
- Get the information if the pages are soft-dirty, file mapped, present
  or swapped.
- Clear the soft-dirty PTE bit of the pages.
- Get and clear the soft-dirty PTE bit of the pages.

Only the soft-dirty bit can be read and cleared atomically. struct
pagemap_scan_arg is used as the argument of the IOCTL. In this struct:
- The range is specified through start and len.
- The output buffer and its size are specified as vec and vec_len.
- The optional maximum number of requested pages is specified in max_pages.
- The flags can be specified in the flags field. The PAGEMAP_SOFTDIRTY_CLEAR
  and PAGEMAP_NO_REUSED_REGIONS flags are supported.
- The masks are specified in required_mask, anyof_mask, excluded_mask and
  return_mask.

This IOCTL can be extended to get information about more PTE bits.
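
For illustration, a minimal userspace sketch of driving the interface (field,
flag and bit names come from this patch; the exact uapi field types and the
PAGEMAP_SCAN request definition are assumed here, as they live in the uapi
header hunk not quoted below):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* pagemap_scan_arg, page_region, PAGEMAP_SCAN (after this series) */

/* Hypothetical usage sketch, not part of the patch itself. */
static int scan_and_clear_soft_dirty(int pagemap_fd, void *addr, uint64_t len)
{
	struct page_region vec[512];
	struct pagemap_scan_arg arg = { 0 };

	arg.start = (uintptr_t)addr;		/* must be PAGE_SIZE aligned */
	arg.len = len;
	arg.vec = (uintptr_t)vec;		/* output buffer */
	arg.vec_len = 512;			/* number of page_region slots */
	arg.max_pages = 0;			/* 0 == no limit on reported pages */
	arg.flags = PAGEMAP_SOFTDIRTY_CLEAR;	/* get and clear atomically */
	arg.required_mask = PAGE_IS_SOFTDIRTY;
	arg.return_mask = PAGE_IS_SOFTDIRTY;

	/* Returns the number of filled page_region entries, 0 if nothing
	 * matched, or a negative error code. */
	return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
}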

This is based on a patch from Gabriel Krisman Bertazi.

Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
---
Changes in v6:
- Rename variables and update comments
- Make IOCTL independent of soft_dirty config
- Change masks and bitmap type to __u64
- Improve code quality

Changes in v5:
- Remove tlb flushing even for clear operation

Changes in v4:
- Update the interface and implementation

Changes in v3:
- Tighten the user-kernel interface by using explicit types and add more
  error checking

Changes in v2:
- Convert the interface from syscall to ioctl
- Remove pidfd support as it doesn't make sense in ioctl
---
 fs/proc/task_mmu.c            | 328 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/fs.h       |  56 ++++++
 tools/include/uapi/linux/fs.h |  56 ++++++
 3 files changed, 440 insertions(+)

Comments

Andrei Vagin Nov. 9, 2022, 11:54 p.m. UTC | #1
Hi Muhammad,

Here are a few inline comments.

On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
> This IOCTL, PAGEMAP_SCAN, can be used to get and/or clear the info about
> page table entries. The following operations are supported in this ioctl:
> - Get the information if the pages are soft-dirty, file mapped, present
>   or swapped.
> - Clear the soft-dirty PTE bit of the pages.
> - Get and clear the soft-dirty PTE bit of the pages.
> 
> Only the soft-dirty bit can be read and cleared atomically. struct
> pagemap_scan_arg is used as the argument of the IOCTL. In this struct:
> - The range is specified through start and len.
> - The output buffer and its size are specified as vec and vec_len.
> - The optional maximum number of requested pages is specified in max_pages.
> - The flags can be specified in the flags field. The PAGEMAP_SOFTDIRTY_CLEAR
>   and PAGEMAP_NO_REUSED_REGIONS flags are supported.
> - The masks are specified in required_mask, anyof_mask, excluded_mask and
>   return_mask.
> 
> This IOCTL can be extended to get information about more PTE bits.
> 
> This is based on a patch from Gabriel Krisman Bertazi.
> 
> Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
> ---
> Changes in v6:
> - Rename variables and update comments
> - Make IOCTL independent of soft_dirty config
> - Change masks and bitmap type to __u64
> - Improve code quality
> 
> Changes in v5:
> - Remove tlb flushing even for clear operation
> 
> Changes in v4:
> - Update the interface and implementation
> 
> Changes in v3:
> - Tighten the user-kernel interface by using explicit types and add more
>   error checking
> 
> Changes in v2:
> - Convert the interface from syscall to ioctl
> - Remove pidfd support as it doesn't make sense in ioctl
> ---
>  fs/proc/task_mmu.c            | 328 ++++++++++++++++++++++++++++++++++
>  include/uapi/linux/fs.h       |  56 ++++++
>  tools/include/uapi/linux/fs.h |  56 ++++++
>  3 files changed, 440 insertions(+)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 8235c536ac70..8d6a84ec5ef7 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -19,6 +19,9 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/uaccess.h>
>  #include <linux/pkeys.h>
> +#include <uapi/linux/fs.h>
> +#include <linux/vmalloc.h>
> +#include <linux/minmax.h>
>  
>  #include <asm/elf.h>
>  #include <asm/tlb.h>
> @@ -1775,11 +1778,336 @@ static int pagemap_release(struct inode *inode, struct file *file)
>  	return 0;
>  }
>  
> +#define PAGEMAP_OP_MASK		(PAGE_IS_SOFTDIRTY | PAGE_IS_FILE |	\
> +				 PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
> +#define PAGEMAP_NONSD_OP_MASK	(PAGE_IS_FILE |	PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
> +#define PAGEMAP_SD_FLAGS	(PAGEMAP_SOFTDIRTY_CLEAR | PAGEMAP_NO_REUSED_REGIONS)
> +#define IS_CLEAR_OP(a)		(a->flags & PAGEMAP_SOFTDIRTY_CLEAR)
> +#define IS_GET_OP(a)		(a->vec)
> +#define IS_SD_OP(a)		(a->flags & PAGEMAP_SD_FLAGS)
> +
> +struct pagemap_scan_private {
> +	struct page_region *vec;
> +	unsigned long vec_len;
> +	unsigned long vec_index;
> +	unsigned int max_pages;
> +	unsigned int found_pages;
> +	unsigned int flags;
> +	unsigned long required_mask;
> +	unsigned long anyof_mask;
> +	unsigned long excluded_mask;
> +	unsigned long return_mask;
> +};
> +
> +static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (IS_GET_OP(p) && p->max_pages && (p->found_pages == p->max_pages))
> +		return -1;
> +
> +	if (vma->vm_flags & VM_PFNMAP)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
> +		      unsigned long addr, unsigned int len)
> +{
> +	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;

Should we define constants for each of these bits?

> +	bool cpy = true;
> +
> +	if (p->required_mask)
> +		cpy = ((p->required_mask & cur) == p->required_mask);
> +	if (cpy && p->anyof_mask)
> +		cpy = (p->anyof_mask & cur);
> +	if (cpy && p->excluded_mask)
> +		cpy = !(p->excluded_mask & cur);
> +
> +	bitmap = cur & p->return_mask;
> +
> +	if (cpy && bitmap) {
> +		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
> +		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
> +		     addr)) {

I think it is better to define a variable for p->vec_index - 1.
nit: len can be in bytes rather than pages.

> +			p->vec[p->vec_index - 1].len += len;
> +			p->found_pages += len;
> +		} else if (p->vec_index < p->vec_len) {
> +			p->vec[p->vec_index].start = addr;
> +			p->vec[p->vec_index].len = len;
> +			p->found_pages += len;
> +			p->vec[p->vec_index].bitmap = bitmap;
> +			p->vec_index++;
> +		} else {
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
> +				  unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned int len;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +	pte_t *pte;
> +	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
> +			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
> +
> +	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
> +		return 0;
> +
> +	end = min(end, walk->vma->vm_end);
> +
> +	ptl = pmd_trans_huge_lock(pmd, vma);
> +	if (ptl) {
> +		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
> +			/*
> +			 * Break the huge page into small pages if the operation to be performed
> +			 * is on a portion of the huge page or the return buffer cannot store
> +			 * the complete data.
> +			 */
> +			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
> +				spin_unlock(ptl);
> +				split_huge_pmd(vma, pmd, addr);
> +				goto process_smaller_pages;
> +			}
> +
> +			if (IS_GET_OP(p)) {
> +				len = (end - addr)/PAGE_SIZE;
> +				if (p->max_pages && p->found_pages + len > p->max_pages)
> +					len = p->max_pages - p->found_pages;
> +
> +				ret = add_to_out(dirty_vma ||
> +						 check_soft_dirty_pmd(vma, addr, pmd, false),

can we reuse a return code of the previous call of check_soft_dirty_pmd?

> +						 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
> +						 p, addr, len);
> +			}
> +			if (!ret && IS_CLEAR_OP(p))
> +				check_soft_dirty_pmd(vma, addr, pmd, true);

should we return an error in this case? We need to be sure that:
* we stop walking page tables after this point.
* return this error to the user-space if we are not able to add anything
  in the vector.

> +		}
> +		spin_unlock(ptl);
> +		return 0;
> +	}
> +
> +process_smaller_pages:
> +	if (pmd_trans_unstable(pmd))
> +		return 0;
> +
> +	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +	for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
> +	     ; pte++, addr += PAGE_SIZE) {
> +		if (IS_GET_OP(p))
> +			ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
> +					 vma->vm_file, pte_present(*pte),
> +					 is_swap_pte(*pte), p, addr, 1);
> +		if (!ret && IS_CLEAR_OP(p))
> +			check_soft_dirty(vma, addr, pte, true);
> +	}
> +	pte_unmap_unlock(pte - 1, ptl);
> +	cond_resched();
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
> +				 struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned int len;
> +	bool sd;
> +
> +	if (vma) {
> +		/* Individual pages haven't been allocated and written */
> +		sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
> +		     (vma->vm_flags & VM_SOFTDIRTY);
> +
> +		len = (end - addr)/PAGE_SIZE;
> +		if (p->max_pages && p->found_pages + len > p->max_pages)
> +			len = p->max_pages - p->found_pages;
> +
> +		add_to_out(sd, vma->vm_file, false, false, p, addr, len);
> +	}
> +
> +	return 0;
> +}
> +
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned long end_cut = end;
> +	int ret;
> +
> +	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
> +	    (vma->vm_flags & VM_SOFTDIRTY)) {
> +		if (vma->vm_start < start) {
> +			ret = split_vma(vma->vm_mm, vma, start, 1);
> +			if (ret)
> +				return ret;
> +		}
> +		/* Calculate end_cut because of max_pages */
> +		if (IS_GET_OP(p) && p->max_pages)
> +			end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
> +
> +		if (vma->vm_end > end_cut) {
> +			ret = split_vma(vma->vm_mm, vma, end_cut, 0);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void pagemap_scan_post_vma(struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
> +	    (vma->vm_flags & VM_SOFTDIRTY)) {
> +		vma->vm_flags &= ~VM_SOFTDIRTY;
> +		vma_set_page_prot(vma);
> +	}
> +}
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +
> +static const struct mm_walk_ops pagemap_scan_ops = {
> +	.test_walk = pagemap_scan_pmd_test_walk,
> +	.pmd_entry = pagemap_scan_pmd_entry,
> +	.pte_hole = pagemap_scan_pte_hole,
> +
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +	/* Only for clearing SD bit over VMAs */
> +	.pre_vma = pagemap_scan_pre_vma,
> +	.post_vma = pagemap_scan_post_vma,
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +};
> +
> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
> +{
> +	struct mmu_notifier_range range;
> +	unsigned long __user start, end;
> +	struct pagemap_scan_private p;
> +	int ret;
> +
> +	start = (unsigned long)untagged_addr(arg->start);
> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
> +		return -EINVAL;
> +
> +	if (IS_GET_OP(arg) &&
> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
> +		return -ENOMEM;
> +
> +#ifndef CONFIG_MEM_SOFT_DIRTY
> +	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
> +	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
> +		return -EINVAL;
> +#endif
> +
> +	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
> +	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
> +	    (arg->return_mask & ~PAGEMAP_OP_MASK))
> +		return -EINVAL;
> +
> +	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
> +		return -EINVAL;
> +
> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
> +		return -EINVAL;
> +
> +	end = start + arg->len;
> +	p.max_pages = arg->max_pages;
> +	p.found_pages = 0;
> +	p.flags = arg->flags;
> +	p.required_mask = arg->required_mask;
> +	p.anyof_mask = arg->anyof_mask;
> +	p.excluded_mask = arg->excluded_mask;
> +	p.return_mask = arg->return_mask;
> +	p.vec_index = 0;
> +	p.vec_len = arg->vec_len;
> +
> +	if (IS_GET_OP(arg)) {
> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));

I think we need to set a reasonable limit for vec_len to avoid large
allocations on the kernel. We can consider to use kmalloc or kvmalloc
here.

Thanks,
Andrei
kernel test robot Nov. 10, 2022, 5:58 p.m. UTC | #2
Hi Muhammad,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on next-20221109]
[also build test WARNING on v6.1-rc4]
[cannot apply to shuah-kselftest/next shuah-kselftest/fixes linus/master v6.1-rc4 v6.1-rc3 v6.1-rc2]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
patch link:    https://lore.kernel.org/r/20221109102303.851281-3-usama.anjum%40collabora.com
patch subject: [PATCH v6 2/3] fs/proc/task_mmu: Implement IOCTL to get and/or the clear info about PTEs
config: m68k-allyesconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/b329378abd03a741ff7250ec1b60292c893476da
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
        git checkout b329378abd03a741ff7250ec1b60292c893476da
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash fs/proc/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   fs/proc/task_mmu.c: In function 'pagemap_scan_pmd_entry':
   fs/proc/task_mmu.c:1882:62: error: 'HPAGE_SIZE' undeclared (first use in this function); did you mean 'PAGE_SIZE'?
    1882 |                         if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
         |                                                              ^~~~~~~~~~
         |                                                              PAGE_SIZE
   fs/proc/task_mmu.c:1882:62: note: each undeclared identifier is reported only once for each function it appears in
   In file included from include/asm-generic/bug.h:5,
                    from arch/m68k/include/asm/bug.h:32,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:6,
                    from include/linux/pagewalk.h:5,
                    from fs/proc/task_mmu.c:2:
   fs/proc/task_mmu.c: In function 'do_pagemap_sd_cmd':
>> fs/proc/task_mmu.c:2014:49: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
    2014 |             ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
         |                                                 ^
   include/linux/compiler.h:77:45: note: in definition of macro 'likely'
      77 | # define likely(x)      __builtin_expect(!!(x), 1)
         |                                             ^
   fs/proc/task_mmu.c:2014:39: note: in expansion of macro 'access_ok'
    2014 |             ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
         |                                       ^~~~~~~~~
   fs/proc/task_mmu.c:2079:34: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
    2079 |                 if (copy_to_user((struct page_region *)arg->vec, p.vec,
         |                                  ^


vim +2014 fs/proc/task_mmu.c

  1856	
  1857	static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
  1858					  unsigned long end, struct mm_walk *walk)
  1859	{
  1860		struct pagemap_scan_private *p = walk->private;
  1861		struct vm_area_struct *vma = walk->vma;
  1862		unsigned int len;
  1863		spinlock_t *ptl;
  1864		int ret = 0;
  1865		pte_t *pte;
  1866		bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
  1867				 (false) : (vma->vm_flags & VM_SOFTDIRTY);
  1868	
  1869		if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
  1870			return 0;
  1871	
  1872		end = min(end, walk->vma->vm_end);
  1873	
  1874		ptl = pmd_trans_huge_lock(pmd, vma);
  1875		if (ptl) {
  1876			if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
  1877				/*
  1878				 * Break huge page into small pages if operation needs to be performed is
  1879				 * on a portion of the huge page or the return buffer cannot store complete
  1880				 * data.
  1881				 */
> 1882				if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
  1883					spin_unlock(ptl);
  1884					split_huge_pmd(vma, pmd, addr);
  1885					goto process_smaller_pages;
  1886				}
  1887	
  1888				if (IS_GET_OP(p)) {
  1889					len = (end - addr)/PAGE_SIZE;
  1890					if (p->max_pages && p->found_pages + len > p->max_pages)
  1891						len = p->max_pages - p->found_pages;
  1892	
  1893					ret = add_to_out(dirty_vma ||
  1894							 check_soft_dirty_pmd(vma, addr, pmd, false),
  1895							 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
  1896							 p, addr, len);
  1897				}
  1898				if (!ret && IS_CLEAR_OP(p))
  1899					check_soft_dirty_pmd(vma, addr, pmd, true);
  1900			}
  1901			spin_unlock(ptl);
  1902			return 0;
  1903		}
  1904	
  1905	process_smaller_pages:
  1906		if (pmd_trans_unstable(pmd))
  1907			return 0;
  1908	
  1909		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  1910		for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
  1911		     ; pte++, addr += PAGE_SIZE) {
  1912			if (IS_GET_OP(p))
  1913				ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
  1914						 vma->vm_file, pte_present(*pte),
  1915						 is_swap_pte(*pte), p, addr, 1);
  1916			if (!ret && IS_CLEAR_OP(p))
  1917				check_soft_dirty(vma, addr, pte, true);
  1918		}
  1919		pte_unmap_unlock(pte - 1, ptl);
  1920		cond_resched();
  1921	
  1922		return 0;
  1923	}
  1924	
  1925	static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
  1926					 struct mm_walk *walk)
  1927	{
  1928		struct pagemap_scan_private *p = walk->private;
  1929		struct vm_area_struct *vma = walk->vma;
  1930		unsigned int len;
  1931		bool sd;
  1932	
  1933		if (vma) {
  1934			/* Individual pages haven't been allocated and written */
  1935			sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
  1936			     (vma->vm_flags & VM_SOFTDIRTY);
  1937	
  1938			len = (end - addr)/PAGE_SIZE;
  1939			if (p->max_pages && p->found_pages + len > p->max_pages)
  1940				len = p->max_pages - p->found_pages;
  1941	
  1942			add_to_out(sd, vma->vm_file, false, false, p, addr, len);
  1943		}
  1944	
  1945		return 0;
  1946	}
  1947	
  1948	#ifdef CONFIG_MEM_SOFT_DIRTY
  1949	static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
  1950	{
  1951		struct pagemap_scan_private *p = walk->private;
  1952		struct vm_area_struct *vma = walk->vma;
  1953		unsigned long end_cut = end;
  1954		int ret;
  1955	
  1956		if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
  1957		    (vma->vm_flags & VM_SOFTDIRTY)) {
  1958			if (vma->vm_start < start) {
  1959				ret = split_vma(vma->vm_mm, vma, start, 1);
  1960				if (ret)
  1961					return ret;
  1962			}
  1963			/* Calculate end_cut because of max_pages */
  1964			if (IS_GET_OP(p) && p->max_pages)
  1965				end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
  1966	
  1967			if (vma->vm_end > end_cut) {
  1968				ret = split_vma(vma->vm_mm, vma, end_cut, 0);
  1969				if (ret)
  1970					return ret;
  1971			}
  1972		}
  1973	
  1974		return 0;
  1975	}
  1976	
  1977	static void pagemap_scan_post_vma(struct mm_walk *walk)
  1978	{
  1979		struct pagemap_scan_private *p = walk->private;
  1980		struct vm_area_struct *vma = walk->vma;
  1981	
  1982		if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
  1983		    (vma->vm_flags & VM_SOFTDIRTY)) {
  1984			vma->vm_flags &= ~VM_SOFTDIRTY;
  1985			vma_set_page_prot(vma);
  1986		}
  1987	}
  1988	#endif /* CONFIG_MEM_SOFT_DIRTY */
  1989	
  1990	static const struct mm_walk_ops pagemap_scan_ops = {
  1991		.test_walk = pagemap_scan_pmd_test_walk,
  1992		.pmd_entry = pagemap_scan_pmd_entry,
  1993		.pte_hole = pagemap_scan_pte_hole,
  1994	
  1995	#ifdef CONFIG_MEM_SOFT_DIRTY
  1996		/* Only for clearing SD bit over VMAs */
  1997		.pre_vma = pagemap_scan_pre_vma,
  1998		.post_vma = pagemap_scan_post_vma,
  1999	#endif /* CONFIG_MEM_SOFT_DIRTY */
  2000	};
  2001	
  2002	static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
  2003	{
  2004		struct mmu_notifier_range range;
  2005		unsigned long __user start, end;
  2006		struct pagemap_scan_private p;
  2007		int ret;
  2008	
  2009		start = (unsigned long)untagged_addr(arg->start);
  2010		if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
  2011			return -EINVAL;
  2012	
  2013		if (IS_GET_OP(arg) &&
> 2014		    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
  2015			return -ENOMEM;
  2016
Muhammad Usama Anjum Nov. 11, 2022, 10:10 a.m. UTC | #3
Hello Andrei,

Thank you for reviewing.

On 11/10/22 4:54 AM, Andrei Vagin wrote:
[...]
>> +static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
>> +		      unsigned long addr, unsigned int len)
>> +{
>> +	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;
> 
> Should we define constants for each of these bits?
I think I can define a macro to hide this dirty bit shifting in the function.
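
Roughly along these lines (just a sketch; the macro name here is made up):

/* Hypothetical helper: pack the per-page state into the bitmap layout
 * consumed by add_to_out() (bit 0 soft-dirty, bit 1 file, bit 2 present,
 * bit 3 swapped). */
#define PAGEMAP_BITS(sd, file, pres, swap) \
	((sd) | (file) << 1 | (pres) << 2 | (swap) << 3)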

> 
>> +	bool cpy = true;
>> +
>> +	if (p->required_mask)
>> +		cpy = ((p->required_mask & cur) == p->required_mask);
>> +	if (cpy && p->anyof_mask)
>> +		cpy = (p->anyof_mask & cur);
>> +	if (cpy && p->excluded_mask)
>> +		cpy = !(p->excluded_mask & cur);
>> +
>> +	bitmap = cur & p->return_mask;
>> +
>> +	if (cpy && bitmap) {
>> +		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
>> +		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
>> +		     addr)) {
> 
> I think it is better to define a variable for p->vec_index - 1.
Will do in the next revision.
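Roughly (a sketch of the readability change only, not the final code):

	struct page_region *prev = p->vec_index ? &p->vec[p->vec_index - 1] : NULL;

	/* Extend the previous region when the flags match and the new range
	 * is contiguous with it. */
	if (prev && prev->bitmap == bitmap &&
	    prev->start + prev->len * PAGE_SIZE == addr) {
		prev->len += len;
		p->found_pages += len;
	}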

> nit: len can be in bytes rather than pages.
We are considering memory in page units. The memory given to this IOCTL
must have PAGE_SIZE alignment. Otherwise we error out (picked this from
mincore()).

>> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
>> +				  unsigned long end, struct mm_walk *walk)
>> +{
>> +	struct pagemap_scan_private *p = walk->private;
>> +	struct vm_area_struct *vma = walk->vma;
>> +	unsigned int len;
>> +	spinlock_t *ptl;
>> +	int ret = 0;
>> +	pte_t *pte;
>> +	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
>> +			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
>> +
>> +	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
>> +		return 0;
>> +
>> +	end = min(end, walk->vma->vm_end);
>> +
>> +	ptl = pmd_trans_huge_lock(pmd, vma);
>> +	if (ptl) {
>> +		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
>> +			/*
>> +			 * Break huge page into small pages if operation needs to be performed is
>> +			 * on a portion of the huge page or the return buffer cannot store complete
>> +			 * data.
>> +			 */
>> +			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
>> +				spin_unlock(ptl);
>> +				split_huge_pmd(vma, pmd, addr);
>> +				goto process_smaller_pages;
>> +			}
>> +
>> +			if (IS_GET_OP(p)) {
>> +				len = (end - addr)/PAGE_SIZE;
>> +				if (p->max_pages && p->found_pages + len > p->max_pages)
>> +					len = p->max_pages - p->found_pages;
>> +
>> +				ret = add_to_out(dirty_vma ||
>> +						 check_soft_dirty_pmd(vma, addr, pmd, false),
> 
> can we reuse a return code of the previous call of check_soft_dirty_pmd?
Yes, will do.

> 
>> +						 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
>> +						 p, addr, len);
>> +			}
>> +			if (!ret && IS_CLEAR_OP(p))
>> +				check_soft_dirty_pmd(vma, addr, pmd, true);
> 
> should we return an error in this case? We need to be sure that:
> * we stop walking page tables after this point.
I'll update the implementation to return an error. It immediately terminates
the walk as well.
> * return this error to the user-space if we are not able to add anything
>   in the vector.
I'm not returning an error to userspace if no page matching the masks is
found. The total number of filled page_region entries is returned from the
IOCTL. If the IOCTL returns 0, it means no matching page was found, but the
IOCTL executed successfully.
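
Something like this in pagemap_scan_pmd_entry(), as a sketch of the planned
change (not the actual v7 code):

			ret = add_to_out(dirty_vma ||
					 check_soft_dirty_pmd(vma, addr, pmd, false),
					 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
					 p, addr, len);
			/* A negative return aborts walk_page_range(), so the
			 * walk stops as soon as the output vector is full. */
			if (ret) {
				spin_unlock(ptl);
				return ret;
			}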

[...]
>> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
>> +{
>> +	struct mmu_notifier_range range;
>> +	unsigned long __user start, end;
>> +	struct pagemap_scan_private p;
>> +	int ret;
>> +
>> +	start = (unsigned long)untagged_addr(arg->start);
>> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
>> +		return -EINVAL;
>> +
>> +	if (IS_GET_OP(arg) &&
>> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
>> +		return -ENOMEM;
>> +
>> +#ifndef CONFIG_MEM_SOFT_DIRTY
>> +	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
>> +	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
>> +		return -EINVAL;
>> +#endif
>> +
>> +	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
>> +	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
>> +	    (arg->return_mask & ~PAGEMAP_OP_MASK))
>> +		return -EINVAL;
>> +
>> +	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
>> +		return -EINVAL;
>> +
>> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
>> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
>> +		return -EINVAL;
>> +
>> +	end = start + arg->len;
>> +	p.max_pages = arg->max_pages;
>> +	p.found_pages = 0;
>> +	p.flags = arg->flags;
>> +	p.required_mask = arg->required_mask;
>> +	p.anyof_mask = arg->anyof_mask;
>> +	p.excluded_mask = arg->excluded_mask;
>> +	p.return_mask = arg->return_mask;
>> +	p.vec_index = 0;
>> +	p.vec_len = arg->vec_len;
>> +
>> +	if (IS_GET_OP(arg)) {
>> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
> 
> I think we need to set a reasonable limit for vec_len to avoid large
> allocations on the kernel. We can consider to use kmalloc or kvmalloc
> here.
I'll update to kvzalloc, which falls back to vmalloc if kmalloc fails and uses
kmalloc for smaller allocations. Thanks for suggesting it. But it still won't
limit the memory allocation.
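
A sketch of what that could look like (the cap and its name are placeholders;
the real limit is still to be decided):

/* Hypothetical limit on the number of page_region entries per call. */
#define PAGEMAP_VEC_MAX_LEN	(512 * 1024)

	if (IS_GET_OP(arg)) {
		if (arg->vec_len > PAGEMAP_VEC_MAX_LEN)
			return -EINVAL;
		/* kvzalloc() uses kmalloc for small sizes and falls back to
		 * vmalloc for larger ones. */
		p.vec = kvzalloc(arg->vec_len * sizeof(struct page_region), GFP_KERNEL);
		if (!p.vec)
			return -ENOMEM;
	} else {
		p.vec = NULL;
	}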

> 
> Thanks,
> Andrei
kernel test robot Nov. 11, 2022, 5:13 p.m. UTC | #4
Hi Muhammad,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on next-20221109]
[also build test ERROR on v6.1-rc4]
[cannot apply to shuah-kselftest/next shuah-kselftest/fixes linus/master v6.1-rc4 v6.1-rc3 v6.1-rc2]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
patch link:    https://lore.kernel.org/r/20221109102303.851281-3-usama.anjum%40collabora.com
patch subject: [PATCH v6 2/3] fs/proc/task_mmu: Implement IOCTL to get and/or the clear info about PTEs
config: arm-buildonly-randconfig-r006-20221111
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        # https://github.com/intel-lab-lkp/linux/commit/b329378abd03a741ff7250ec1b60292c893476da
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
        git checkout b329378abd03a741ff7250ec1b60292c893476da
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash fs/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> fs/proc/task_mmu.c:1882:41: error: use of undeclared identifier 'HPAGE_SIZE'
                           if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
                                                                ^
   1 error generated.


vim +/HPAGE_SIZE +1882 fs/proc/task_mmu.c

  1856	
  1857	static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
  1858					  unsigned long end, struct mm_walk *walk)
  1859	{
  1860		struct pagemap_scan_private *p = walk->private;
  1861		struct vm_area_struct *vma = walk->vma;
  1862		unsigned int len;
  1863		spinlock_t *ptl;
  1864		int ret = 0;
  1865		pte_t *pte;
  1866		bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
  1867				 (false) : (vma->vm_flags & VM_SOFTDIRTY);
  1868	
  1869		if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
  1870			return 0;
  1871	
  1872		end = min(end, walk->vma->vm_end);
  1873	
  1874		ptl = pmd_trans_huge_lock(pmd, vma);
  1875		if (ptl) {
  1876			if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
  1877				/*
  1878				 * Break huge page into small pages if operation needs to be performed is
  1879				 * on a portion of the huge page or the return buffer cannot store complete
  1880				 * data.
  1881				 */
> 1882				if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
  1883					spin_unlock(ptl);
  1884					split_huge_pmd(vma, pmd, addr);
  1885					goto process_smaller_pages;
  1886				}
  1887	
  1888				if (IS_GET_OP(p)) {
  1889					len = (end - addr)/PAGE_SIZE;
  1890					if (p->max_pages && p->found_pages + len > p->max_pages)
  1891						len = p->max_pages - p->found_pages;
  1892	
  1893					ret = add_to_out(dirty_vma ||
  1894							 check_soft_dirty_pmd(vma, addr, pmd, false),
  1895							 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
  1896							 p, addr, len);
  1897				}
  1898				if (!ret && IS_CLEAR_OP(p))
  1899					check_soft_dirty_pmd(vma, addr, pmd, true);
  1900			}
  1901			spin_unlock(ptl);
  1902			return 0;
  1903		}
  1904	
  1905	process_smaller_pages:
  1906		if (pmd_trans_unstable(pmd))
  1907			return 0;
  1908	
  1909		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  1910		for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
  1911		     ; pte++, addr += PAGE_SIZE) {
  1912			if (IS_GET_OP(p))
  1913				ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
  1914						 vma->vm_file, pte_present(*pte),
  1915						 is_swap_pte(*pte), p, addr, 1);
  1916			if (!ret && IS_CLEAR_OP(p))
  1917				check_soft_dirty(vma, addr, pte, true);
  1918		}
  1919		pte_unmap_unlock(pte - 1, ptl);
  1920		cond_resched();
  1921	
  1922		return 0;
  1923	}
  1924
Muhammad Usama Anjum Nov. 11, 2022, 5:53 p.m. UTC | #5
I've fixed these build failures for the next patch iteration. Please comment on
and review the patch. I'll wait for a few days before sending the next version.

On 11/11/22 10:13 PM, kernel test robot wrote:
> Hi Muhammad,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on next-20221109]
> [also build test ERROR on v6.1-rc4]
> [cannot apply to shuah-kselftest/next shuah-kselftest/fixes linus/master v6.1-rc4 v6.1-rc3 v6.1-rc2]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
> patch link:    https://lore.kernel.org/r/20221109102303.851281-3-usama.anjum%40collabora.com
> patch subject: [PATCH v6 2/3] fs/proc/task_mmu: Implement IOCTL to get and/or the clear info about PTEs
> config: arm-buildonly-randconfig-r006-20221111
> compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install arm cross compiling tool for clang build
>         # apt-get install binutils-arm-linux-gnueabi
>         # https://github.com/intel-lab-lkp/linux/commit/b329378abd03a741ff7250ec1b60292c893476da
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
>         git checkout b329378abd03a741ff7250ec1b60292c893476da
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash fs/
> 
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>>> fs/proc/task_mmu.c:1882:41: error: use of undeclared identifier 'HPAGE_SIZE'
>                            if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
>                                                                 ^
>    1 error generated.
> 
> 
> vim +/HPAGE_SIZE +1882 fs/proc/task_mmu.c
> 
>   1856	
>   1857	static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
>   1858					  unsigned long end, struct mm_walk *walk)
>   1859	{
>   1860		struct pagemap_scan_private *p = walk->private;
>   1861		struct vm_area_struct *vma = walk->vma;
>   1862		unsigned int len;
>   1863		spinlock_t *ptl;
>   1864		int ret = 0;
>   1865		pte_t *pte;
>   1866		bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
>   1867				 (false) : (vma->vm_flags & VM_SOFTDIRTY);
>   1868	
>   1869		if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
>   1870			return 0;
>   1871	
>   1872		end = min(end, walk->vma->vm_end);
>   1873	
>   1874		ptl = pmd_trans_huge_lock(pmd, vma);
>   1875		if (ptl) {
>   1876			if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
>   1877				/*
>   1878				 * Break huge page into small pages if operation needs to be performed is
>   1879				 * on a portion of the huge page or the return buffer cannot store complete
>   1880				 * data.
>   1881				 */
>> 1882				if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
>   1883					spin_unlock(ptl);
>   1884					split_huge_pmd(vma, pmd, addr);
>   1885					goto process_smaller_pages;
>   1886				}
>   1887	
>   1888				if (IS_GET_OP(p)) {
>   1889					len = (end - addr)/PAGE_SIZE;
>   1890					if (p->max_pages && p->found_pages + len > p->max_pages)
>   1891						len = p->max_pages - p->found_pages;
>   1892	
>   1893					ret = add_to_out(dirty_vma ||
>   1894							 check_soft_dirty_pmd(vma, addr, pmd, false),
>   1895							 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
>   1896							 p, addr, len);
>   1897				}
>   1898				if (!ret && IS_CLEAR_OP(p))
>   1899					check_soft_dirty_pmd(vma, addr, pmd, true);
>   1900			}
>   1901			spin_unlock(ptl);
>   1902			return 0;
>   1903		}
>   1904	
>   1905	process_smaller_pages:
>   1906		if (pmd_trans_unstable(pmd))
>   1907			return 0;
>   1908	
>   1909		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>   1910		for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
>   1911		     ; pte++, addr += PAGE_SIZE) {
>   1912			if (IS_GET_OP(p))
>   1913				ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
>   1914						 vma->vm_file, pte_present(*pte),
>   1915						 is_swap_pte(*pte), p, addr, 1);
>   1916			if (!ret && IS_CLEAR_OP(p))
>   1917				check_soft_dirty(vma, addr, pte, true);
>   1918		}
>   1919		pte_unmap_unlock(pte - 1, ptl);
>   1920		cond_resched();
>   1921	
>   1922		return 0;
>   1923	}
>   1924	
>
kernel test robot Nov. 18, 2022, 1:32 a.m. UTC | #6
Hi Muhammad,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on next-20221109]
[also build test WARNING on v6.1-rc5]
[cannot apply to shuah-kselftest/next shuah-kselftest/fixes linus/master v6.1-rc4 v6.1-rc3 v6.1-rc2]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
patch link:    https://lore.kernel.org/r/20221109102303.851281-3-usama.anjum%40collabora.com
patch subject: [PATCH v6 2/3] fs/proc/task_mmu: Implement IOCTL to get and/or the clear info about PTEs
config: i386-randconfig-s003
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.4-39-gce1a6720-dirty
        # https://github.com/intel-lab-lkp/linux/commit/b329378abd03a741ff7250ec1b60292c893476da
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
        git checkout b329378abd03a741ff7250ec1b60292c893476da
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=i386 SHELL=/bin/bash fs/proc/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

sparse warnings: (new ones prefixed by >>)
>> fs/proc/task_mmu.c:2014:39: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected void const [noderef] __user *ptr @@     got struct page_region * @@
   fs/proc/task_mmu.c:2014:39: sparse:     expected void const [noderef] __user *ptr
   fs/proc/task_mmu.c:2014:39: sparse:     got struct page_region *
>> fs/proc/task_mmu.c:2079:35: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected void [noderef] __user *to @@     got struct page_region * @@
   fs/proc/task_mmu.c:2079:35: sparse:     expected void [noderef] __user *to
   fs/proc/task_mmu.c:2079:35: sparse:     got struct page_region *
   fs/proc/task_mmu.c:631:17: sparse: sparse: context imbalance in 'smaps_pte_range' - unexpected unlock
   fs/proc/task_mmu.c:1215:28: sparse: sparse: context imbalance in 'clear_refs_pte_range' - unexpected unlock
   fs/proc/task_mmu.c:1563:28: sparse: sparse: context imbalance in 'pagemap_pmd_range' - unexpected unlock
   fs/proc/task_mmu.c:1901:28: sparse: sparse: context imbalance in 'pagemap_scan_pmd_entry' - unexpected unlock
>> fs/proc/task_mmu.c:2009:9: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2010:15: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2010:50: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2010:50: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2035:9: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2035:15: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2057:17: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2057:17: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2057:17: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2057:17: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2064:35: sparse: sparse: dereference of noderef expression
   fs/proc/task_mmu.c:2064:42: sparse: sparse: dereference of noderef expression
>> fs/proc/task_mmu.c:2014:39: sparse: sparse: non size-preserving integer to pointer cast
   fs/proc/task_mmu.c:2079:56: sparse: sparse: non size-preserving integer to pointer cast

vim +2014 fs/proc/task_mmu.c

  2001	
  2002	static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
  2003	{
  2004		struct mmu_notifier_range range;
  2005		unsigned long __user start, end;
  2006		struct pagemap_scan_private p;
  2007		int ret;
  2008	
> 2009		start = (unsigned long)untagged_addr(arg->start);
  2010		if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
  2011			return -EINVAL;
  2012	
  2013		if (IS_GET_OP(arg) &&
> 2014		    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
  2015			return -ENOMEM;
  2016	
  2017	#ifndef CONFIG_MEM_SOFT_DIRTY
  2018		if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
  2019		    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
  2020			return -EINVAL;
  2021	#endif
  2022	
  2023		if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
  2024		    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
  2025		    (arg->return_mask & ~PAGEMAP_OP_MASK))
  2026			return -EINVAL;
  2027	
  2028		if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
  2029			return -EINVAL;
  2030	
  2031		if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
  2032		     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
  2033			return -EINVAL;
  2034	
  2035		end = start + arg->len;
  2036		p.max_pages = arg->max_pages;
  2037		p.found_pages = 0;
  2038		p.flags = arg->flags;
  2039		p.required_mask = arg->required_mask;
  2040		p.anyof_mask = arg->anyof_mask;
  2041		p.excluded_mask = arg->excluded_mask;
  2042		p.return_mask = arg->return_mask;
  2043		p.vec_index = 0;
  2044		p.vec_len = arg->vec_len;
  2045	
  2046		if (IS_GET_OP(arg)) {
  2047			p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
  2048			if (!p.vec)
  2049				return -ENOMEM;
  2050		} else {
  2051			p.vec = NULL;
  2052		}
  2053	
  2054		if (IS_CLEAR_OP(arg)) {
  2055			mmap_write_lock(mm);
  2056	
  2057			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL, mm, start, end);
  2058			mmu_notifier_invalidate_range_start(&range);
  2059			inc_tlb_flush_pending(mm);
  2060		} else {
  2061			mmap_read_lock(mm);
  2062		}
  2063	
  2064		ret = walk_page_range(mm, start, end, &pagemap_scan_ops, &p);
  2065	
  2066		if (IS_CLEAR_OP(arg)) {
  2067			mmu_notifier_invalidate_range_end(&range);
  2068			dec_tlb_flush_pending(mm);
  2069	
  2070			mmap_write_unlock(mm);
  2071		} else {
  2072			mmap_read_unlock(mm);
  2073		}
  2074	
  2075		if (ret < 0)
  2076			goto free_data;
  2077	
  2078		if (IS_GET_OP(arg) && p.vec_index) {
> 2079			if (copy_to_user((struct page_region *)arg->vec, p.vec,
  2080					 p.vec_index * sizeof(struct page_region))) {
  2081				ret = -EFAULT;
  2082				goto free_data;
  2083			}
  2084			ret = p.vec_index;
  2085		} else {
  2086			ret = 0;
  2087		}
  2088	
  2089	free_data:
  2090		if (IS_GET_OP(arg))
  2091			vfree(p.vec);
  2092	
  2093		return ret;
  2094	}
  2095
Cyrill Gorcunov Dec. 12, 2022, 8:42 p.m. UTC | #7
On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
...
> +
> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
> +{
> +	struct mmu_notifier_range range;
> +	unsigned long __user start, end;
> +	struct pagemap_scan_private p;
> +	int ret;
> +
> +	start = (unsigned long)untagged_addr(arg->start);
> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
> +		return -EINVAL;
> +
> +	if (IS_GET_OP(arg) &&
> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
> +		return -ENOMEM;
> +
> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
> +		return -EINVAL;
> +
> +	end = start + arg->len;
> +	p.max_pages = arg->max_pages;
> +	p.found_pages = 0;
> +	p.flags = arg->flags;
> +	p.required_mask = arg->required_mask;
> +	p.anyof_mask = arg->anyof_mask;
> +	p.excluded_mask = arg->excluded_mask;
> +	p.return_mask = arg->return_mask;
> +	p.vec_index = 0;
> +	p.vec_len = arg->vec_len;
> +
> +	if (IS_GET_OP(arg)) {
> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
> +		if (!p.vec)
> +			return -ENOMEM;
> +	} else {
> +		p.vec = NULL;
> +	}

Hi Muhammad! I'm really sorry for diving in so late (unfortunately too busy to
step in yet). Anyway, while in general such an interface looks reasonable, here
are a few moments which really bother me: as far as I understand you don't need
vzalloc here, plain vmalloc should work as well since you copy only the filled
results back to userspace. Next -- there is no restriction on the vec_len
parameter; isn't that a door for DoS from userspace? Say I could start a number
of ioctls on the same pagemap and try to allocate a very big amount of vec_len
in total, causing big pressure on the kernel's memory. Or am I missing something
obvious here?
Muhammad Usama Anjum Dec. 13, 2022, 1:04 p.m. UTC | #8
On 12/13/22 1:42 AM, Cyrill Gorcunov wrote:
> On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
> ...
>> +
>> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
>> +{
>> +	struct mmu_notifier_range range;
>> +	unsigned long __user start, end;
>> +	struct pagemap_scan_private p;
>> +	int ret;
>> +
>> +	start = (unsigned long)untagged_addr(arg->start);
>> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
>> +		return -EINVAL;
>> +
>> +	if (IS_GET_OP(arg) &&
>> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
>> +		return -ENOMEM;
>> +
>> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
>> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
>> +		return -EINVAL;
>> +
>> +	end = start + arg->len;
>> +	p.max_pages = arg->max_pages;
>> +	p.found_pages = 0;
>> +	p.flags = arg->flags;
>> +	p.required_mask = arg->required_mask;
>> +	p.anyof_mask = arg->anyof_mask;
>> +	p.excluded_mask = arg->excluded_mask;
>> +	p.return_mask = arg->return_mask;
>> +	p.vec_index = 0;
>> +	p.vec_len = arg->vec_len;
>> +
>> +	if (IS_GET_OP(arg)) {
>> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
>> +		if (!p.vec)
>> +			return -ENOMEM;
>> +	} else {
>> +		p.vec = NULL;
>> +	}
> 
> Hi Muhammad! I'm really sorry for diving in so late (unfortunately too busy to
> step in yet). Anyway, while in general such an interface looks reasonable, here
> are a few moments which really bother me: as far as I understand you don't need
> vzalloc here, plain vmalloc should work as well since you copy only the filled
> results back to userspace.

Thank you for reviewing. Correct, I'll update to use vmalloc.

> Next -- there is no restriction on the vec_len
> parameter; isn't that a door for DoS from userspace? Say I could start a number
> of ioctls on the same pagemap and try to allocate a very big amount of vec_len
> in total, causing big pressure on the kernel's memory. Or am I missing something
> obvious here?
Yes, there is a chance that a large chunk of kernel memory can get
allocated here as vec_len can be very large. We need to think about limiting
this buffer in the current implementation. Any reasonable limit should
work; I'm not sure what the reasonable limit would be. Maybe a couple of
hundred MBs? I'll think about it. Or I should update the implementation
such that a smaller intermediate buffer can be used, like mincore does.
But this can complicate the implementation further as we are already using
page ranges instead of keeping just the flags. I'll see what can be done.
Cyrill Gorcunov Dec. 13, 2022, 10:22 p.m. UTC | #9
On Tue, Dec 13, 2022 at 06:04:04PM +0500, Muhammad Usama Anjum wrote:
> > Hi Muhammad! I'm really sorry for diving in so late (unfortunately too busy to
> > step in yet). Anyway, while in general such an interface looks reasonable, here
> > are a few moments which really bother me: as far as I understand you don't need
> > vzalloc here, plain vmalloc should work as well since you copy only the filled
> > results back to userspace.
>
> Thank you for reviewing. Correct, I'll update to use vmalloc.
> 
> > Next -- there is no restriction on the vec_len
> > parameter; isn't that a door for DoS from userspace? Say I could start a number
> > of ioctls on the same pagemap and try to allocate a very big amount of vec_len
> > in total, causing big pressure on the kernel's memory. Or am I missing something
> > obvious here?
>
> Yes, there is a chance that a large chunk of kernel memory can get
> allocated here as vec_len can be very large. We need to think about limiting
> this buffer in the current implementation. Any reasonable limit should
> work; I'm not sure what the reasonable limit would be. Maybe a couple of
> hundred MBs? I'll think about it. Or I should update the implementation
> such that a smaller intermediate buffer can be used, like mincore does.
> But this can complicate the implementation further as we are already using
> page ranges instead of keeping just the flags. I'll see what can be done.

You know, I'm not yet convinced about the overall design. This is new uapi which
should be reviewed very, very carefully; once merged in we can't step back and
will have to live with it forever. As to buffer size: look at how pagemap_read
is implemented, it allocates a PAGEMAP_WALK_SIZE buffer array to gather results,
then copies it back to userspace. If the main idea is to be able to walk over the
memory of a process with the mm context locked, it still doesn't bring much
benefit, because once the ioctl is complete the state of the mm can change, so
precise results are only possible if the target process is not running.
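
For reference, that pattern looks roughly like this (scan_window() and
flush_to_user() are made-up names for illustration; only PAGEMAP_WALK_SIZE
exists today):

	/* Sketch of the bounded-buffer pattern pagemap_read uses: walk a
	 * fixed-size window, copy the results to userspace, then advance. */
	while (start < end) {
		unsigned long chunk_end = min(start + PAGEMAP_WALK_SIZE, end);
		int filled = scan_window(mm, start, chunk_end, buf);

		if (filled < 0)
			return filled;
		if (flush_to_user(uvec, buf, filled))
			return -EFAULT;
		start = chunk_end;
	}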

Maybe all of these aspects have been discussed already; I probably need to read
all the previous conversations first :)

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8235c536ac70..8d6a84ec5ef7 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,9 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <uapi/linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/minmax.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -1775,11 +1778,336 @@  static int pagemap_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#define PAGEMAP_OP_MASK		(PAGE_IS_SOFTDIRTY | PAGE_IS_FILE |	\
+				 PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
+#define PAGEMAP_NONSD_OP_MASK	(PAGE_IS_FILE |	PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
+#define PAGEMAP_SD_FLAGS	(PAGEMAP_SOFTDIRTY_CLEAR | PAGEMAP_NO_REUSED_REGIONS)
+#define IS_CLEAR_OP(a)		(a->flags & PAGEMAP_SOFTDIRTY_CLEAR)
+#define IS_GET_OP(a)		(a->vec)
+#define IS_SD_OP(a)		(a->flags & PAGEMAP_SD_FLAGS)
+
+struct pagemap_scan_private {
+	struct page_region *vec;
+	unsigned long vec_len;
+	unsigned long vec_index;
+	unsigned int max_pages;
+	unsigned int found_pages;
+	unsigned int flags;
+	unsigned long required_mask;
+	unsigned long anyof_mask;
+	unsigned long excluded_mask;
+	unsigned long return_mask;
+};
+
+static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (IS_GET_OP(p) && p->max_pages && (p->found_pages == p->max_pages))
+		return -1;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	return 0;
+}
+
+static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
+		      unsigned long addr, unsigned int len)
+{
+	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;
+	bool cpy = true;
+
+	if (p->required_mask)
+		cpy = ((p->required_mask & cur) == p->required_mask);
+	if (cpy && p->anyof_mask)
+		cpy = (p->anyof_mask & cur);
+	if (cpy && p->excluded_mask)
+		cpy = !(p->excluded_mask & cur);
+
+	bitmap = cur & p->return_mask;
+
+	if (cpy && bitmap) {
+		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
+		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
+		     addr)) {
+			p->vec[p->vec_index - 1].len += len;
+			p->found_pages += len;
+		} else if (p->vec_index < p->vec_len) {
+			p->vec[p->vec_index].start = addr;
+			p->vec[p->vec_index].len = len;
+			p->found_pages += len;
+			p->vec[p->vec_index].bitmap = bitmap;
+			p->vec_index++;
+		} else {
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned int len;
+	spinlock_t *ptl;
+	int ret = 0;
+	pte_t *pte;
+	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
+			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
+
+	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
+		return 0;
+
+	end = min(end, walk->vma->vm_end);
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
+			/*
+			 * Break the huge page into small pages if the operation to be performed
+			 * is on a portion of the huge page or the return buffer cannot store
+			 * the complete data.
+			 */
+			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
+				spin_unlock(ptl);
+				split_huge_pmd(vma, pmd, addr);
+				goto process_smaller_pages;
+			}
+
+			if (IS_GET_OP(p)) {
+				len = (end - addr)/PAGE_SIZE;
+				if (p->max_pages && p->found_pages + len > p->max_pages)
+					len = p->max_pages - p->found_pages;
+
+				ret = add_to_out(dirty_vma ||
+						 check_soft_dirty_pmd(vma, addr, pmd, false),
+						 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
+						 p, addr, len);
+			}
+			if (!ret && IS_CLEAR_OP(p))
+				check_soft_dirty_pmd(vma, addr, pmd, true);
+		}
+		spin_unlock(ptl);
+		return 0;
+	}
+
+process_smaller_pages:
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
+	     ; pte++, addr += PAGE_SIZE) {
+		if (IS_GET_OP(p))
+			ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
+					 vma->vm_file, pte_present(*pte),
+					 is_swap_pte(*pte), p, addr, 1);
+		if (!ret && IS_CLEAR_OP(p))
+			check_soft_dirty(vma, addr, pte, true);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
+				 struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned int len;
+	bool sd;
+
+	if (vma) {
+		/* Pages in this hole haven't been allocated or written yet */
+		sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
+		     (vma->vm_flags & VM_SOFTDIRTY);
+
+		len = (end - addr)/PAGE_SIZE;
+		if (p->max_pages && p->found_pages + len > p->max_pages)
+			len = p->max_pages - p->found_pages;
+
+		add_to_out(sd, vma->vm_file, false, false, p, addr, len);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long end_cut = end;
+	int ret;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		if (vma->vm_start < start) {
+			ret = split_vma(vma->vm_mm, vma, start, 1);
+			if (ret)
+				return ret;
+		}
+		/* Calculate end_cut because of max_pages */
+		if (IS_GET_OP(p) && p->max_pages)
+			end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
+
+		if (vma->vm_end > end_cut) {
+			ret = split_vma(vma->vm_mm, vma, end_cut, 0);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void pagemap_scan_post_vma(struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+		vma_set_page_prot(vma);
+	}
+}
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+	.test_walk = pagemap_scan_pmd_test_walk,
+	.pmd_entry = pagemap_scan_pmd_entry,
+	.pte_hole = pagemap_scan_pte_hole,
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	/* Only for clearing the soft-dirty (VM_SOFTDIRTY) bit on whole VMAs */
+	.pre_vma = pagemap_scan_pre_vma,
+	.post_vma = pagemap_scan_post_vma,
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+};
+
+static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
+{
+	struct mmu_notifier_range range;
+	unsigned long __user start, end;
+	struct pagemap_scan_private p;
+	int ret;
+
+	start = (unsigned long)untagged_addr(arg->start);
+	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
+		return -EINVAL;
+
+	if (IS_GET_OP(arg) &&
+	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
+		return -ENOMEM;
+
+#ifndef CONFIG_MEM_SOFT_DIRTY
+	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
+	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
+		return -EINVAL;
+#endif
+
+	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
+	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
+	    (arg->return_mask & ~PAGEMAP_OP_MASK))
+		return -EINVAL;
+
+	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
+		return -EINVAL;
+
+	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
+	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
+		return -EINVAL;
+
+	end = start + arg->len;
+	p.max_pages = arg->max_pages;
+	p.found_pages = 0;
+	p.flags = arg->flags;
+	p.required_mask = arg->required_mask;
+	p.anyof_mask = arg->anyof_mask;
+	p.excluded_mask = arg->excluded_mask;
+	p.return_mask = arg->return_mask;
+	p.vec_index = 0;
+	p.vec_len = arg->vec_len;
+
+	if (IS_GET_OP(arg)) {
+		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
+		if (!p.vec)
+			return -ENOMEM;
+	} else {
+		p.vec = NULL;
+	}
+
+	if (IS_CLEAR_OP(arg)) {
+		mmap_write_lock(mm);
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL, mm, start, end);
+		mmu_notifier_invalidate_range_start(&range);
+		inc_tlb_flush_pending(mm);
+	} else {
+		mmap_read_lock(mm);
+	}
+
+	ret = walk_page_range(mm, start, end, &pagemap_scan_ops, &p);
+
+	if (IS_CLEAR_OP(arg)) {
+		mmu_notifier_invalidate_range_end(&range);
+		dec_tlb_flush_pending(mm);
+
+		mmap_write_unlock(mm);
+	} else {
+		mmap_read_unlock(mm);
+	}
+
+	if (ret < 0)
+		goto free_data;
+
+	if (IS_GET_OP(arg) && p.vec_index) {
+		if (copy_to_user((struct page_region *)arg->vec, p.vec,
+				 p.vec_index * sizeof(struct page_region))) {
+			ret = -EFAULT;
+			goto free_data;
+		}
+		ret = p.vec_index;
+	} else {
+		ret = 0;
+	}
+
+free_data:
+	if (IS_GET_OP(arg))
+		vfree(p.vec);
+
+	return ret;
+}
+
+static long pagemap_sd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pagemap_scan_arg __user *uarg = (struct pagemap_scan_arg __user *)arg;
+	struct mm_struct *mm = file->private_data;
+	struct pagemap_scan_arg argument;
+
+	if (cmd == PAGEMAP_SCAN) {
+		if (copy_from_user(&argument, uarg, sizeof(struct pagemap_scan_arg)))
+			return -EFAULT;
+		return do_pagemap_sd_cmd(mm, &argument);
+	}
+	return -EINVAL;
+}
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
 	.open		= pagemap_open,
 	.release	= pagemap_release,
+	.unlocked_ioctl = pagemap_sd_ioctl,
+	.compat_ioctl	= pagemap_sd_ioctl,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..11d232cfc9b3 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,60 @@  typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_scan_arg */
+#define PAGE_IS_SOFTDIRTY	(1 << 0)
+#define PAGE_IS_FILE		(1 << 1)
+#define PAGE_IS_PRESENT		(1 << 2)
+#define PAGE_IS_SWAPPED		(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region (in pages)
+ * @bitmap:	Bits set for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u64 bitmap;
+};
+
+/*
+ * struct pagemap_scan_arg - Pagemap ioctl argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region in bytes (all pages in this range are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional limit on the number of pages returned (0 means no limit)
+ * @flags:		Flags for the IOCTL
+ * @required_mask:	Required mask - All of these bits have to be set in the PTE
+ * @anyof_mask:		Any-of mask - At least one of these bits has to be set in the PTE
+ * @excluded_mask:	Exclude mask - None of these bits can be set in the PTE
+ * @return_mask:	Bits that are to be reported in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u64 required_mask;
+	__u64 anyof_mask;
+	__u64 excluded_mask;
+	__u64 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SOFTDIRTY_CLEAR		(1 << 0)
+/*
+ * Depend only on the soft-dirty PTE bits of individual pages and don't check the soft-dirty
+ * bit of the VMA to decide whether a region is dirty. With this flag, newly created memory
+ * doesn't appear soft-dirty through the IOCTL until it is actually written.
+ */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index b7b56871029c..11d232cfc9b3 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -305,4 +305,60 @@  typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_scan_arg */
+#define PAGE_IS_SOFTDIRTY	(1 << 0)
+#define PAGE_IS_FILE		(1 << 1)
+#define PAGE_IS_PRESENT		(1 << 2)
+#define PAGE_IS_SWAPPED		(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region (in pages)
+ * @bitmap:	Bits set for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u64 bitmap;
+};
+
+/*
+ * struct pagemap_scan_arg - Pagemap ioctl argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region in bytes (all pages in this range are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional limit on the number of pages returned (0 means no limit)
+ * @flags:		Flags for the IOCTL
+ * @required_mask:	Required mask - All of these bits have to be set in the PTE
+ * @anyof_mask:		Any-of mask - At least one of these bits has to be set in the PTE
+ * @excluded_mask:	Exclude mask - None of these bits can be set in the PTE
+ * @return_mask:	Bits that are to be reported in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u64 required_mask;
+	__u64 anyof_mask;
+	__u64 excluded_mask;
+	__u64 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SOFTDIRTY_CLEAR		(1 << 0)
+/*
+ * Depend only on the soft-dirty PTE bits of individual pages and don't check the soft-dirty
+ * bit of the VMA to decide whether a region is dirty. With this flag, newly created memory
+ * doesn't appear soft-dirty through the IOCTL until it is actually written.
+ */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */
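
For completeness, below is a minimal userspace sketch of how this interface is expected to be
driven, assuming the uapi lands as defined above (PAGEMAP_SCAN, struct pagemap_scan_arg,
struct page_region, PAGE_IS_SOFTDIRTY). The helper name, the VEC_LEN buffer size and the use of
/proc/self/pagemap are illustrative choices, and error handling is trimmed; this is not part of
the patch.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* PAGEMAP_SCAN and friends, as added by this patch */

#define VEC_LEN 256	/* arbitrary output buffer size for this sketch */

/* Report the soft-dirty ranges inside [addr, addr + len) of the calling process. */
static int dump_soft_dirty(void *addr, unsigned long len)
{
	struct page_region vec[VEC_LEN];
	struct pagemap_scan_arg arg = {
		.start = (uintptr_t)addr,		/* must be page aligned */
		.len = len,				/* in bytes */
		.vec = (uintptr_t)vec,
		.vec_len = VEC_LEN,
		.max_pages = 0,				/* 0 == no limit */
		.flags = 0,				/* get only, no PAGEMAP_SOFTDIRTY_CLEAR */
		.required_mask = PAGE_IS_SOFTDIRTY,	/* only report soft-dirty pages */
		.return_mask = PAGE_IS_SOFTDIRTY,
	};
	int fd, n, i;

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return -1;

	/* On success the ioctl returns the number of page_region entries filled in vec. */
	n = ioctl(fd, PAGEMAP_SCAN, &arg);
	close(fd);
	if (n < 0)
		return -1;

	for (i = 0; i < n; i++)
		printf("soft-dirty: start=0x%llx pages=%llu\n",
		       (unsigned long long)vec[i].start,
		       (unsigned long long)vec[i].len);

	return n;
}

Per the patch, adding PAGEMAP_SOFTDIRTY_CLEAR to .flags with the same masks would clear the
soft-dirty bits in the same atomic pass, and PAGEMAP_NO_REUSED_REGIONS would base the decision
only on the per-page PTE bits rather than the VMA's VM_SOFTDIRTY flag.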