diff mbox series

[v6,2/3] fs/proc/task_mmu: Implement IOCTL to get and/or the clear info about PTEs

Message ID 20221109102303.851281-3-usama.anjum@collabora.com (mailing list archive)
State New
Headers show
Series Implement IOCTL to get and/or the clear info about PTEs | expand

Commit Message

Muhammad Usama Anjum Nov. 9, 2022, 10:23 a.m. UTC
This IOCTL, PAGEMAP_SCAN can be used to get and/or clear the info about
page table entries. The following operations are supported in this ioctl:
- Get the information if the pages are soft-dirty, file mapped, present
  or swapped.
- Clear the soft-dirty PTE bit of the pages.
- Get and clear the soft-dirty PTE bit of the pages.

Only the soft-dirty bit can be read and cleared atomically. struct
pagemap_sd_args is used as the argument of the IOCTL. In this struct:
- The range is specified through start and len.
- The output buffer and size is specified as vec and vec_len.
- The optional maximum requested pages are specified in the max_pages.
- The flags can be specified in the flags field. The PAGEMAP_SD_CLEAR
  and PAGEMAP_SD_NO_REUSED_REGIONS are supported.
- The masks are specified in rmask, amask, emask and return_mask.

This IOCTL can be extended to get information about more PTE bits.

This is based on a patch from Gabriel Krisman Bertazi.

Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
---
Changes in v6:
- Rename variables and update comments
- Make IOCTL independent of soft_dirty config
- Change masks and bitmap type to _u64
- Improve code quality

Changes in v5:
- Remove tlb flushing even for clear operation

Changes in v4:
- Update the interface and implementation

Changes in v3:
- Tighten the user-kernel interface by using explicit types and add more
  error checking

Changes in v2:
- Convert the interface from syscall to ioctl
- Remove pidfd support as it doesn't make sense in ioctl
---
 fs/proc/task_mmu.c            | 328 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/fs.h       |  56 ++++++
 tools/include/uapi/linux/fs.h |  56 ++++++
 3 files changed, 440 insertions(+)

Comments

Andrei Vagin Nov. 9, 2022, 11:54 p.m. UTC | #1
Hi Muhammad,

Here are a few inline comments.

On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
> This IOCTL, PAGEMAP_SCAN can be used to get and/or clear the info about
> page table entries. The following operations are supported in this ioctl:
> - Get the information if the pages are soft-dirty, file mapped, present
>   or swapped.
> - Clear the soft-dirty PTE bit of the pages.
> - Get and clear the soft-dirty PTE bit of the pages.
> 
> Only the soft-dirty bit can be read and cleared atomically. struct
> pagemap_sd_args is used as the argument of the IOCTL. In this struct:
> - The range is specified through start and len.
> - The output buffer and size is specified as vec and vec_len.
> - The optional maximum requested pages are specified in the max_pages.
> - The flags can be specified in the flags field. The PAGEMAP_SD_CLEAR
>   and PAGEMAP_SD_NO_REUSED_REGIONS are supported.
> - The masks are specified in rmask, amask, emask and return_mask.
> 
> This IOCTL can be extended to get information about more PTE bits.
> 
> This is based on a patch from Gabriel Krisman Bertazi.
> 
> Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
> ---
> Changes in v6:
> - Rename variables and update comments
> - Make IOCTL independent of soft_dirty config
> - Change masks and bitmap type to _u64
> - Improve code quality
> 
> Changes in v5:
> - Remove tlb flushing even for clear operation
> 
> Changes in v4:
> - Update the interface and implementation
> 
> Changes in v3:
> - Tighten the user-kernel interface by using explicit types and add more
>   error checking
> 
> Changes in v2:
> - Convert the interface from syscall to ioctl
> - Remove pidfd support as it doesn't make sense in ioctl
> ---
>  fs/proc/task_mmu.c            | 328 ++++++++++++++++++++++++++++++++++
>  include/uapi/linux/fs.h       |  56 ++++++
>  tools/include/uapi/linux/fs.h |  56 ++++++
>  3 files changed, 440 insertions(+)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 8235c536ac70..8d6a84ec5ef7 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -19,6 +19,9 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/uaccess.h>
>  #include <linux/pkeys.h>
> +#include <uapi/linux/fs.h>
> +#include <linux/vmalloc.h>
> +#include <linux/minmax.h>
>  
>  #include <asm/elf.h>
>  #include <asm/tlb.h>
> @@ -1775,11 +1778,336 @@ static int pagemap_release(struct inode *inode, struct file *file)
>  	return 0;
>  }
>  
> +#define PAGEMAP_OP_MASK		(PAGE_IS_SOFTDIRTY | PAGE_IS_FILE |	\
> +				 PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
> +#define PAGEMAP_NONSD_OP_MASK	(PAGE_IS_FILE |	PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
> +#define PAGEMAP_SD_FLAGS	(PAGEMAP_SOFTDIRTY_CLEAR | PAGEMAP_NO_REUSED_REGIONS)
> +#define IS_CLEAR_OP(a)		(a->flags & PAGEMAP_SOFTDIRTY_CLEAR)
> +#define IS_GET_OP(a)		(a->vec)
> +#define IS_SD_OP(a)		(a->flags & PAGEMAP_SD_FLAGS)
> +
> +struct pagemap_scan_private {
> +	struct page_region *vec;
> +	unsigned long vec_len;
> +	unsigned long vec_index;
> +	unsigned int max_pages;
> +	unsigned int found_pages;
> +	unsigned int flags;
> +	unsigned long required_mask;
> +	unsigned long anyof_mask;
> +	unsigned long excluded_mask;
> +	unsigned long return_mask;
> +};
> +
> +static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (IS_GET_OP(p) && p->max_pages && (p->found_pages == p->max_pages))
> +		return -1;
> +
> +	if (vma->vm_flags & VM_PFNMAP)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
> +		      unsigned long addr, unsigned int len)
> +{
> +	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;

Should we define contants for each of these bits?

> +	bool cpy = true;
> +
> +	if (p->required_mask)
> +		cpy = ((p->required_mask & cur) == p->required_mask);
> +	if (cpy && p->anyof_mask)
> +		cpy = (p->anyof_mask & cur);
> +	if (cpy && p->excluded_mask)
> +		cpy = !(p->excluded_mask & cur);
> +
> +	bitmap = cur & p->return_mask;
> +
> +	if (cpy && bitmap) {
> +		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
> +		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
> +		     addr)) {

I think it is better to define a variable for p->vec_index - 1.
nit: len can be in bytes rather than pages.

> +			p->vec[p->vec_index - 1].len += len;
> +			p->found_pages += len;
> +		} else if (p->vec_index < p->vec_len) {
> +			p->vec[p->vec_index].start = addr;
> +			p->vec[p->vec_index].len = len;
> +			p->found_pages += len;
> +			p->vec[p->vec_index].bitmap = bitmap;
> +			p->vec_index++;
> +		} else {
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
> +				  unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned int len;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +	pte_t *pte;
> +	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
> +			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
> +
> +	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
> +		return 0;
> +
> +	end = min(end, walk->vma->vm_end);
> +
> +	ptl = pmd_trans_huge_lock(pmd, vma);
> +	if (ptl) {
> +		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
> +			/*
> +			 * Break huge page into small pages if operation needs to be performed is
> +			 * on a portion of the huge page or the return buffer cannot store complete
> +			 * data.
> +			 */
> +			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
> +				spin_unlock(ptl);
> +				split_huge_pmd(vma, pmd, addr);
> +				goto process_smaller_pages;
> +			}
> +
> +			if (IS_GET_OP(p)) {
> +				len = (end - addr)/PAGE_SIZE;
> +				if (p->max_pages && p->found_pages + len > p->max_pages)
> +					len = p->max_pages - p->found_pages;
> +
> +				ret = add_to_out(dirty_vma ||
> +						 check_soft_dirty_pmd(vma, addr, pmd, false),

can we reuse a return code of the previous call of check_soft_dirty_pmd?

> +						 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
> +						 p, addr, len);
> +			}
> +			if (!ret && IS_CLEAR_OP(p))
> +				check_soft_dirty_pmd(vma, addr, pmd, true);

should we return a error in this case? We need to be sure that:
* we stop waking page tables after this point.
* return this error to the user-space if we are not able to add anything
  in the vector.

> +		}
> +		spin_unlock(ptl);
> +		return 0;
> +	}
> +
> +process_smaller_pages:
> +	if (pmd_trans_unstable(pmd))
> +		return 0;
> +
> +	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +	for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
> +	     ; pte++, addr += PAGE_SIZE) {
> +		if (IS_GET_OP(p))
> +			ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
> +					 vma->vm_file, pte_present(*pte),
> +					 is_swap_pte(*pte), p, addr, 1);
> +		if (!ret && IS_CLEAR_OP(p))
> +			check_soft_dirty(vma, addr, pte, true);
> +	}
> +	pte_unmap_unlock(pte - 1, ptl);
> +	cond_resched();
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
> +				 struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned int len;
> +	bool sd;
> +
> +	if (vma) {
> +		/* Individual pages haven't been allocated and written */
> +		sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
> +		     (vma->vm_flags & VM_SOFTDIRTY);
> +
> +		len = (end - addr)/PAGE_SIZE;
> +		if (p->max_pages && p->found_pages + len > p->max_pages)
> +			len = p->max_pages - p->found_pages;
> +
> +		add_to_out(sd, vma->vm_file, false, false, p, addr, len);
> +	}
> +
> +	return 0;
> +}
> +
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned long end_cut = end;
> +	int ret;
> +
> +	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
> +	    (vma->vm_flags & VM_SOFTDIRTY)) {
> +		if (vma->vm_start < start) {
> +			ret = split_vma(vma->vm_mm, vma, start, 1);
> +			if (ret)
> +				return ret;
> +		}
> +		/* Calculate end_cut because of max_pages */
> +		if (IS_GET_OP(p) && p->max_pages)
> +			end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
> +
> +		if (vma->vm_end > end_cut) {
> +			ret = split_vma(vma->vm_mm, vma, end_cut, 0);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void pagemap_scan_post_vma(struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
> +	    (vma->vm_flags & VM_SOFTDIRTY)) {
> +		vma->vm_flags &= ~VM_SOFTDIRTY;
> +		vma_set_page_prot(vma);
> +	}
> +}
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +
> +static const struct mm_walk_ops pagemap_scan_ops = {
> +	.test_walk = pagemap_scan_pmd_test_walk,
> +	.pmd_entry = pagemap_scan_pmd_entry,
> +	.pte_hole = pagemap_scan_pte_hole,
> +
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +	/* Only for clearing SD bit over VMAs */
> +	.pre_vma = pagemap_scan_pre_vma,
> +	.post_vma = pagemap_scan_post_vma,
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +};
> +
> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
> +{
> +	struct mmu_notifier_range range;
> +	unsigned long __user start, end;
> +	struct pagemap_scan_private p;
> +	int ret;
> +
> +	start = (unsigned long)untagged_addr(arg->start);
> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
> +		return -EINVAL;
> +
> +	if (IS_GET_OP(arg) &&
> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
> +		return -ENOMEM;
> +
> +#ifndef CONFIG_MEM_SOFT_DIRTY
> +	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
> +	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
> +		return -EINVAL;
> +#endif
> +
> +	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
> +	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
> +	    (arg->return_mask & ~PAGEMAP_OP_MASK))
> +		return -EINVAL;
> +
> +	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
> +		return -EINVAL;
> +
> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
> +		return -EINVAL;
> +
> +	end = start + arg->len;
> +	p.max_pages = arg->max_pages;
> +	p.found_pages = 0;
> +	p.flags = arg->flags;
> +	p.required_mask = arg->required_mask;
> +	p.anyof_mask = arg->anyof_mask;
> +	p.excluded_mask = arg->excluded_mask;
> +	p.return_mask = arg->return_mask;
> +	p.vec_index = 0;
> +	p.vec_len = arg->vec_len;
> +
> +	if (IS_GET_OP(arg)) {
> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));

I think we need to set a reasonable limit for vec_len to avoid large
allocations on the kernel. We can consider to use kmalloc or kvmalloc
here.

Thanks,
Andrei
Muhammad Usama Anjum Nov. 11, 2022, 10:10 a.m. UTC | #2
Hello Andrei,

Thank you for reviewing.

On 11/10/22 4:54 AM, Andrei Vagin wrote:
[...]
>> +static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
>> +		      unsigned long addr, unsigned int len)
>> +{
>> +	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;
> 
> Should we define contants for each of these bits?
I think I can define a macro to hide this dirty bit shifting in the function.

> 
>> +	bool cpy = true;
>> +
>> +	if (p->required_mask)
>> +		cpy = ((p->required_mask & cur) == p->required_mask);
>> +	if (cpy && p->anyof_mask)
>> +		cpy = (p->anyof_mask & cur);
>> +	if (cpy && p->excluded_mask)
>> +		cpy = !(p->excluded_mask & cur);
>> +
>> +	bitmap = cur & p->return_mask;
>> +
>> +	if (cpy && bitmap) {
>> +		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
>> +		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
>> +		     addr)) {
> 
> I think it is better to define a variable for p->vec_index - 1.
Will do in the next revision.

> nit: len can be in bytes rather than pages.
We are considering memory in the page units. The memory given to this IOCTL
must have PAGE_SIZE alignment. Oterwise we error out (picked this from
mincore()).

>> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
>> +				  unsigned long end, struct mm_walk *walk)
>> +{
>> +	struct pagemap_scan_private *p = walk->private;
>> +	struct vm_area_struct *vma = walk->vma;
>> +	unsigned int len;
>> +	spinlock_t *ptl;
>> +	int ret = 0;
>> +	pte_t *pte;
>> +	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
>> +			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
>> +
>> +	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
>> +		return 0;
>> +
>> +	end = min(end, walk->vma->vm_end);
>> +
>> +	ptl = pmd_trans_huge_lock(pmd, vma);
>> +	if (ptl) {
>> +		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
>> +			/*
>> +			 * Break huge page into small pages if operation needs to be performed is
>> +			 * on a portion of the huge page or the return buffer cannot store complete
>> +			 * data.
>> +			 */
>> +			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
>> +				spin_unlock(ptl);
>> +				split_huge_pmd(vma, pmd, addr);
>> +				goto process_smaller_pages;
>> +			}
>> +
>> +			if (IS_GET_OP(p)) {
>> +				len = (end - addr)/PAGE_SIZE;
>> +				if (p->max_pages && p->found_pages + len > p->max_pages)
>> +					len = p->max_pages - p->found_pages;
>> +
>> +				ret = add_to_out(dirty_vma ||
>> +						 check_soft_dirty_pmd(vma, addr, pmd, false),
> 
> can we reuse a return code of the previous call of check_soft_dirty_pmd?
Yes, will do.

> 
>> +						 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
>> +						 p, addr, len);
>> +			}
>> +			if (!ret && IS_CLEAR_OP(p))
>> +				check_soft_dirty_pmd(vma, addr, pmd, true);
> 
> should we return a error in this case? We need to be sure that:
> * we stop waking page tables after this point.
I'll update the implementation to return error. It immediately terminates
the walk as well.
> * return this error to the user-space if we are not able to add anything
>   in the vector.
I'm not returning error to userspace if we found no page matching the
masks. The total number of filled page_region are returned from the IOCTL.
If IOCTL returns 0, it means no page found has found, but the IOCTL
executed successfully.

[...]
>> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
>> +{
>> +	struct mmu_notifier_range range;
>> +	unsigned long __user start, end;
>> +	struct pagemap_scan_private p;
>> +	int ret;
>> +
>> +	start = (unsigned long)untagged_addr(arg->start);
>> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
>> +		return -EINVAL;
>> +
>> +	if (IS_GET_OP(arg) &&
>> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
>> +		return -ENOMEM;
>> +
>> +#ifndef CONFIG_MEM_SOFT_DIRTY
>> +	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
>> +	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
>> +		return -EINVAL;
>> +#endif
>> +
>> +	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
>> +	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
>> +	    (arg->return_mask & ~PAGEMAP_OP_MASK))
>> +		return -EINVAL;
>> +
>> +	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
>> +		return -EINVAL;
>> +
>> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
>> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
>> +		return -EINVAL;
>> +
>> +	end = start + arg->len;
>> +	p.max_pages = arg->max_pages;
>> +	p.found_pages = 0;
>> +	p.flags = arg->flags;
>> +	p.required_mask = arg->required_mask;
>> +	p.anyof_mask = arg->anyof_mask;
>> +	p.excluded_mask = arg->excluded_mask;
>> +	p.return_mask = arg->return_mask;
>> +	p.vec_index = 0;
>> +	p.vec_len = arg->vec_len;
>> +
>> +	if (IS_GET_OP(arg)) {
>> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
> 
> I think we need to set a reasonable limit for vec_len to avoid large
> allocations on the kernel. We can consider to use kmalloc or kvmalloc
> here.
I'll update to kvzalloc which uses vmalloc if kmalloc fails. It'll use
kmalloc for smaller allocations. Thanks for suggesting it. But it'll not
limit the memory allocation.

> 
> Thanks,
> Andrei
Muhammad Usama Anjum Nov. 11, 2022, 5:53 p.m. UTC | #3
I've fixed these build failure for the next patch iteration. Please comment
and review on the patch. I'll wait for a few days before sending next version.

On 11/11/22 10:13 PM, kernel test robot wrote:
> Hi Muhammad,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on next-20221109]
> [also build test ERROR on v6.1-rc4]
> [cannot apply to shuah-kselftest/next shuah-kselftest/fixes linus/master v6.1-rc4 v6.1-rc3 v6.1-rc2]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
> patch link:    https://lore.kernel.org/r/20221109102303.851281-3-usama.anjum%40collabora.com
> patch subject: [PATCH v6 2/3] fs/proc/task_mmu: Implement IOCTL to get and/or the clear info about PTEs
> config: arm-buildonly-randconfig-r006-20221111
> compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install arm cross compiling tool for clang build
>         # apt-get install binutils-arm-linux-gnueabi
>         # https://github.com/intel-lab-lkp/linux/commit/b329378abd03a741ff7250ec1b60292c893476da
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Muhammad-Usama-Anjum/Implement-IOCTL-to-get-and-or-the-clear-info-about-PTEs/20221109-182618
>         git checkout b329378abd03a741ff7250ec1b60292c893476da
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash fs/
> 
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>>> fs/proc/task_mmu.c:1882:41: error: use of undeclared identifier 'HPAGE_SIZE'
>                            if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
>                                                                 ^
>    1 error generated.
> 
> 
> vim +/HPAGE_SIZE +1882 fs/proc/task_mmu.c
> 
>   1856	
>   1857	static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
>   1858					  unsigned long end, struct mm_walk *walk)
>   1859	{
>   1860		struct pagemap_scan_private *p = walk->private;
>   1861		struct vm_area_struct *vma = walk->vma;
>   1862		unsigned int len;
>   1863		spinlock_t *ptl;
>   1864		int ret = 0;
>   1865		pte_t *pte;
>   1866		bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
>   1867				 (false) : (vma->vm_flags & VM_SOFTDIRTY);
>   1868	
>   1869		if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
>   1870			return 0;
>   1871	
>   1872		end = min(end, walk->vma->vm_end);
>   1873	
>   1874		ptl = pmd_trans_huge_lock(pmd, vma);
>   1875		if (ptl) {
>   1876			if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
>   1877				/*
>   1878				 * Break huge page into small pages if operation needs to be performed is
>   1879				 * on a portion of the huge page or the return buffer cannot store complete
>   1880				 * data.
>   1881				 */
>> 1882				if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
>   1883					spin_unlock(ptl);
>   1884					split_huge_pmd(vma, pmd, addr);
>   1885					goto process_smaller_pages;
>   1886				}
>   1887	
>   1888				if (IS_GET_OP(p)) {
>   1889					len = (end - addr)/PAGE_SIZE;
>   1890					if (p->max_pages && p->found_pages + len > p->max_pages)
>   1891						len = p->max_pages - p->found_pages;
>   1892	
>   1893					ret = add_to_out(dirty_vma ||
>   1894							 check_soft_dirty_pmd(vma, addr, pmd, false),
>   1895							 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
>   1896							 p, addr, len);
>   1897				}
>   1898				if (!ret && IS_CLEAR_OP(p))
>   1899					check_soft_dirty_pmd(vma, addr, pmd, true);
>   1900			}
>   1901			spin_unlock(ptl);
>   1902			return 0;
>   1903		}
>   1904	
>   1905	process_smaller_pages:
>   1906		if (pmd_trans_unstable(pmd))
>   1907			return 0;
>   1908	
>   1909		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>   1910		for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
>   1911		     ; pte++, addr += PAGE_SIZE) {
>   1912			if (IS_GET_OP(p))
>   1913				ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
>   1914						 vma->vm_file, pte_present(*pte),
>   1915						 is_swap_pte(*pte), p, addr, 1);
>   1916			if (!ret && IS_CLEAR_OP(p))
>   1917				check_soft_dirty(vma, addr, pte, true);
>   1918		}
>   1919		pte_unmap_unlock(pte - 1, ptl);
>   1920		cond_resched();
>   1921	
>   1922		return 0;
>   1923	}
>   1924	
>
Cyrill Gorcunov Dec. 12, 2022, 8:42 p.m. UTC | #4
On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
...
> +
> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
> +{
> +	struct mmu_notifier_range range;
> +	unsigned long __user start, end;
> +	struct pagemap_scan_private p;
> +	int ret;
> +
> +	start = (unsigned long)untagged_addr(arg->start);
> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
> +		return -EINVAL;
> +
> +	if (IS_GET_OP(arg) &&
> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
> +		return -ENOMEM;
> +
> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
> +		return -EINVAL;
> +
> +	end = start + arg->len;
> +	p.max_pages = arg->max_pages;
> +	p.found_pages = 0;
> +	p.flags = arg->flags;
> +	p.required_mask = arg->required_mask;
> +	p.anyof_mask = arg->anyof_mask;
> +	p.excluded_mask = arg->excluded_mask;
> +	p.return_mask = arg->return_mask;
> +	p.vec_index = 0;
> +	p.vec_len = arg->vec_len;
> +
> +	if (IS_GET_OP(arg)) {
> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
> +		if (!p.vec)
> +			return -ENOMEM;
> +	} else {
> +		p.vec = NULL;
> +	}

Hi Muhammad! I'm really sorry for diving in such late (unfortunatelly too busy to
step in yet). Anyway, while in general such interface looks reasonable here are
few moments which really bothers me: as far as I undertstand you don't need
vzalloc here, plain vmalloc should works as well since you copy only filled
results back to userspace. Next -- there is no restriction on vec_len parameter,
is not here a door for DoS from userspace? Say I could start a number of ioctl
on same pagemap and try to allocate very big amount of vec_len in summay causing
big pressure on kernel's memory. Or I miss something obvious here?
Muhammad Usama Anjum Dec. 13, 2022, 1:04 p.m. UTC | #5
On 12/13/22 1:42 AM, Cyrill Gorcunov wrote:
> On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
> ...
>> +
>> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
>> +{
>> +	struct mmu_notifier_range range;
>> +	unsigned long __user start, end;
>> +	struct pagemap_scan_private p;
>> +	int ret;
>> +
>> +	start = (unsigned long)untagged_addr(arg->start);
>> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
>> +		return -EINVAL;
>> +
>> +	if (IS_GET_OP(arg) &&
>> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
>> +		return -ENOMEM;
>> +
>> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
>> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
>> +		return -EINVAL;
>> +
>> +	end = start + arg->len;
>> +	p.max_pages = arg->max_pages;
>> +	p.found_pages = 0;
>> +	p.flags = arg->flags;
>> +	p.required_mask = arg->required_mask;
>> +	p.anyof_mask = arg->anyof_mask;
>> +	p.excluded_mask = arg->excluded_mask;
>> +	p.return_mask = arg->return_mask;
>> +	p.vec_index = 0;
>> +	p.vec_len = arg->vec_len;
>> +
>> +	if (IS_GET_OP(arg)) {
>> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
>> +		if (!p.vec)
>> +			return -ENOMEM;
>> +	} else {
>> +		p.vec = NULL;
>> +	}
> 
> Hi Muhammad! I'm really sorry for diving in such late (unfortunatelly too busy to
> step in yet). Anyway, while in general such interface looks reasonable here are
> few moments which really bothers me: as far as I undertstand you don't need
> vzalloc here, plain vmalloc should works as well since you copy only filled
> results back to userspace. Thank you for reviewing. Correct, I'll update to use vmalloc.

> Next -- there is no restriction on vec_len parameter,
> is not here a door for DoS from userspace? Say I could start a number of ioctl
> on same pagemap and try to allocate very big amount of vec_len in summay causing
> big pressure on kernel's memory. Or I miss something obvious here?
Yes, there is a chance that a large chunk of kernel memory can get
allocated here as vec_len can be very large. We need to think of limiting
this buffer in the current implementation. Any reasonable limit should
work. I'm not sure what would be the reasonable limit. Maybe couple of
hundred MBs? I'll think about it. Or I should update the implementation
such that less amount of intermediate buffer can be used like mincore does.
But this can complicate the implementation further as we are already using
page ranges instead of keeping just the flags. I'll see what can be done.
Cyrill Gorcunov Dec. 13, 2022, 10:22 p.m. UTC | #6
On Tue, Dec 13, 2022 at 06:04:04PM +0500, Muhammad Usama Anjum wrote:
> > Hi Muhammad! I'm really sorry for diving in such late (unfortunatelly too busy to
> > step in yet). Anyway, while in general such interface looks reasonable here are
> > few moments which really bothers me: as far as I undertstand you don't need
> > vzalloc here, plain vmalloc should works as well since you copy only filled
> > results back to userspace.
>
> Thank you for reviewing. Correct, I'll update to use vmalloc.
> 
> > Next -- there is no restriction on vec_len parameter,
> > is not here a door for DoS from userspace? Say I could start a number of ioctl
> > on same pagemap and try to allocate very big amount of vec_len in summay causing
> > big pressure on kernel's memory. Or I miss something obvious here?
>
> Yes, there is a chance that a large chunk of kernel memory can get
> allocated here as vec_len can be very large. We need to think of limiting
> this buffer in the current implementation. Any reasonable limit should
> work. I'm not sure what would be the reasonable limit. Maybe couple of
> hundred MBs? I'll think about it. Or I should update the implementation
> such that less amount of intermediate buffer can be used like mincore does.
> But this can complicate the implementation further as we are already using
> page ranges instead of keeping just the flags. I'll see what can be done.

You know, I'm not yet convinced about overall design. This is new uapi which
should be reviewed very very carefully, once merged in we can't step back and
will have to live with it forever. As to buffer size: look how pagemap_read
is implemented, it allocates PAGEMAP_WALK_SIZE buffer array to gather results
then copies it back to userspace. If the main idea to be able to walk over
memory of a process with mm context locked it still doesn't bring much benefit
because once ioctl is complete the state of mm can be changed so precise results
are only possible if target process is not running.

Maybe all of these aspects are been discussed already I probably need to read
all previous converstaions first :)
diff mbox series

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8235c536ac70..8d6a84ec5ef7 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,9 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <uapi/linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/minmax.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -1775,11 +1778,336 @@  static int pagemap_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#define PAGEMAP_OP_MASK		(PAGE_IS_SOFTDIRTY | PAGE_IS_FILE |	\
+				 PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
+#define PAGEMAP_NONSD_OP_MASK	(PAGE_IS_FILE |	PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
+#define PAGEMAP_SD_FLAGS	(PAGEMAP_SOFTDIRTY_CLEAR | PAGEMAP_NO_REUSED_REGIONS)
+#define IS_CLEAR_OP(a)		(a->flags & PAGEMAP_SOFTDIRTY_CLEAR)
+#define IS_GET_OP(a)		(a->vec)
+#define IS_SD_OP(a)		(a->flags & PAGEMAP_SD_FLAGS)
+
+struct pagemap_scan_private {
+	struct page_region *vec;
+	unsigned long vec_len;
+	unsigned long vec_index;
+	unsigned int max_pages;
+	unsigned int found_pages;
+	unsigned int flags;
+	unsigned long required_mask;
+	unsigned long anyof_mask;
+	unsigned long excluded_mask;
+	unsigned long return_mask;
+};
+
+static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (IS_GET_OP(p) && p->max_pages && (p->found_pages == p->max_pages))
+		return -1;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	return 0;
+}
+
+static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
+		      unsigned long addr, unsigned int len)
+{
+	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;
+	bool cpy = true;
+
+	if (p->required_mask)
+		cpy = ((p->required_mask & cur) == p->required_mask);
+	if (cpy && p->anyof_mask)
+		cpy = (p->anyof_mask & cur);
+	if (cpy && p->excluded_mask)
+		cpy = !(p->excluded_mask & cur);
+
+	bitmap = cur & p->return_mask;
+
+	if (cpy && bitmap) {
+		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
+		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
+		     addr)) {
+			p->vec[p->vec_index - 1].len += len;
+			p->found_pages += len;
+		} else if (p->vec_index < p->vec_len) {
+			p->vec[p->vec_index].start = addr;
+			p->vec[p->vec_index].len = len;
+			p->found_pages += len;
+			p->vec[p->vec_index].bitmap = bitmap;
+			p->vec_index++;
+		} else {
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned int len;
+	spinlock_t *ptl;
+	int ret = 0;
+	pte_t *pte;
+	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
+			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
+
+	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
+		return 0;
+
+	end = min(end, walk->vma->vm_end);
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
+			/*
+			 * Break huge page into small pages if operation needs to be performed is
+			 * on a portion of the huge page or the return buffer cannot store complete
+			 * data.
+			 */
+			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
+				spin_unlock(ptl);
+				split_huge_pmd(vma, pmd, addr);
+				goto process_smaller_pages;
+			}
+
+			if (IS_GET_OP(p)) {
+				len = (end - addr)/PAGE_SIZE;
+				if (p->max_pages && p->found_pages + len > p->max_pages)
+					len = p->max_pages - p->found_pages;
+
+				ret = add_to_out(dirty_vma ||
+						 check_soft_dirty_pmd(vma, addr, pmd, false),
+						 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
+						 p, addr, len);
+			}
+			if (!ret && IS_CLEAR_OP(p))
+				check_soft_dirty_pmd(vma, addr, pmd, true);
+		}
+		spin_unlock(ptl);
+		return 0;
+	}
+
+process_smaller_pages:
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
+	     ; pte++, addr += PAGE_SIZE) {
+		if (IS_GET_OP(p))
+			ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
+					 vma->vm_file, pte_present(*pte),
+					 is_swap_pte(*pte), p, addr, 1);
+		if (!ret && IS_CLEAR_OP(p))
+			check_soft_dirty(vma, addr, pte, true);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
+				 struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned int len;
+	bool sd;
+
+	if (vma) {
+		/* Individual pages haven't been allocated and written */
+		sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
+		     (vma->vm_flags & VM_SOFTDIRTY);
+
+		len = (end - addr)/PAGE_SIZE;
+		if (p->max_pages && p->found_pages + len > p->max_pages)
+			len = p->max_pages - p->found_pages;
+
+		add_to_out(sd, vma->vm_file, false, false, p, addr, len);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long end_cut = end;
+	int ret;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		if (vma->vm_start < start) {
+			ret = split_vma(vma->vm_mm, vma, start, 1);
+			if (ret)
+				return ret;
+		}
+		/* Calculate end_cut because of max_pages */
+		if (IS_GET_OP(p) && p->max_pages)
+			end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
+
+		if (vma->vm_end > end_cut) {
+			ret = split_vma(vma->vm_mm, vma, end_cut, 0);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void pagemap_scan_post_vma(struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+		vma_set_page_prot(vma);
+	}
+}
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+	.test_walk = pagemap_scan_pmd_test_walk,
+	.pmd_entry = pagemap_scan_pmd_entry,
+	.pte_hole = pagemap_scan_pte_hole,
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	/* Only for clearing SD bit over VMAs */
+	.pre_vma = pagemap_scan_pre_vma,
+	.post_vma = pagemap_scan_post_vma,
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+};
+
+static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
+{
+	struct mmu_notifier_range range;
+	unsigned long __user start, end;
+	struct pagemap_scan_private p;
+	int ret;
+
+	start = (unsigned long)untagged_addr(arg->start);
+	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
+		return -EINVAL;
+
+	if (IS_GET_OP(arg) &&
+	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
+		return -ENOMEM;
+
+#ifndef CONFIG_MEM_SOFT_DIRTY
+	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
+	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
+		return -EINVAL;
+#endif
+
+	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
+	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
+	    (arg->return_mask & ~PAGEMAP_OP_MASK))
+		return -EINVAL;
+
+	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
+		return -EINVAL;
+
+	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
+	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
+		return -EINVAL;
+
+	end = start + arg->len;
+	p.max_pages = arg->max_pages;
+	p.found_pages = 0;
+	p.flags = arg->flags;
+	p.required_mask = arg->required_mask;
+	p.anyof_mask = arg->anyof_mask;
+	p.excluded_mask = arg->excluded_mask;
+	p.return_mask = arg->return_mask;
+	p.vec_index = 0;
+	p.vec_len = arg->vec_len;
+
+	if (IS_GET_OP(arg)) {
+		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
+		if (!p.vec)
+			return -ENOMEM;
+	} else {
+		p.vec = NULL;
+	}
+
+	if (IS_CLEAR_OP(arg)) {
+		mmap_write_lock(mm);
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL, mm, start, end);
+		mmu_notifier_invalidate_range_start(&range);
+		inc_tlb_flush_pending(mm);
+	} else {
+		mmap_read_lock(mm);
+	}
+
+	ret = walk_page_range(mm, start, end, &pagemap_scan_ops, &p);
+
+	if (IS_CLEAR_OP(arg)) {
+		mmu_notifier_invalidate_range_end(&range);
+		dec_tlb_flush_pending(mm);
+
+		mmap_write_unlock(mm);
+	} else {
+		mmap_read_unlock(mm);
+	}
+
+	if (ret < 0)
+		goto free_data;
+
+	if (IS_GET_OP(arg) && p.vec_index) {
+		if (copy_to_user((struct page_region *)arg->vec, p.vec,
+				 p.vec_index * sizeof(struct page_region))) {
+			ret = -EFAULT;
+			goto free_data;
+		}
+		ret = p.vec_index;
+	} else {
+		ret = 0;
+	}
+
+free_data:
+	if (IS_GET_OP(arg))
+		vfree(p.vec);
+
+	return ret;
+}
+
+static long pagemap_sd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pagemap_scan_arg __user *uarg = (struct pagemap_scan_arg __user *)arg;
+	struct mm_struct *mm = file->private_data;
+	struct pagemap_scan_arg argument;
+
+	if (cmd == PAGEMAP_SCAN) {
+		if (copy_from_user(&argument, uarg, sizeof(struct pagemap_scan_arg)))
+			return -EFAULT;
+		return do_pagemap_sd_cmd(mm, &argument);
+	}
+	return -EINVAL;
+}
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
 	.open		= pagemap_open,
 	.release	= pagemap_release,
+	.unlocked_ioctl = pagemap_sd_ioctl,
+	.compat_ioctl	= pagemap_sd_ioctl,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..11d232cfc9b3 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,60 @@  typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_sd_args */
+#define PAGE_IS_SOFTDIRTY	(1 << 0)
+#define PAGE_IS_FILE		(1 << 1)
+#define PAGE_IS_PRESENT		(1 << 2)
+#define PAGE_IS_SWAPPED		(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region
+ * bitmap:	Bits sets for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u64 bitmap;
+};
+
+/*
+ * struct pagemap_scan_arg - Pagemap ioctl argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region (All the pages in this length are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional max return pages
+ * @flags:		Flags for the IOCTL
+ * @required_mask:	Required mask - All of these bits have to be set in the PTE
+ * @anyof_mask:		Any mask - Any of these bits are set in the PTE
+ * @excluded_mask:	Exclude mask - None of these bits are set in the PTE
+ * @return_mask:	Bits that are to be reported in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u64 required_mask;
+	__u64 anyof_mask;
+	__u64 excluded_mask;
+	__u64 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SOFTDIRTY_CLEAR		(1 << 0)
+/*
+ * Depend only on the soft dirty PTE bit of individual pages and don't check the soft dirty bit
+ * of the VMA to decide if the region is dirty or not. By using this flag, the newly created
+ * memory doesn't appear to be soft dirty through the IOCTL until the region is written.
+ */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index b7b56871029c..11d232cfc9b3 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -305,4 +305,60 @@  typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_sd_args */
+#define PAGE_IS_SOFTDIRTY	(1 << 0)
+#define PAGE_IS_FILE		(1 << 1)
+#define PAGE_IS_PRESENT		(1 << 2)
+#define PAGE_IS_SWAPPED		(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region
+ * bitmap:	Bits sets for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u64 bitmap;
+};
+
+/*
+ * struct pagemap_scan_arg - Pagemap ioctl argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region (All the pages in this length are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional max return pages
+ * @flags:		Flags for the IOCTL
+ * @required_mask:	Required mask - All of these bits have to be set in the PTE
+ * @anyof_mask:		Any mask - Any of these bits are set in the PTE
+ * @excluded_mask:	Exclude mask - None of these bits are set in the PTE
+ * @return_mask:	Bits that are to be reported in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u64 required_mask;
+	__u64 anyof_mask;
+	__u64 excluded_mask;
+	__u64 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SOFTDIRTY_CLEAR		(1 << 0)
+/*
+ * Depend only on the soft dirty PTE bit of individual pages and don't check the soft dirty bit
+ * of the VMA to decide if the region is dirty or not. By using this flag, the newly created
+ * memory doesn't appear to be soft dirty through the IOCTL until the region is written.
+ */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */