[v5,2/3] fs/proc/task_mmu: Implement IOCTL to get and/or clear the info about PTEs

Message ID 20221103145353.3049303-3-usama.anjum@collabora.com (mailing list archive)
State New
Series Implement IOCTL to get and/or clear the info about PTEs

Commit Message

Muhammad Usama Anjum Nov. 3, 2022, 2:53 p.m. UTC
This IOCTL, PAGEMAP_SCAN, can be used to get and/or clear the info about
page table entries. The following operations are supported in this ioctl:
- Get the information on whether the pages are soft-dirty, file-mapped,
  present or swapped.
- Clear the soft-dirty PTE bit of the pages.
- Get and clear the soft-dirty PTE bit of the pages.

Only the soft-dirty bit can be read and cleared atomically. struct
pagemap_sd_args is used as the argument of the IOCTL. In this struct:
- The range is specified through start and len.
- The output buffer and its size are specified as vec and vec_len.
- An optional maximum number of pages to return is specified in max_pages.
- The flags can be specified in the flags field. The PAGEMAP_SD_CLEAR
  and PAGEMAP_NO_REUSED_REGIONS flags are supported.
- The masks are specified in rmask, amask, emask and return_mask.

This IOCTL can be extended to get information about more PTE bits.
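
As a rough illustration (not part of the patch), user space could drive
this interface as follows. The sketch assumes the ioctl is issued on a
/proc/<pid>/pagemap file descriptor, as the fops hunk below suggests; the
address range and buffer sizes are hypothetical:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* PAGEMAP_SCAN, pagemap_scan_arg, PAGE_IS_* */

	int main(void)
	{
		struct page_region vec[16];
		struct pagemap_scan_arg arg = {
			.start = 0x7f0000000000ULL,	/* hypothetical, page-aligned */
			.len = 16 * 4096,
			.vec = (__u64)(unsigned long)vec,
			.vec_len = 16,
			.rmask = PAGE_IS_SD,		/* only report soft-dirty pages */
			.return_mask = PAGE_IS_SD,
		};
		int fd = open("/proc/self/pagemap", O_RDONLY);
		int n = ioctl(fd, PAGEMAP_SCAN, &arg);	/* >= 0: page_regions filled */

		for (int i = 0; i < n; i++)
			printf("%llx: %llu pages, bits %x\n",
			       (unsigned long long)vec[i].start,
			       (unsigned long long)vec[i].len, vec[i].bitmap);
		return 0;
	}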

This is based on a patch from Gabriel Krisman Bertazi.

Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
---
Changes in v5:
- Remove tlb flushing even for clear operation

Changes in v4:
- Update the interface and implementation

Changes in v3:
- Tighten the user-kernel interface by using explicit types and add more
  error checking

Changes in v2:
- Convert the interface from syscall to ioctl
- Remove pidfd support as it doesn't make sense in ioctl
---
 fs/proc/task_mmu.c            | 314 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/fs.h       |  53 ++++++
 tools/include/uapi/linux/fs.h |  53 ++++++
 3 files changed, 420 insertions(+)

Comments

Michał Mirosław Nov. 7, 2022, 12:26 p.m. UTC | #1
On Thu, 3 Nov 2022 at 15:54, Muhammad Usama Anjum
<usama.anjum@collabora.com> wrote:
> This IOCTL, PAGEMAP_SCAN, can be used to get and/or clear the info about
> page table entries. The following operations are supported in this ioctl:
> - Get the information on whether the pages are soft-dirty, file-mapped,
>   present or swapped.
> - Clear the soft-dirty PTE bit of the pages.
> - Get and clear the soft-dirty PTE bit of the pages.
>
> Only the soft-dirty bit can be read and cleared atomically. struct
> pagemap_sd_args is used as the argument of the IOCTL. In this struct:
> - The range is specified through start and len.
> - The output buffer and its size are specified as vec and vec_len.
> - An optional maximum number of pages to return is specified in max_pages.
> - The flags can be specified in the flags field. The PAGEMAP_SD_CLEAR
>   and PAGEMAP_NO_REUSED_REGIONS flags are supported.
> - The masks are specified in rmask, amask, emask and return_mask.
[...]
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -305,4 +305,57 @@ typedef int __bitwise __kernel_rwf_t;
>  #define RWF_SUPPORTED  (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
>                          RWF_APPEND)
>
> +/* PAGEMAP IOCTL */
> +#define PAGEMAP_SCAN   _IOWR('f', 16, struct pagemap_scan_arg)
> +
> +/* Bits are set in the bitmap of the page_region and masks in pagemap_sd_args */
> +#define PAGE_IS_SD     (1 << 0)

Can we name it PAGE_IS_SOFTDIRTY? "SD" can mean so many things.

> +#define PAGE_IS_FILE   (1 << 1)
> +#define PAGE_IS_PRESENT        (1 << 2)
> +#define PAGE_IS_SWAPED (1 << 3)

PAGE_IS_SWAPPED?

> +
> +/*
> + * struct page_region - Page region with bitmap flags
> + * @start:     Start of the region
> + * @len:       Length of the region
> + * bitmap:     Bits sets for the region
> + */
> +struct page_region {
> +       __u64 start;
> +       __u64 len;
> +       __u32 bitmap;
> +       __u32 __reserved;

"u64 flags"? If an extension is needed it would already require a new
ioctl or something in the `arg` struct.
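
(I.e., a sketch of the suggested layout; this is the reviewer's idea, not
what the patch implements:)

	struct page_region {
		__u64 start;
		__u64 len;
		__u64 flags;	/* was: __u32 bitmap + __u32 __reserved */
	};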

> +
> +/*
> + * struct pagemap_scan_arg - Soft-dirty IOCTL argument

Since this is no longer a soft-dirty-specific call, it might be better
to describe it as "VM scan ioctl" or similar. BTW, the implementation
is currently guarded by CONFIG_MEM_SOFT_DIRTY, but CRIU doesn't need
that, yet it does need the handling of the other bits.

> + * @start:             Starting address of the region
> + * @len:               Length of the region (All the pages in this length are included)
> + * @vec:               Address of page_region struct array for output
> + * @vec_len:           Length of the page_region struct array
> + * @max_pages:         Optional max return pages (It must be less than vec_len if specified)

I think we discussed that this is not counting the same things as
vec_len, so there should not be a reference between the two. The limit
is whatever fits under both conditions (IOW: n_vecs <= vec_len &&
(!max_pages || n_pages <= max_pages)).

> + * @flags:             Special flags for the IOCTL

Just "Flags for the IOCTL".

> + * @rmask:             Required mask - All of these bits have to be set in the PTE
> + * @amask:             Any mask - Any of these bits are set in the PTE
> + * @emask:             Exclude mask - None of these bits are set in the PTE

It might be easier for developers if those were named e.g.
"required_mask", "anyof_mask", "excluded_mask".

> + * @return_mask:       Bits that have to be reported to the user in page_region

"Bits that are to be reported in page_region"?

> + */
> +struct pagemap_scan_arg {
> +       __u64 start;
> +       __u64 len;
> +       __u64 vec;
> +       __u64 vec_len;
> +       __u32 max_pages;
> +       __u32 flags;
> +       __u32 rmask;
> +       __u32 amask;
> +       __u32 emask;
> +       __u32 return_mask;
> +};
> +
> +/* Special flags */
> +#define PAGEMAP_SD_CLEAR               (1 << 0)

SD -> SOFTDIRTY

> +/* Check the individual pages if they are soft-dirty to find dirty pages faster. */
> +#define PAGEMAP_NO_REUSED_REGIONS      (1 << 1)

Please include the description from the commit message of what this flag
does (i.e. how the behaviour differs because of the flag). I'd drop the
part about it being faster: if it is, why have the flag at all
instead of just always using the faster way?

(I only reviewed the API now. The implementation I think could be
simpler, but let's leave that to after the API is agreed on.)

Best Regards
Michał Mirosław
Muhammad Usama Anjum Nov. 8, 2022, 2:24 p.m. UTC | #2
Hi Michał,

Thank you so much for reviewing.

On 11/7/22 5:26 PM, Michał Mirosław wrote:
>> +
>> +/*
>> + * struct page_region - Page region with bitmap flags
>> + * @start:     Start of the region
>> + * @len:       Length of the region
>> + * bitmap:     Bits sets for the region
>> + */
>> +struct page_region {
>> +       __u64 start;
>> +       __u64 len;
>> +       __u32 bitmap;
>> +       __u32 __reserved;
> 
> "u64 flags"? If an extension is needed it would already require a new
> ioctl or something in the `arg` struct.
I feel the masks must have the same type as this bitmap variable, as 
the return_mask specifies the flags to be returned in bitmap. All the 
masks are of type __u32. This is why I kept the bitmap of type __u32 as 
well. I've kept them 32 bits wide because we are currently adding support 
for 4 flags and there is still room for 28 more bits in the future. 
Do you still think that I should update the masks and bitmap to __u64?

>> + * @start:             Starting address of the region
>> + * @len:               Length of the region (All the pages in this length are included)
>> + * @vec:               Address of page_region struct array for output
>> + * @vec_len:           Length of the page_region struct array
>> + * @max_pages:         Optional max return pages (It must be less than vec_len if specified)
> 
> I think we discussed that this is not counting the same things as
> vec_len, so there should not be a reference between the two. The limit
> is whatever fits under both conditions (IOW: n_vecs <= vec_len &&
> (!max_pages || n_pages <= max_pages)).
In the worst case, when pages cannot be folded into a page_region, one 
page_region may hold information about only one page. This is why I've 
compared them. I want to communicate to the user that if max_pages is 
used, vec_len should be of equal or greater size (to cater for the worst 
case, which can happen at any time). Otherwise, in the worst case, the 
API can return without finding max_pages pages. I don't know how I 
should put this in the comment.
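
One way to state it in code (a hypothetical validity check using the names
from this patch, not something the patch itself enforces):

	/*
	 * Worst case: no pages fold together, so each reported page
	 * consumes one page_region entry.
	 */
	if (arg.max_pages && arg.vec_len < arg.max_pages)
		return -EINVAL;	/* vec may fill up before max_pages pages are found */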

> (I only reviewed the API now. The implementation I think could be
> simpler, but let's leave that to after the API is agreed on.)
> 
> Best Regards
> Michał Mirosław
Michał Mirosław Nov. 8, 2022, 4 p.m. UTC | #3
On Tue, 8 Nov 2022 at 15:25, Muhammad Usama Anjum
<usama.anjum@collabora.com> wrote:
>
> Hi Michał,
>
> Thank you so much for reviewing.
>
> On 11/7/22 5:26 PM, Michał Mirosław wrote:
> >> +
> >> +/*
> >> + * struct page_region - Page region with bitmap flags
> >> + * @start:     Start of the region
> >> + * @len:       Length of the region
> >> + * bitmap:     Bits sets for the region
> >> + */
> >> +struct page_region {
> >> +       __u64 start;
> >> +       __u64 len;
> >> +       __u32 bitmap;
> >> +       __u32 __reserved;
> >
> > "u64 flags"? If an extension is needed it would already require a new
> > ioctl or something in the `arg` struct.
> I feel the masks must have the same type as this bitmap variable, as
> the return_mask specifies the flags to be returned in bitmap. All the
> masks are of type __u32. This is why I kept the bitmap of type __u32 as
> well. I've kept them 32 bits wide because we are currently adding support
> for 4 flags and there is still room for 28 more bits in the future.
> Do you still think that I should update the masks and bitmap to __u64?

I agree that the `bitmap` (I'd rather call it `flags` though) should
have the type matching the masks in the request. But I'm not sure
whether u32 is enough compared to what is used (or will be in the
future) for page flags in MM code. I suppose the ioctl() is not
expected to be a fast path, so I would go with u64 and assume that the
overhead of the extra bytes read by the kernel won't matter.

> >> + * @start:             Starting address of the region
> >> + * @len:               Length of the region (All the pages in this length are included)
> >> + * @vec:               Address of page_region struct array for output
> >> + * @vec_len:           Length of the page_region struct array
> >> + * @max_pages:         Optional max return pages (It must be less than vec_len if specified)
> >
> > I think we discussed that this is not counting the same things as
> > vec_len, so there should not be a reference between the two. The limit
> > is whatever fits under both conditions (IOW: n_vecs <= vec_len &&
> > (!max_pages || n_pages <= max_pages)).
> In the worst case, when pages cannot be folded into a page_region, one
> page_region may hold information about only one page. This is why I've
> compared them. I want to communicate to the user that if max_pages is
> used, vec_len should be of equal or greater size (to cater for the worst
> case, which can happen at any time). Otherwise, in the worst case, the
> API can return without finding max_pages pages. I don't know how I
> should put this in the comment.

I'm not sure you need to, as this conclusion follows from the range vs
page distinction.
A user who wants to cater for the worst case will provide a big-enough
`vec` array, but another, who might be memory-constrained, could
instead just retry the call with `start` updated to just after the
last returned page, until the ioctl() returns fewer ranges than
`vec_len` allows.
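
A sketch of that retry pattern (using this patch's uAPI; `consume()`,
`VEC_LEN` and the range bounds are hypothetical):

	arg.start = range_start;
	arg.len = range_end - range_start;
	for (;;) {
		int n = ioctl(fd, PAGEMAP_SCAN, &arg);

		if (n < 0)
			break;			/* error */
		consume(vec, n);		/* hypothetical consumer */
		if (n < VEC_LEN)
			break;			/* fewer ranges than vec_len: done */
		arg.start = vec[n - 1].start + vec[n - 1].len * PAGE_SIZE;
		arg.len = range_end - arg.start;
	}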

Best Regards
Michał Mirosław
Muhammad Usama Anjum Nov. 8, 2022, 5:51 p.m. UTC | #4
On 11/8/22 9:00 PM, Michał Mirosław wrote:
>>>> + * @start:             Starting address of the region
>>>> + * @len:               Length of the region (All the pages in this length are included)
>>>> + * @vec:               Address of page_region struct array for output
>>>> + * @vec_len:           Length of the page_region struct array
>>>> + * @max_pages:         Optional max return pages (It must be less than vec_len if specified)
>>> I think we discussed that this is not counting the same things as
>>> vec_len, so there should not be a reference between the two. The limit
>>> is whatever fits under both conditions (IOW: n_vecs <= vec_len &&
>>> (!max_pages || n_pages <= max_pages)).
>> In the worst case, when pages cannot be folded into a page_region, one
>> page_region may hold information about only one page. This is why I've
>> compared them. I want to communicate to the user that if max_pages is
>> used, vec_len should be of equal or greater size (to cater for the worst
>> case, which can happen at any time). Otherwise, in the worst case, the
>> API can return without finding max_pages pages. I don't know how I
>> should put this in the comment.
> I'm not sure you need to, as this conclusion follows from the range vs
> page distinction.
> A user who wants to cater for the worst case will provide big-enough
> `vec` array, but another, who might be memory-constrained, could
> instead just retry the call with `start` updated to just after the
> last returned page until the ioctl() returns less ranges than
> `vec_len` allows.
Makes sense. I'll update and send the next revision.

Thanks,
Usama
> 
> Best Regards
> Michał Mirosław

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8235c536ac70..9690a44eb1fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,9 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <uapi/linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/minmax.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -1775,11 +1778,322 @@  static int pagemap_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+
+#define PAGEMAP_OP_MASK		(PAGE_IS_SD | PAGE_IS_FILE |		\
+				 PAGE_IS_PRESENT | PAGE_IS_SWAPED)
+#define PAGEMAP_NON_SD_MASK	(PAGE_IS_FILE |	PAGE_IS_PRESENT | PAGE_IS_SWAPED)
+#define PAGEMAP_SD_FLAGS_MASK	(PAGEMAP_SD_CLEAR | PAGEMAP_NO_REUSED_REGIONS)
+#define IS_CLEAR_OP(flags)	(flags & PAGEMAP_SD_CLEAR)
+#define IS_GET_OP(vec)		(vec)
+
+struct pagemap_scan_private {
+	struct page_region *vec;
+	unsigned long vec_len;
+	unsigned long index;
+	unsigned int max_pages;
+	unsigned int found_pages;
+	unsigned int flags;
+	unsigned int rmask;
+	unsigned int amask;
+	unsigned int emask;
+	unsigned int return_mask;
+};
+
+static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
+		      unsigned long addr, unsigned int len)
+{
+	unsigned int bitmap, cpy = true, cur = sd | file << 1 | pres << 2 | swap << 3;
+
+	if (p->rmask)
+		cpy = ((p->rmask & cur) == p->rmask) ? true : false;
+	if (cpy && p->amask)
+		cpy = (p->amask & cur) ? true : false;
+	if (cpy && p->emask)
+		cpy = (p->emask & cur) ? false : true;
+
+	bitmap = cur & p->return_mask;
+
+	if (cpy && bitmap) {
+		if (p->index && p->vec[p->index - 1].bitmap == bitmap &&
+		    p->vec[p->index - 1].start + p->vec[p->index - 1].len * PAGE_SIZE == addr) {
+			p->vec[p->index - 1].len += len;
+			p->found_pages += len;
+		} else if (p->index < p->vec_len) {
+			p->vec[p->index].start = addr;
+			p->vec[p->index].len = len;
+			p->vec[p->index].bitmap = bitmap;
+			p->index++;
+			p->found_pages += len;
+		} else {
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	int dirty, ret = 0;
+	spinlock_t *ptl;
+	pte_t *pte;
+	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
+			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
+
+	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
+		return 0;
+
+	end = min(end, walk->vma->vm_end);
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
+			/*
+			 * Break huge page into small pages if operation needs to be performed is
+			 * on a portion of the huge page or the return buffer cannot store complete
+			 * data.
+			 */
+			if ((IS_CLEAR_OP(p->flags) && (end - addr < HPAGE_SIZE)) ||
+			    (IS_GET_OP(p->vec) && p->max_pages &&
+			    (p->found_pages + HPAGE_SIZE/PAGE_SIZE > p->max_pages))) {
+				spin_unlock(ptl);
+				split_huge_pmd(vma, pmd, addr);
+				goto process_smaller_pages;
+			} else {
+				dirty = check_soft_dirty_pmd(vma, addr, pmd, IS_CLEAR_OP(p->flags));
+				if (IS_GET_OP(p->vec))
+					add_to_out(dirty_vma || dirty, vma->vm_file,
+						   pmd_present(*pmd), is_swap_pmd(*pmd), p,
+						   addr, (end - addr)/PAGE_SIZE);
+			}
+		}
+		spin_unlock(ptl);
+		return 0;
+	}
+
+process_smaller_pages:
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr < end && !ret; pte++, addr += PAGE_SIZE) {
+		dirty = check_soft_dirty(vma, addr, pte, IS_CLEAR_OP(p->flags));
+		if (IS_GET_OP(p->vec)) {
+			ret = add_to_out(dirty_vma || dirty, vma->vm_file, pte_present(*pte),
+					 is_swap_pte(*pte), p, addr, 1);
+			if (p->max_pages && (p->found_pages == p->max_pages))
+				break;
+		}
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
+				 struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned int len;
+	bool sd;
+
+	if (vma) {
+		/* Individual pages haven't been allocated and written */
+		sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
+		     (vma->vm_flags & VM_SOFTDIRTY);
+
+		len = (end - addr)/PAGE_SIZE;
+		if (p->max_pages && p->max_pages - p->found_pages < len)
+			len = p->max_pages - p->found_pages;
+
+		add_to_out(sd, vma->vm_file, false, false, p, addr, len);
+	}
+
+	return 0;
+}
+
+static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long end_cut = end;
+	int ret;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p->flags) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		if (vma->vm_start < start) {
+			ret = split_vma(vma->vm_mm, vma, start, 1);
+			if (ret)
+				return ret;
+		}
+		/* Calculate end_cut because of max_pages */
+		if (IS_GET_OP(p->vec) && p->max_pages)
+			end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
+
+		if (vma->vm_end > end_cut) {
+			ret = split_vma(vma->vm_mm, vma, end_cut, 0);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void pagemap_scan_post_vma(struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p->flags) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+		vma_set_page_prot(vma);
+	}
+}
+
+static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (IS_GET_OP(p->vec) && p->max_pages && (p->found_pages == p->max_pages))
+		return -1;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	return 0;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+	.test_walk = pagemap_scan_pmd_test_walk,
+	.pmd_entry = pagemap_scan_pmd_entry,
+	.pte_hole = pagemap_scan_pte_hole,
+
+	/* Only for clearing SD bit over VMAs */
+	.pre_vma = pagemap_scan_pre_vma,
+	.post_vma = pagemap_scan_post_vma,
+};
+
+static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
+{
+	struct mmu_notifier_range range;
+	unsigned long __user start, end;
+	struct pagemap_scan_private p;
+	int ret;
+
+	start = (unsigned long)untagged_addr(arg->start);
+	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
+		return -EINVAL;
+
+	if (IS_GET_OP(arg->vec) &&
+	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
+		return -ENOMEM;
+
+	if ((arg->flags & ~PAGEMAP_SD_FLAGS_MASK) || (arg->rmask & ~PAGEMAP_OP_MASK) ||
+	    (arg->amask & ~PAGEMAP_OP_MASK) || (arg->emask & ~PAGEMAP_OP_MASK) ||
+	    (arg->return_mask & ~PAGEMAP_OP_MASK))
+		return -EINVAL;
+
+	if ((!arg->rmask && !arg->amask && !arg->emask) || !arg->return_mask)
+		return -EINVAL;
+
+	if ((arg->flags & PAGEMAP_SD_FLAGS_MASK) && ((arg->rmask & PAGEMAP_NON_SD_MASK) ||
+	     (arg->amask & PAGEMAP_NON_SD_MASK)))
+		return -EINVAL;
+
+	end = start + arg->len;
+	p.max_pages = arg->max_pages;
+	p.found_pages = 0;
+	p.flags = arg->flags;
+	p.rmask = arg->rmask;
+	p.amask = arg->amask;
+	p.emask = arg->emask;
+	p.return_mask = arg->return_mask;
+	p.index = 0;
+	p.vec_len = arg->vec_len;
+
+	if (IS_GET_OP(arg->vec)) {
+		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
+		if (!p.vec)
+			return -ENOMEM;
+	} else {
+		p.vec = NULL;
+	}
+
+	if (IS_CLEAR_OP(arg->flags)) {
+		mmap_write_lock(mm);
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL, mm, start, end);
+		mmu_notifier_invalidate_range_start(&range);
+		inc_tlb_flush_pending(mm);
+	} else {
+		mmap_read_lock(mm);
+	}
+
+	ret = walk_page_range(mm, start, end, &pagemap_scan_ops, &p);
+
+	if (IS_CLEAR_OP(arg->flags)) {
+		mmu_notifier_invalidate_range_end(&range);
+		dec_tlb_flush_pending(mm);
+
+		mmap_write_unlock(mm);
+	} else {
+		mmap_read_unlock(mm);
+	}
+
+	if (ret < 0)
+		goto free_data;
+
+	if (IS_GET_OP(arg->vec) && p.index) {
+		if (copy_to_user((struct page_region *)arg->vec, p.vec,
+				 p.index * sizeof(struct page_region))) {
+			ret = -EFAULT;
+			goto free_data;
+		}
+		ret = p.index;
+	} else {
+		ret = 0;
+	}
+
+free_data:
+	if (IS_GET_OP(arg->vec))
+		vfree(p.vec);
+
+	return ret;
+}
+
+static long pagemap_sd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pagemap_scan_arg __user *uarg = (struct pagemap_scan_arg __user *)arg;
+	struct mm_struct *mm = file->private_data;
+	struct pagemap_scan_arg argument;
+
+	if (cmd == PAGEMAP_SCAN) {
+		if (copy_from_user(&argument, uarg, sizeof(struct pagemap_scan_arg)))
+			return -EFAULT;
+		return do_pagemap_sd_cmd(mm, &argument);
+	}
+	return -EINVAL;
+}
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
 	.open		= pagemap_open,
 	.release	= pagemap_release,
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	.unlocked_ioctl = pagemap_sd_ioctl,
+	.compat_ioctl = pagemap_sd_ioctl,
+#endif /* CONFIG_MEM_SOFT_DIRTY */
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..5d6c0d85dac4 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,57 @@  typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/* PAGEMAP IOCTL */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_sd_args */
+#define PAGE_IS_SD	(1 << 0)
+#define PAGE_IS_FILE	(1 << 1)
+#define PAGE_IS_PRESENT	(1 << 2)
+#define PAGE_IS_SWAPED	(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region
+ * bitmap:	Bits sets for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u32 bitmap;
+	__u32 __reserved;
+};
+
+/*
+ * struct pagemap_scan_arg - Soft-dirty IOCTL argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region (All the pages in this length are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional max return pages (It must be less than vec_len if specified)
+ * @flags:		Special flags for the IOCTL
+ * @rmask:		Required mask - All of these bits have to be set in the PTE
+ * @amask:		Any mask - Any of these bits are set in the PTE
+ * @emask:		Exclude mask - None of these bits are set in the PTE
+ * @return_mask:	Bits that have to be reported to the user in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u32 rmask;
+	__u32 amask;
+	__u32 emask;
+	__u32 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SD_CLEAR		(1 << 0)
+/* Check the individual pages if they are soft-dirty to find dirty pages faster. */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index b7b56871029c..5d6c0d85dac4 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -305,4 +305,57 @@  typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/* PAGEMAP IOCTL */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_sd_args */
+#define PAGE_IS_SD	(1 << 0)
+#define PAGE_IS_FILE	(1 << 1)
+#define PAGE_IS_PRESENT	(1 << 2)
+#define PAGE_IS_SWAPED	(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region
+ * bitmap:	Bits sets for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u32 bitmap;
+	__u32 __reserved;
+};
+
+/*
+ * struct pagemap_scan_arg - Soft-dirty IOCTL argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region (All the pages in this length are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional max return pages (It must be less than vec_len if specified)
+ * @flags:		Special flags for the IOCTL
+ * @rmask:		Required mask - All of these bits have to be set in the PTE
+ * @amask:		Any mask - Any of these bits are set in the PTE
+ * @emask:		Exclude mask - None of these bits are set in the PTE
+ * @return_mask:	Bits that have to be reported to the user in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u32 rmask;
+	__u32 amask;
+	__u32 emask;
+	__u32 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SD_CLEAR		(1 << 0)
+/* Check the individual pages if they are soft-dirty to find dirty pages faster. */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */