From patchwork Thu Nov 3 14:53:51 2022
X-Patchwork-Submitter: Muhammad Usama Anjum
X-Patchwork-Id: 13030115
From: Muhammad Usama Anjum
To: Andrei Vagin, Danylo Mocherniuk, Alexander Viro, Andrew Morton,
    Michał Mirosław, Suren Baghdasaryan, Greg KH, Christian Brauner,
    Peter Xu, Yang Shi, Vlastimil Babka, Zach O'Keefe,
    Matthew Wilcox (Oracle), Gustavo A. R. Silva, Dan Williams,
    Muhammad Usama Anjum, kernel@collabora.com, Gabriel Krisman Bertazi,
    David Hildenbrand, Peter Enderborg,
    open list : KERNEL SELFTEST FRAMEWORK, Shuah Khan, open list,
    open list : PROC FILESYSTEM, open list : MEMORY MANAGEMENT
Subject: [PATCH v5 1/3] fs/proc/task_mmu: update functions to clear the soft-dirty PTE bit
Date: Thu, 3 Nov 2022 19:53:51 +0500
Message-Id: <20221103145353.3049303-2-usama.anjum@collabora.com>
In-Reply-To: <20221103145353.3049303-1-usama.anjum@collabora.com>
References: <20221103145353.3049303-1-usama.anjum@collabora.com>
X-Mailing-List: linux-fsdevel@vger.kernel.org

Update clear_soft_dirty() and clear_soft_dirty_pmd() so that they
optionally clear the soft-dirty PTE bit and return whether the page is
dirty.
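For illustration, the resulting calling convention (a sketch with a
hypothetical caller; nr_dirty is an invented counter, not part of this
patch):

	/* Query only: report the soft-dirty state, leave the PTE as is. */
	if (check_soft_dirty(vma, addr, pte, false))
		nr_dirty++;

	/* Query and clear: old clear_soft_dirty() behaviour plus status. */
	if (check_soft_dirty(vma, addr, pte, true))
		nr_dirty++;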
Signed-off-by: Muhammad Usama Anjum
---
Changes in v2:
- Move the functions back to their original file
---
 fs/proc/task_mmu.c | 82 ++++++++++++++++++++++++++++------------------
 1 file changed, 51 insertions(+), 31 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8a74cdcc9af0..8235c536ac70 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1095,8 +1095,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
 	return page_maybe_dma_pinned(page);
 }
 
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *pte)
+static inline bool check_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte, bool clear)
 {
 	/*
 	 * The soft-dirty tracker uses #PF-s to catch writes
@@ -1105,55 +1105,75 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 	 * of how soft-dirty works.
 	 */
 	pte_t ptent = *pte;
+	int dirty = 0;
 
 	if (pte_present(ptent)) {
 		pte_t old_pte;
 
-		if (pte_is_pinned(vma, addr, ptent))
-			return;
-		old_pte = ptep_modify_prot_start(vma, addr, pte);
-		ptent = pte_wrprotect(old_pte);
-		ptent = pte_clear_soft_dirty(ptent);
-		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+		dirty = pte_soft_dirty(ptent);
+
+		if (dirty && clear && !pte_is_pinned(vma, addr, ptent)) {
+			old_pte = ptep_modify_prot_start(vma, addr, pte);
+			ptent = pte_wrprotect(old_pte);
+			ptent = pte_clear_soft_dirty(ptent);
+			ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+		}
 	} else if (is_swap_pte(ptent)) {
-		ptent = pte_swp_clear_soft_dirty(ptent);
-		set_pte_at(vma->vm_mm, addr, pte, ptent);
+		dirty = pte_swp_soft_dirty(ptent);
+
+		if (dirty && clear) {
+			ptent = pte_swp_clear_soft_dirty(ptent);
+			set_pte_at(vma->vm_mm, addr, pte, ptent);
+		}
 	}
+
+	return !!dirty;
 }
 #else
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *pte)
+static inline bool check_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte, bool clear)
 {
+	return false;
 }
 #endif
 
 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
-		unsigned long addr, pmd_t *pmdp)
+static inline bool check_soft_dirty_pmd(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t *pmdp, bool clear)
 {
 	pmd_t old, pmd = *pmdp;
+	int dirty = 0;
 
 	if (pmd_present(pmd)) {
-		/* See comment in change_huge_pmd() */
-		old = pmdp_invalidate(vma, addr, pmdp);
-		if (pmd_dirty(old))
-			pmd = pmd_mkdirty(pmd);
-		if (pmd_young(old))
-			pmd = pmd_mkyoung(pmd);
-
-		pmd = pmd_wrprotect(pmd);
-		pmd = pmd_clear_soft_dirty(pmd);
-
-		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+		dirty = pmd_soft_dirty(pmd);
+		if (dirty && clear) {
+			/* See comment in change_huge_pmd() */
+			old = pmdp_invalidate(vma, addr, pmdp);
+			if (pmd_dirty(old))
+				pmd = pmd_mkdirty(pmd);
+			if (pmd_young(old))
+				pmd = pmd_mkyoung(pmd);
+
+			pmd = pmd_wrprotect(pmd);
+			pmd = pmd_clear_soft_dirty(pmd);
+
+			set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+		}
 	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
-		pmd = pmd_swp_clear_soft_dirty(pmd);
-		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+		dirty = pmd_swp_soft_dirty(pmd);
+
+		if (dirty && clear) {
+			pmd = pmd_swp_clear_soft_dirty(pmd);
+			set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+		}
 	}
+	return !!dirty;
 }
 #else
-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
-		unsigned long addr, pmd_t *pmdp)
+static inline bool check_soft_dirty_pmd(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t *pmdp, bool clear)
 {
+	return false;
 }
 #endif
 
@@ -1169,7 +1189,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
-			clear_soft_dirty_pmd(vma, addr, pmd);
+			check_soft_dirty_pmd(vma, addr, pmd, true);
 			goto out;
 		}
 
@@ -1195,7 +1215,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		ptent = *pte;
 
 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
-			clear_soft_dirty(vma, addr, pte);
+			check_soft_dirty(vma, addr, pte, true);
 			continue;
 		}

From patchwork Thu Nov 3 14:53:52 2022
X-Patchwork-Submitter: Muhammad Usama Anjum
X-Patchwork-Id: 13030116
From: Muhammad Usama Anjum
To: Andrei Vagin, Danylo Mocherniuk, Alexander Viro, Andrew Morton,
    Michał Mirosław, Suren Baghdasaryan, Greg KH, Christian Brauner,
    Peter Xu, Yang Shi, Vlastimil Babka, Zach O'Keefe,
    Matthew Wilcox (Oracle), Gustavo A. R. Silva, Dan Williams,
    Muhammad Usama Anjum, kernel@collabora.com, Gabriel Krisman Bertazi,
    David Hildenbrand, Peter Enderborg,
    open list : KERNEL SELFTEST FRAMEWORK, Shuah Khan, open list,
    open list : PROC FILESYSTEM, open list : MEMORY MANAGEMENT
Subject: [PATCH v5 2/3] fs/proc/task_mmu: Implement IOCTL to get and/or clear the info about PTEs
Date: Thu, 3 Nov 2022 19:53:52 +0500
Message-Id: <20221103145353.3049303-3-usama.anjum@collabora.com>
In-Reply-To: <20221103145353.3049303-1-usama.anjum@collabora.com>
References: <20221103145353.3049303-1-usama.anjum@collabora.com>
X-Mailing-List: linux-fsdevel@vger.kernel.org

This IOCTL, PAGEMAP_SCAN, can be used to get and/or clear the info about
page table entries. The following operations are supported in this ioctl:
- Get the information if the pages are soft-dirty, file-mapped, present
  or swapped.
- Clear the soft-dirty PTE bit of the pages.
- Get and clear the soft-dirty PTE bit of the pages.

Only the soft-dirty bit can be read and cleared atomically. struct
pagemap_scan_arg is used as the argument of the IOCTL. In this struct:
- The range is specified through start and len.
- The output buffer and its size are specified as vec and vec_len.
- The optional maximum number of pages to report is specified in
  max_pages.
- The flags can be specified in the flags field. PAGEMAP_SD_CLEAR and
  PAGEMAP_NO_REUSED_REGIONS are supported.
- The masks are specified in rmask, amask, emask and return_mask.

This IOCTL can be extended to get information about more PTE bits.

This is based on a patch from Gabriel Krisman Bertazi.
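For illustration, a minimal user-space invocation could look as follows
(a sketch, not part of the patch; pagemap_fd is an open
/proc/<pid>/pagemap file descriptor, and area/area_len describe the
range to scan):

	struct page_region regions[16];
	struct pagemap_scan_arg arg = {
		.start = (__u64)(uintptr_t)area,
		.len = area_len,
		.vec = (__u64)(uintptr_t)regions,
		.vec_len = 16,
		.flags = 0,			/* get only, don't clear */
		.rmask = PAGE_IS_SD,		/* only soft-dirty pages */
		.return_mask = PAGE_IS_SD,	/* report the SD bit back */
	};
	/* Returns the number of filled page_region entries, or a
	 * negative error code.
	 */
	int n = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);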
Signed-off-by: Muhammad Usama Anjum
---
Changes in v5:
- Remove tlb flushing even for clear operation
Changes in v4:
- Update the interface and implementation
Changes in v3:
- Tighten the user-kernel interface by using explicit types and add more
  error checking
Changes in v2:
- Convert the interface from syscall to ioctl
- Remove pidfd support as it doesn't make sense in ioctl
---
 fs/proc/task_mmu.c            | 314 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/fs.h       |  53 ++++++
 tools/include/uapi/linux/fs.h |  53 ++++++
 3 files changed, 420 insertions(+)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8235c536ac70..9690a44eb1fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 #include
@@ -1775,11 +1778,322 @@ static int pagemap_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+
+#define PAGEMAP_OP_MASK		(PAGE_IS_SD | PAGE_IS_FILE |	\
+				 PAGE_IS_PRESENT | PAGE_IS_SWAPED)
+#define PAGEMAP_NON_SD_MASK	(PAGE_IS_FILE | PAGE_IS_PRESENT | PAGE_IS_SWAPED)
+#define PAGEMAP_SD_FLAGS_MASK	(PAGEMAP_SD_CLEAR | PAGEMAP_NO_REUSED_REGIONS)
+#define IS_CLEAR_OP(flags)	(flags & PAGEMAP_SD_CLEAR)
+#define IS_GET_OP(vec)		(vec)
+
+struct pagemap_scan_private {
+	struct page_region *vec;
+	unsigned long vec_len;
+	unsigned long index;
+	unsigned int max_pages;
+	unsigned int found_pages;
+	unsigned int flags;
+	unsigned int rmask;
+	unsigned int amask;
+	unsigned int emask;
+	unsigned int return_mask;
+};
+
+static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
+		      unsigned long addr, unsigned int len)
+{
+	unsigned int bitmap, cpy = true, cur = sd | file << 1 | pres << 2 | swap << 3;
+
+	if (p->rmask)
+		cpy = ((p->rmask & cur) == p->rmask) ? true : false;
+	if (cpy && p->amask)
+		cpy = (p->amask & cur) ? true : false;
+	if (cpy && p->emask)
+		cpy = (p->emask & cur) ? false : true;
+
+	bitmap = cur & p->return_mask;
+
+	if (cpy && bitmap) {
+		if (p->index && p->vec[p->index - 1].bitmap == bitmap &&
+		    p->vec[p->index - 1].start + p->vec[p->index - 1].len * PAGE_SIZE == addr) {
+			p->vec[p->index - 1].len += len;
+			p->found_pages += len;
+		} else if (p->index < p->vec_len) {
+			p->vec[p->index].start = addr;
+			p->vec[p->index].len = len;
+			p->vec[p->index].bitmap = bitmap;
+			p->index++;
+			p->found_pages += len;
+		} else {
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
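+/*
+ * The three masks compose as successive filters: every rmask bit must be
+ * set, at least one amask bit must be set, and no emask bit may be set
+ * for a page to be reported; the reported bitmap is cur & return_mask.
+ */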
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	int dirty, ret = 0;
+	spinlock_t *ptl;
+	pte_t *pte;
+	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
+			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
+
+	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
+		return 0;
+
+	end = min(end, walk->vma->vm_end);
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
+			/*
+			 * Break the huge page into small pages if the operation
+			 * needs to be performed on a portion of the huge page or
+			 * if the return buffer cannot store the complete data.
+			 */
+			if ((IS_CLEAR_OP(p->flags) && (end - addr < HPAGE_SIZE)) ||
+			    (IS_GET_OP(p->vec) && p->max_pages &&
+			     (p->found_pages + HPAGE_SIZE/PAGE_SIZE > p->max_pages))) {
+				spin_unlock(ptl);
+				split_huge_pmd(vma, pmd, addr);
+				goto process_smaller_pages;
+			} else {
+				dirty = check_soft_dirty_pmd(vma, addr, pmd, IS_CLEAR_OP(p->flags));
+				if (IS_GET_OP(p->vec))
+					add_to_out(dirty_vma || dirty, vma->vm_file,
+						   pmd_present(*pmd), is_swap_pmd(*pmd), p,
+						   addr, (end - addr)/PAGE_SIZE);
+			}
+		}
+		spin_unlock(ptl);
+		return 0;
+	}
+
+process_smaller_pages:
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr < end && !ret; pte++, addr += PAGE_SIZE) {
+		dirty = check_soft_dirty(vma, addr, pte, IS_CLEAR_OP(p->flags));
+		if (IS_GET_OP(p->vec)) {
+			ret = add_to_out(dirty_vma || dirty, vma->vm_file, pte_present(*pte),
+					 is_swap_pte(*pte), p, addr, 1);
+			if (p->max_pages && (p->found_pages == p->max_pages))
+				break;
+		}
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
+				 struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned int len;
+	bool sd;
+
+	if (vma) {
+		/* Individual pages haven't been allocated and written */
+		sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
+		     (vma->vm_flags & VM_SOFTDIRTY);
+
+		len = (end - addr)/PAGE_SIZE;
+		if (p->max_pages && p->max_pages - p->found_pages < len)
+			len = p->max_pages - p->found_pages;
+
+		add_to_out(sd, vma->vm_file, false, false, p, addr, len);
+	}
+
+	return 0;
+}
+
+static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long end_cut = end;
+	int ret;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p->flags) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		if (vma->vm_start < start) {
+			ret = split_vma(vma->vm_mm, vma, start, 1);
+			if (ret)
+				return ret;
+		}
+		/* Calculate end_cut because of max_pages */
+		if (IS_GET_OP(p->vec) && p->max_pages)
+			end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
+
+		if (vma->vm_end > end_cut) {
+			ret = split_vma(vma->vm_mm, vma, end_cut, 0);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void pagemap_scan_post_vma(struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p->flags) &&
+	    (vma->vm_flags & VM_SOFTDIRTY)) {
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+		vma_set_page_prot(vma);
+	}
+}
+
+static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (IS_GET_OP(p->vec) && p->max_pages && (p->found_pages == p->max_pages))
+		return -1;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	return 0;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+	.test_walk = pagemap_scan_pmd_test_walk,
+	.pmd_entry = pagemap_scan_pmd_entry,
+	.pte_hole = pagemap_scan_pte_hole,
+
+	/* Only for clearing SD bit over VMAs */
+	.pre_vma = pagemap_scan_pre_vma,
+	.post_vma = pagemap_scan_post_vma,
+};
+
+static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
+{
+	struct mmu_notifier_range range;
+	unsigned long __user start, end;
+	struct pagemap_scan_private p;
+	int ret;
+
+	start = (unsigned long)untagged_addr(arg->start);
+	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
+		return -EINVAL;
+
+	if (IS_GET_OP(arg->vec) &&
+	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
+		return -ENOMEM;
+
+	if ((arg->flags & ~PAGEMAP_SD_FLAGS_MASK) || (arg->rmask & ~PAGEMAP_OP_MASK) ||
+	    (arg->amask & ~PAGEMAP_OP_MASK) || (arg->emask & ~PAGEMAP_OP_MASK) ||
+	    (arg->return_mask & ~PAGEMAP_OP_MASK))
+		return -EINVAL;
+
+	if ((!arg->rmask && !arg->amask && !arg->emask) || !arg->return_mask)
+		return -EINVAL;
+
+	if ((arg->flags & PAGEMAP_SD_FLAGS_MASK) && ((arg->rmask & PAGEMAP_NON_SD_MASK) ||
+	    (arg->amask & PAGEMAP_NON_SD_MASK)))
+		return -EINVAL;
+
+	end = start + arg->len;
+	p.max_pages = arg->max_pages;
+	p.found_pages = 0;
+	p.flags = arg->flags;
+	p.rmask = arg->rmask;
+	p.amask = arg->amask;
+	p.emask = arg->emask;
+	p.return_mask = arg->return_mask;
+	p.index = 0;
+	p.vec_len = arg->vec_len;
+
+	if (IS_GET_OP(arg->vec)) {
+		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
+		if (!p.vec)
+			return -ENOMEM;
+	} else {
+		p.vec = NULL;
+	}
+
+	if (IS_CLEAR_OP(arg->flags)) {
+		mmap_write_lock(mm);
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL, mm, start, end);
+		mmu_notifier_invalidate_range_start(&range);
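+		/*
+		 * Bracket the walk with MMU notifier invalidation and keep
+		 * a TLB flush pending while soft-dirty bits are cleared.
+		 */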
+		inc_tlb_flush_pending(mm);
+	} else {
+		mmap_read_lock(mm);
+	}
+
+	ret = walk_page_range(mm, start, end, &pagemap_scan_ops, &p);
+
+	if (IS_CLEAR_OP(arg->flags)) {
+		mmu_notifier_invalidate_range_end(&range);
+		dec_tlb_flush_pending(mm);
+
+		mmap_write_unlock(mm);
+	} else {
+		mmap_read_unlock(mm);
+	}
+
+	if (ret < 0)
+		goto free_data;
+
+	if (IS_GET_OP(arg->vec) && p.index) {
+		if (copy_to_user((struct page_region *)arg->vec, p.vec,
+				 p.index * sizeof(struct page_region))) {
+			ret = -EFAULT;
+			goto free_data;
+		}
+		ret = p.index;
+	} else {
+		ret = 0;
+	}
+
+free_data:
+	if (IS_GET_OP(arg->vec))
+		vfree(p.vec);
+
+	return ret;
+}
+
+static long pagemap_sd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pagemap_scan_arg __user *uarg = (struct pagemap_scan_arg __user *)arg;
+	struct mm_struct *mm = file->private_data;
+	struct pagemap_scan_arg argument;
+
+	if (cmd == PAGEMAP_SCAN) {
+		if (copy_from_user(&argument, uarg, sizeof(struct pagemap_scan_arg)))
+			return -EFAULT;
+		return do_pagemap_sd_cmd(mm, &argument);
+	}
+	return -EINVAL;
+}
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
 	.open		= pagemap_open,
 	.release	= pagemap_release,
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	.unlocked_ioctl = pagemap_sd_ioctl,
+	.compat_ioctl = pagemap_sd_ioctl,
+#endif /* CONFIG_MEM_SOFT_DIRTY */
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..5d6c0d85dac4 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,57 @@ typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
			 RWF_APPEND)
 
+/* PAGEMAP IOCTL */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_scan_arg */
+#define PAGE_IS_SD	(1 << 0)
+#define PAGE_IS_FILE	(1 << 1)
+#define PAGE_IS_PRESENT	(1 << 2)
+#define PAGE_IS_SWAPED	(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region
+ * @bitmap:	Bits set for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u32 bitmap;
+	__u32 __reserved;
+};
+
+/*
+ * struct pagemap_scan_arg - Soft-dirty IOCTL argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region (all the pages in this length are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional max return pages (it must be less than vec_len if specified)
+ * @flags:		Special flags for the IOCTL
+ * @rmask:		Required mask - all of these bits have to be set in the PTE
+ * @amask:		Any mask - any of these bits are set in the PTE
+ * @emask:		Exclude mask - none of these bits are set in the PTE
+ * @return_mask:	Bits that have to be reported to the user in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u32 rmask;
+	__u32 amask;
+	__u32 emask;
+	__u32 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SD_CLEAR		(1 << 0)
+/* Check the individual pages if they are soft-dirty to find dirty pages faster */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index b7b56871029c..5d6c0d85dac4 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -305,4 +305,57 @@ typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
			 RWF_APPEND)
 
+/* PAGEMAP IOCTL */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pagemap_scan_arg)
+
+/* Bits are set in the bitmap of the page_region and masks in pagemap_scan_arg */
+#define PAGE_IS_SD	(1 << 0)
+#define PAGE_IS_FILE	(1 << 1)
+#define PAGE_IS_PRESENT	(1 << 2)
+#define PAGE_IS_SWAPED	(1 << 3)
+
+/*
+ * struct page_region - Page region with bitmap flags
+ * @start:	Start of the region
+ * @len:	Length of the region
+ * @bitmap:	Bits set for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 len;
+	__u32 bitmap;
+	__u32 __reserved;
+};
+
+/*
+ * struct pagemap_scan_arg - Soft-dirty IOCTL argument
+ * @start:		Starting address of the region
+ * @len:		Length of the region (all the pages in this length are included)
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional max return pages (it must be less than vec_len if specified)
+ * @flags:		Special flags for the IOCTL
+ * @rmask:		Required mask - all of these bits have to be set in the PTE
+ * @amask:		Any mask - any of these bits are set in the PTE
+ * @emask:		Exclude mask - none of these bits are set in the PTE
+ * @return_mask:	Bits that have to be reported to the user in page_region
+ */
+struct pagemap_scan_arg {
+	__u64 start;
+	__u64 len;
+	__u64 vec;
+	__u64 vec_len;
+	__u32 max_pages;
+	__u32 flags;
+	__u32 rmask;
+	__u32 amask;
+	__u32 emask;
+	__u32 return_mask;
+};
+
+/* Special flags */
+#define PAGEMAP_SD_CLEAR		(1 << 0)
+/* Check the individual pages if they are soft-dirty to find dirty pages faster */
+#define PAGEMAP_NO_REUSED_REGIONS	(1 << 1)
+
 #endif /* _UAPI_LINUX_FS_H */
From patchwork Thu Nov 3 14:53:53 2022
X-Patchwork-Submitter: Muhammad Usama Anjum
X-Patchwork-Id: 13030117
From: Muhammad Usama Anjum
To: Andrei Vagin, Danylo Mocherniuk, Alexander Viro, Andrew Morton,
    Michał Mirosław, Suren Baghdasaryan, Greg KH, Christian Brauner,
    Peter Xu, Yang Shi, Vlastimil Babka, Zach O'Keefe,
    Matthew Wilcox (Oracle), Gustavo A. R. Silva, Dan Williams,
    Muhammad Usama Anjum, kernel@collabora.com, Gabriel Krisman Bertazi,
    David Hildenbrand, Peter Enderborg,
    open list : KERNEL SELFTEST FRAMEWORK, Shuah Khan, open list,
    open list : PROC FILESYSTEM, open list : MEMORY MANAGEMENT
Subject: [PATCH v5 3/3] selftests: vm: add pagemap ioctl tests
Date: Thu, 3 Nov 2022 19:53:53 +0500
Message-Id: <20221103145353.3049303-4-usama.anjum@collabora.com>
In-Reply-To: <20221103145353.3049303-1-usama.anjum@collabora.com>
References: <20221103145353.3049303-1-usama.anjum@collabora.com>
X-Mailing-List: linux-fsdevel@vger.kernel.org

Add pagemap ioctl tests. Add several different types of tests to judge
the correctness of the interface.
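The core pattern that the tests repeat, in minimal form (a sketch using
the selftest's own pagemap_ioctl() helper; mem is a mapped page that has
just been written):

	struct page_region region;

	mem[0]++;	/* dirty exactly one page */
	ret = pagemap_ioctl(mem, page_size, &region, 1, 0, 0,
			    PAGE_IS_SD, 0, 0, PAGE_IS_SD);
	/* expect ret == 1, region.start == (uintptr_t)mem, region.len == 1 */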
Signed-off-by: Muhammad Usama Anjum
---
Changes in v4:
- Updated all the tests to conform to the new IOCTL
Changes in v3:
- Add another test to do sanity check of flags
Changes in v2:
- Update the tests to use the ioctl interface instead of syscall

TAP version 13
1..59
ok 1 sanity_tests_sd wrong flag specified
ok 2 sanity_tests_sd wrong mask specified
ok 3 sanity_tests_sd wrong return mask specified
ok 4 sanity_tests_sd mixture of correct and wrong flag
ok 5 sanity_tests_sd Clear area with larger vec size
ok 6 sanity_tests_sd Repeated pattern of dirty and non-dirty pages
ok 7 sanity_tests_sd Repeated pattern of dirty and non-dirty pages in parts
ok 8 sanity_tests_sd Two regions
ok 9 Page testing: all new pages must be soft dirty
ok 10 Page testing: all pages must not be soft dirty
ok 11 Page testing: all pages dirty other than first and the last one
ok 12 Page testing: only middle page dirty
ok 13 Page testing: only two middle pages dirty
ok 14 Page testing: only get 2 dirty pages and clear them as well
ok 15 Page testing: Range clear only
ok 16 Large Page testing: all new pages must be soft dirty
ok 17 Large Page testing: all pages must not be soft dirty
ok 18 Large Page testing: all pages dirty other than first and the last one
ok 19 Large Page testing: only middle page dirty
ok 20 Large Page testing: only two middle pages dirty
ok 21 Large Page testing: only get 2 dirty pages and clear them as well
ok 22 Large Page testing: Range clear only
ok 23 Huge page testing: all new pages must be soft dirty
ok 24 Huge page testing: all pages must not be soft dirty
ok 25 Huge page testing: all pages dirty other than first and the last one
ok 26 Huge page testing: only middle page dirty
ok 27 Huge page testing: only two middle pages dirty
ok 28 Huge page testing: only get 2 dirty pages and clear them as well
ok 29 Huge page testing: Range clear only
ok 30 Performance Page testing: all new pages must be soft dirty
ok 31 Performance Page testing: all pages must not be soft dirty
ok 32 Performance Page testing: all pages dirty other than first and the last one
ok 33 Performance Page testing: only middle page dirty
ok 34 Performance Page testing: only two middle pages dirty
ok 35 Performance Page testing: only get 2 dirty pages and clear them as well
ok 36 Performance Page testing: Range clear only
ok 37 hpage_unit_tests all new huge page must be dirty
ok 38 hpage_unit_tests all the huge page must not be dirty
ok 39 hpage_unit_tests all the huge page must be dirty and clear
ok 40 hpage_unit_tests only middle page dirty
ok 41 hpage_unit_tests clear first half of huge page
ok 42 hpage_unit_tests clear first half of huge page with limited buffer
ok 43 hpage_unit_tests clear second half huge page
ok 44 unmapped_region_tests Get dirty pages
ok 45 unmapped_region_tests Get dirty pages
ok 46 Test test_simple
ok 47 sanity_tests clear op can only be specified with PAGE_IS_DIRTY
ok 48 sanity_tests rmask specified
ok 49 sanity_tests amask specified
ok 50 sanity_tests emask specified
ok 51 sanity_tests rmask and amask specified
ok 52 sanity_tests rmask and amask specified
ok 53 sanity_tests Get sd and present pages with amask
ok 54 sanity_tests Get all the pages with rmask
ok 55 sanity_tests Get sd and present pages with rmask and amask
ok 56 sanity_tests Don't get sd pages
ok 57 sanity_tests Don't get present pages
ok 58 sanity_tests Find dirty present pages with return mask
ok 59 sanity_tests Memory mapped file
# Totals: pass:59 fail:0 xfail:0 xpass:0 skip:0 error:0
---
 tools/testing/selftests/vm/.gitignore      |   1 +
tools/testing/selftests/vm/Makefile | 5 +- tools/testing/selftests/vm/pagemap_ioctl.c | 681 +++++++++++++++++++++ 3 files changed, 685 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/vm/pagemap_ioctl.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 8a536c731e3c..4a73983e3e58 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -17,6 +17,7 @@ mremap_dontunmap mremap_test on-fault-limit transhuge-stress +pagemap_ioctl protection_keys protection_keys_32 protection_keys_64 diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 0986bd60c19f..2325bcdb9fae 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -24,9 +24,8 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p # things despite using incorrect values such as an *occasionally* incomplete # LDLIBS. MAKEFLAGS += --no-builtin-rules - CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) -LDLIBS = -lrt -lpthread +LDLIBS = -lrt -lpthread -lm TEST_GEN_FILES = anon_cow TEST_GEN_FILES += compaction_test TEST_GEN_FILES += gup_test @@ -52,6 +51,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_PROGS += pagemap_ioctl TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests @@ -103,6 +103,7 @@ $(OUTPUT)/anon_cow: vm_util.c $(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/ksm_functional_tests: vm_util.c $(OUTPUT)/madv_populate: vm_util.c +$(OUTPUT)/pagemap_ioctl: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c $(OUTPUT)/userfaultfd: vm_util.c diff --git a/tools/testing/selftests/vm/pagemap_ioctl.c b/tools/testing/selftests/vm/pagemap_ioctl.c new file mode 100644 index 000000000000..c55a0efa39f5 --- /dev/null +++ b/tools/testing/selftests/vm/pagemap_ioctl.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include "vm_util.h" +#include "../kselftest.h" +#include +#include +#include +#include +#include + +#define PAGEMAP_OP_MASK (PAGE_IS_SD | PAGE_IS_FILE | \ + PAGE_IS_PRESENT | PAGE_IS_SWAPED) +#define TEST_ITERATIONS 10 +#define PAGEMAP "/proc/self/pagemap" +int pagemap_fd; + +static long pagemap_ioctl(void *start, int len, void *vec, int vec_len, int flag, + int max_pages, int rmask, int amask, int emask, int return_mask) +{ + struct pagemap_scan_arg arg; + int ret; + + arg.start = (uintptr_t)start; + arg.len = len; + arg.vec = (uintptr_t)vec; + arg.vec_len = vec_len; + arg.flags = flag; + arg.max_pages = max_pages; + arg.rmask = rmask; + arg.amask = amask; + arg.emask = emask; + arg.return_mask = return_mask; + + ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); + + return ret; +} + +int sanity_tests_sd(int page_size) +{ + char *mem, *m[2]; + int mem_size, vec_size, ret, ret2, i; + struct page_region *vec; + + /* 1. 
wrong operation */ + vec_size = 100; + mem_size = 10 * page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (!mem || !vec) + ksft_exit_fail_msg("error nomem\n"); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, -1, + 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD) < 0, + "%s wrong flag specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 8, + 0, 0x1111, 0, 0, PAGE_IS_SD) < 0, + "%s wrong mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_SD, 0, 0, 0x1000) < 0, + "%s wrong return mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, PAGEMAP_SD_CLEAR | 0x32, + 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD) < 0, + "%s mixture of correct and wrong flag\n", __func__); + + /* 2. Clear area with larger vec size */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + ksft_test_result(ret >= 0, "%s Clear area with larger vec size\n", __func__); + + /* 3. Repeated pattern of dirty and non-dirty pages */ + for (i = 0; i < mem_size; i += 2 * page_size) + mem[i]++; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == mem_size/(page_size * 2), + "%s Repeated pattern of dirty and non-dirty pages\n", __func__); + + /* 4. Repeated pattern of dirty and non-dirty pages in parts*/ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, mem_size/(page_size * 2) - 2, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret2 = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ksft_test_result((ret + ret2) == mem_size/(page_size * 2), + "%s Repeated pattern of dirty and non-dirty pages in parts\n", __func__); + + munmap(mem, mem_size); + + /* 5. 
Two regions */ + m[0] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (!m[0]) + ksft_exit_fail_msg("error nomem\n"); + m[1] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (!m[1]) + ksft_exit_fail_msg("error nomem\n"); + + ret = pagemap_ioctl(m[0], mem_size, NULL, 0, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(m[1], mem_size, vec, 1, 0, 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].len == mem_size/page_size, + "%s Two regions\n", __func__); + + munmap(m[0], mem_size); + munmap(m[1], mem_size); + + free(vec); + return 0; +} + +int base_tests(char *prefix, char *mem, int mem_size, int page_size, int skip, int flags) +{ + int vec_size, ret, dirty, dirty2; + struct page_region *vec, *vec2; + + if (skip) { + ksft_test_result_skip("%s all new pages must be soft dirty\n", prefix); + ksft_test_result_skip("%s all pages must not be soft dirty\n", prefix); + ksft_test_result_skip("%s all pages dirty other than first and the last one\n", + prefix); + ksft_test_result_skip("%s only middle page dirty\n", prefix); + ksft_test_result_skip("%s only two middle pages dirty\n", prefix); + ksft_test_result_skip("%s only get 2 dirty pages and clear them as well\n", prefix); + ksft_test_result_skip("%s Range clear only\n", prefix); + return 0; + } + + vec_size = mem_size/page_size; + vec = malloc(sizeof(struct page_region) * vec_size); + vec2 = malloc(sizeof(struct page_region) * vec_size); + + /* 1. all new pages must be soft dirty if PAGEMAP_NO_REUSED_REGIONS isn't used */ + dirty = pagemap_ioctl(mem, mem_size, vec, 1, flags | PAGEMAP_SD_CLEAR, vec_size - 2, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + dirty2 = pagemap_ioctl(mem, mem_size, vec2, 1, flags | PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty2, errno, strerror(errno)); + + if (flags != PAGEMAP_NO_REUSED_REGIONS) + ksft_test_result(dirty == 1 && vec[0].start == (unsigned long)mem && + vec[0].len == vec_size - 2 && vec[0].bitmap == PAGE_IS_SD && + dirty2 == 1 && + vec2[0].start == (unsigned long)(mem + mem_size - (2 * page_size)) + && vec2[0].len == 2 && vec[0].bitmap == PAGE_IS_SD, + "%s all new pages must be soft dirty\n", prefix); + else + ksft_test_result(dirty == 0 && dirty2 == 0, + "%s all new pages must be soft dirty\n", prefix); + + // 2. all pages must not be soft dirty + dirty = pagemap_ioctl(mem, mem_size, vec, 1, flags, 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + ksft_test_result(dirty == 0, "%s all pages must not be soft dirty\n", prefix); + + // 3. all pages dirty other than first and the last one + memset(mem + page_size, -1, mem_size - (2 * page_size)); + + dirty = pagemap_ioctl(mem, mem_size, vec, 1, flags, 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + ksft_test_result(dirty == 1 && vec[0].len >= vec_size - 2 && vec[0].len <= vec_size, + "%s all pages dirty other than first and the last one\n", prefix); + + // 4. 
only middle page dirty + clear_softdirty(); + mem[vec_size/2 * page_size]++; + + dirty = pagemap_ioctl(mem, mem_size, vec, vec_size, flags, 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + ksft_test_result(vec[0].start == (uintptr_t)(mem + vec_size/2 * page_size), + "%s only middle page dirty\n", prefix); + + // 5. only two middle pages dirty and walk over only middle pages + clear_softdirty(); + mem[vec_size/2 * page_size]++; + mem[(vec_size/2 + 1) * page_size]++; + + dirty = pagemap_ioctl(&mem[vec_size/2 * page_size], 2 * page_size, vec, 1, flags, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + ksft_test_result(dirty == 1 && vec[0].start == (uintptr_t)(&mem[vec_size/2 * page_size]) && + vec[0].len == 2, + "%s only two middle pages dirty\n", prefix); + + /* 6. only get 2 dirty pages and clear them as well */ + memset(mem, -1, mem_size); + + /* get and clear second and third pages */ + ret = pagemap_ioctl(mem + page_size, 2 * page_size, vec, 1, flags | PAGEMAP_SD_CLEAR, 2, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + dirty = pagemap_ioctl(mem, mem_size, vec2, vec_size, flags, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].len == 2 && + vec[0].start == (uintptr_t)(mem + page_size) && + dirty == 2 && vec2[0].len == 1 && vec2[0].start == (uintptr_t)mem && + vec2[1].len == vec_size - 3 && + vec2[1].start == (uintptr_t)(mem + 3 * page_size), + "%s only get 2 dirty pages and clear them as well\n", prefix); + + /* 7. Range clear only */ + memset(mem, -1, mem_size); + + dirty = pagemap_ioctl(mem, mem_size, NULL, 0, flags | PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + dirty2 = pagemap_ioctl(mem, mem_size, vec, vec_size, flags, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty2, errno, strerror(errno)); + + ksft_test_result(dirty == 0 && dirty2 == 0, "%s Range clear only\n", + prefix); + + free(vec); + free(vec2); + return 0; +} + +void *gethugepage(int map_size) +{ + int ret; + char *map; + size_t hpage_len = read_pmd_pagesize(); + + map = memalign(hpage_len, map_size); + if (!map) + ksft_exit_fail_msg("memalign failed %d %s\n", errno, strerror(errno)); + + ret = madvise(map, map_size, MADV_HUGEPAGE); + if (ret) + ksft_exit_fail_msg("madvise failed %d %d %s\n", ret, errno, strerror(errno)); + + memset(map, 0, map_size); + + if (check_huge_anon(map, map_size/hpage_len, hpage_len)) + return map; + + free(map); + return NULL; + +} + +int hpage_unit_tests(int page_size) +{ + char *map; + int ret; + size_t hpage_len = read_pmd_pagesize(); + size_t num_pages = 10; + int map_size = hpage_len * num_pages; + int vec_size = map_size/page_size; + struct page_region *vec, *vec2; + + vec = malloc(sizeof(struct page_region) * vec_size); + vec2 = malloc(sizeof(struct page_region) * vec_size); + if (!vec || !vec2) + ksft_exit_fail_msg("malloc failed\n"); + + map = gethugepage(map_size); + if (map) { + // 1. 
all new huge page must be dirty + ret = pagemap_ioctl(map, map_size, vec, vec_size, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].start == (uintptr_t)map && + vec[0].len == vec_size && vec[0].bitmap == PAGE_IS_SD, + "%s all new huge page must be dirty\n", __func__); + + // 2. all the huge page must not be dirty + ret = pagemap_ioctl(map, map_size, vec, vec_size, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 0, "%s all the huge page must not be dirty\n", __func__); + + // 3. all the huge page must be dirty and clear dirty as well + memset(map, -1, map_size); + ret = pagemap_ioctl(map, map_size, vec, vec_size, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].start == (uintptr_t)map && + vec[0].len == vec_size && vec[0].bitmap == PAGE_IS_SD, + "%s all the huge page must be dirty and clear\n", __func__); + + // 4. only middle page dirty + free(map); + map = gethugepage(map_size); + clear_softdirty(); + map[vec_size/2 * page_size]++; + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].len > 0, + "%s only middle page dirty\n", __func__); + + free(map); + } else { + ksft_test_result_skip("all new huge page must be dirty\n"); + ksft_test_result_skip("all the huge page must not be dirty\n"); + ksft_test_result_skip("all the huge page must be dirty and clear\n"); + ksft_test_result_skip("only middle page dirty\n"); + } + + // 5. clear first half of huge page + map = gethugepage(map_size); + if (map) { + ret = pagemap_ioctl(map, map_size/2, NULL, 0, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].len == vec_size/2 && + vec[0].start == (uintptr_t)(map + map_size/2), + "%s clear first half of huge page\n", __func__); + free(map); + } else { + ksft_test_result_skip("clear first half of huge page\n"); + } + + // 6. clear first half of huge page with limited buffer + map = gethugepage(map_size); + if (map) { + ret = pagemap_ioctl(map, map_size, vec, vec_size, PAGEMAP_SD_CLEAR, vec_size/2, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].len == vec_size/2 && + vec[0].start == (uintptr_t)(map + map_size/2), + "%s clear first half of huge page with limited buffer\n", + __func__); + + free(map); + } else { + ksft_test_result_skip("clear first half of huge page with limited buffer\n"); + } + + // 7. 
clear second half of huge page + map = gethugepage(map_size); + if (map) { + memset(map, -1, map_size); + ret = pagemap_ioctl(map + map_size/2, map_size, NULL, 0, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].len == vec_size/2, + "%s clear second half huge page\n", __func__); + free(map); + } else { + ksft_test_result_skip("clear second half huge page\n"); + } + + free(vec); + free(vec2); + return 0; +} + +int unmapped_region_tests(int page_size) +{ + void *start = (void *)0x10000000; + int dirty, len = 0x00040000; + int vec_size = len / page_size; + struct page_region *vec = malloc(sizeof(struct page_region) * vec_size); + + /* 1. Get dirty pages */ + dirty = pagemap_ioctl(start, len, vec, vec_size, 0, 0, PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + ksft_test_result(dirty >= 0, "%s Get dirty pages\n", __func__); + + /* 2. Clear dirty bit of whole address space */ + dirty = pagemap_ioctl(0, 0x7FFFFFFF, NULL, 0, PAGEMAP_SD_CLEAR, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD); + if (dirty < 0) + ksft_exit_fail_msg("error %d %d %s\n", dirty, errno, strerror(errno)); + + ksft_test_result(dirty == 0, "%s Get dirty pages\n", __func__); + + free(vec); + return 0; +} + +static void test_simple(int page_size) +{ + int i; + char *map; + struct page_region vec; + + map = aligned_alloc(page_size, page_size); + if (!map) + ksft_exit_fail_msg("mmap failed\n"); + + clear_softdirty(); + + for (i = 0 ; i < TEST_ITERATIONS; i++) { + if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0, + PAGE_IS_SD, 0, 0, PAGE_IS_SD) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + + clear_softdirty(); + } + free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +int sanity_tests(int page_size) +{ + char *mem, *fmem; + int mem_size, vec_size, ret; + struct page_region *vec; + + /* 1. 
wrong operation */ + mem_size = 10 * page_size; + vec_size = mem_size / page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (!mem || !vec) + ksft_exit_fail_msg("error nomem\n"); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PAGEMAP_SD_CLEAR | PAGEMAP_NO_REUSED_REGIONS, 0, + PAGEMAP_OP_MASK, 0, 0, PAGEMAP_OP_MASK) < 0, + "%s clear op can only be specified with PAGE_IS_DIRTY\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_OP_MASK, 0, 0, PAGEMAP_OP_MASK) >= 0, + "%s rmask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, PAGEMAP_OP_MASK, 0, PAGEMAP_OP_MASK) >= 0, + "%s amask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, 0, PAGEMAP_OP_MASK, PAGEMAP_OP_MASK) >= 0, + "%s emask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_OP_MASK, PAGEMAP_OP_MASK, 0, PAGEMAP_OP_MASK) >= 0, + "%s rmask and amask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, PAGEMAP_SD_CLEAR, 0, + 0, 0, PAGEMAP_OP_MASK, PAGEMAP_OP_MASK) >= 0, + "%s rmask and amask specified\n", __func__); + munmap(mem, mem_size); + + /* 2. Get sd and present pages with amask */ + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (!mem) + ksft_exit_fail_msg("error nomem\n"); + memset(mem, 0, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, PAGEMAP_OP_MASK, 0, PAGEMAP_OP_MASK); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && vec[0].len == vec_size && + vec[0].bitmap == (PAGE_IS_SD | PAGE_IS_PRESENT), + "%s Get sd and present pages with amask\n", __func__); + + /* 3. Get sd and present pages with rmask */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_OP_MASK, 0, 0, PAGEMAP_OP_MASK); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && vec[0].len == vec_size && + vec[0].bitmap == (PAGE_IS_SD | PAGE_IS_PRESENT), + "%s Get all the pages with rmask\n", __func__); + + /* 4. Get sd and present pages with rmask and amask */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGE_IS_SD, PAGE_IS_PRESENT, 0, PAGEMAP_OP_MASK); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && vec[0].len == vec_size && + vec[0].bitmap == (PAGE_IS_SD | PAGE_IS_PRESENT), + "%s Get sd and present pages with rmask and amask\n", __func__); + + /* 5. Don't get sd pages */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, 0, PAGE_IS_SD, PAGEMAP_OP_MASK); + ksft_test_result(ret == 0, "%s Don't get sd pages\n", __func__); + + /* 6. Don't get present pages */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, 0, PAGE_IS_PRESENT, PAGEMAP_OP_MASK); + ksft_test_result(ret == 0, "%s Don't get present pages\n", __func__); + + munmap(mem, mem_size); + + /* 8. Find dirty present pages with return mask */ + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (!mem) + ksft_exit_fail_msg("error nomem\n"); + memset(mem, 0, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, PAGEMAP_OP_MASK, 0, PAGE_IS_SD); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && vec[0].len == vec_size && + vec[0].bitmap == PAGE_IS_SD, + "%s Find dirty present pages with return mask\n", __func__); + + /* 9. 
Memory mapped file */
+	int fd;
+	struct stat sbuf;
+
+	fd = open("run_vmtests.sh", O_RDONLY);
+	if (fd < 0) {
+		ksft_test_result_skip("%s Memory mapped file\n", __func__);
+		goto free_vec_and_return;
+	}
+
+	ret = stat("run_vmtests.sh", &sbuf);
+	if (ret < 0)
+		ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+	fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_SHARED, fd, 0);
+	if (!fmem)
+		ksft_exit_fail_msg("error nomem\n");
+
+	ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0,
+			    0, PAGEMAP_OP_MASK, 0, PAGEMAP_OP_MASK);
+
+	ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem &&
+			 vec[0].len == ceilf((float)sbuf.st_size/page_size) &&
+			 vec[0].bitmap == (PAGE_IS_SD | PAGE_IS_FILE),
+			 "%s Memory mapped file\n", __func__);
+
+	munmap(fmem, sbuf.st_size);
+
+free_vec_and_return:
+	free(vec);
+	return 0;
+}
+
+int main(void)
+{
+	int page_size = getpagesize();
+	size_t hpage_len = read_pmd_pagesize();
+	char *mem, *map;
+	int mem_size;
+
+	ksft_print_header();
+	ksft_set_plan(59);
+
+	pagemap_fd = open(PAGEMAP, O_RDWR);
+	if (pagemap_fd < 0)
+		return -EINVAL;
+
+	/*
+	 * Soft-dirty PTE bit tests
+	 */
+
+	/* 1. Sanity testing */
+	sanity_tests_sd(page_size);
+
+	/* 2. Normal page testing */
+	mem_size = 10 * page_size;
+	mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (!mem)
+		ksft_exit_fail_msg("error nomem\n");
+
+	base_tests("Page testing:", mem, mem_size, page_size, 0, 0);
+
+	munmap(mem, mem_size);
+
+	/* 3. Large page testing */
+	mem_size = 512 * 10 * page_size;
+	mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (!mem)
+		ksft_exit_fail_msg("error nomem\n");
+
+	base_tests("Large Page testing:", mem, mem_size, page_size, 0, 0);
+
+	munmap(mem, mem_size);
+
+	/* 4. Huge page testing */
+	map = gethugepage(hpage_len);
+	if (map)
+		base_tests("Huge page testing:", map, hpage_len, page_size, 0, 0);
+	else
+		base_tests("Huge page testing:", NULL, 0, 0, 1, 0);
+
+	free(map);
+
+	/* 5. Performance page testing */
+	mem_size = 10 * page_size;
+	mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (!mem)
+		ksft_exit_fail_msg("error nomem\n");
+
+	base_tests("Performance Page testing:", mem, mem_size, page_size, 0,
+		   PAGEMAP_NO_REUSED_REGIONS);
+
+	munmap(mem, mem_size);
+
+	/* 6. Huge page tests */
+	hpage_unit_tests(page_size);
+
+	/* 7. Unmapped address test */
+	unmapped_region_tests(page_size);
+
+	/* 8. Iterative test */
+	test_simple(page_size);
+
+	/*
+	 * Other PTE bit tests
+	 */
+
+	/* 1. Sanity testing */
+	sanity_tests(page_size);
+
+	close(pagemap_fd);
+	return ksft_exit_pass();
+}
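As a usage note, the new test builds and runs like the other vm selftests
(standard kselftest invocation; the clear operation typically needs root):

	$ make -C tools/testing/selftests TARGETS=vm
	$ sudo ./tools/testing/selftests/vm/pagemap_ioctl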