From patchwork Wed Dec 22 12:34:00 2021
From: Peng Liang <liangpeng10@huawei.com>
Subject: [RFC 1/1] memfd: Support mapping to zero page on reading
Date: Wed, 22 Dec 2021 20:34:00 +0800
Message-ID: <20211222123400.1659635-2-liangpeng10@huawei.com>
In-Reply-To: <20211222123400.1659635-1-liangpeng10@huawei.com>
References: <20211222123400.1659635-1-liangpeng10@huawei.com>

When a memfd is created with the new MFD_ZEROPAGE flag, map the shared
zero page on a read fault instead of allocating a new page, and only
allocate a real page (copy-on-write) when a write fault occurs later.
This avoids allocating memory for pages that are only ever read.
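As a usage illustration (not part of this patch), a minimal userspace
sketch of the intended behaviour follows. MFD_ZEROPAGE is defined
locally because this series is not merged, so no installed uapi header
provides it; the value 0x0008U comes from the memfd.h hunk below.
memfd_create() is declared in <sys/mman.h> with glibc 2.27 and later.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MFD_ZEROPAGE
#define MFD_ZEROPAGE 0x0008U	/* value proposed by this patch */
#endif

int main(void)
{
	int fd = memfd_create("zeropage-demo", MFD_ZEROPAGE);
	char *p;

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Read fault: with this patch the kernel maps the shared zero
	 * page read-only instead of allocating a shmem page. */
	printf("first byte: %d\n", p[0]);

	/* Write fault: do_wp_page() finds the zero-page PTE and takes
	 * the new wp_zero_shared() path, which allocates a real page. */
	p[0] = 1;

	munmap(p, 4096);
	close(fd);
	return 0;
}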
Signed-off-by: Peng Liang <liangpeng10@huawei.com>
---
 include/linux/fs.h         |  2 ++
 include/uapi/linux/memfd.h |  1 +
 mm/memfd.c                 |  8 ++++++--
 mm/memory.c                | 37 ++++++++++++++++++++++++++++++++++---
 mm/shmem.c                 | 10 ++++++++--
 5 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..404c0c26ba98 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2249,6 +2249,7 @@ struct super_operations {
 #define S_ENCRYPTED	(1 << 14) /* Encrypted file (using fs/crypto/) */
 #define S_CASEFOLD	(1 << 15) /* Casefolded file */
 #define S_VERITY	(1 << 16) /* Verity file (using fs/verity/) */
+#define S_ZEROPAGE	(1 << 17) /* Map zero page on read, COW on write */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2291,6 +2292,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_ENCRYPTED(inode)	((inode)->i_flags & S_ENCRYPTED)
 #define IS_CASEFOLDED(inode)	((inode)->i_flags & S_CASEFOLD)
 #define IS_VERITY(inode)	((inode)->i_flags & S_VERITY)
+#define IS_ZEROPAGE(inode)	((inode)->i_flags & S_ZEROPAGE)
 
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 7a8a26751c23..2bfac06f53fb 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -8,6 +8,7 @@
 #define MFD_CLOEXEC		0x0001U
 #define MFD_ALLOW_SEALING	0x0002U
 #define MFD_HUGETLB		0x0004U
+#define MFD_ZEROPAGE		0x0008U
 
 /*
  * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/mm/memfd.c b/mm/memfd.c
index 9f80f162791a..5c167b2de9ae 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -245,7 +245,7 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_ZEROPAGE)
 
 SYSCALL_DEFINE2(memfd_create,
 		const char __user *, uname,
@@ -301,8 +301,12 @@ SYSCALL_DEFINE2(memfd_create,
 					HUGETLB_ANONHUGE_INODE,
 					(flags >> MFD_HUGE_SHIFT) &
 					MFD_HUGE_MASK);
-	} else
+	} else {
 		file = shmem_file_setup(name, 0, VM_NORESERVE);
+		if (!IS_ERR(file) && (flags & MFD_ZEROPAGE)) {
+			file_inode(file)->i_flags |= S_ZEROPAGE;
+		}
+	}
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_fd;
diff --git a/mm/memory.c b/mm/memory.c
index 8f1de811a1dc..360606964a7d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3208,6 +3208,26 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 	return ret;
 }
 
+static vm_fault_t do_shared_fault(struct vm_fault *vmf);
+
+static vm_fault_t wp_zero_shared(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct mmu_notifier_range range;
+	vm_fault_t ret;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				vmf->address & PAGE_MASK,
+				(vmf->address & PAGE_MASK) + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	ret = do_shared_fault(vmf);
+	mmu_notifier_invalidate_range_only_end(&range);
+	return ret;
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -3254,8 +3274,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-				     (VM_WRITE|VM_SHARED))
-			return wp_pfn_shared(vmf);
+				     (VM_WRITE|VM_SHARED)) {
+			if (unlikely(vma->vm_file &&
+				     IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+				     is_zero_pfn(pte_pfn(*vmf->pte)))) {
+				return wp_zero_shared(vmf);
+			} else {
+				return wp_pfn_shared(vmf);
+			}
+		}
 
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		return wp_page_copy(vmf);
@@ -3970,12 +3997,16 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	else if (unlikely(vma->vm_file && IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+			  is_zero_pfn(page_to_pfn(page))))
+		entry = pte_mkspecial(pte_wrprotect(entry));
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, addr, false);
 		lru_cache_add_inactive_or_unevictable(page, vma);
-	} else {
+	} else if (likely(!vma->vm_file || !IS_ZEROPAGE(file_inode(vma->vm_file)) ||
+			  !is_zero_pfn(page_to_pfn(page)))) {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, false);
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..f4b23124826d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1899,8 +1899,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
 		if (IS_ERR(page)) {
 alloc_nohuge:
-			page = shmem_alloc_and_acct_page(gfp, inode,
-							 index, false);
+			if (IS_ZEROPAGE(inode) && vmf &&
+			    !(vmf->flags & FAULT_FLAG_WRITE)) {
+				page = ZERO_PAGE(0);
+				goto out;
+			} else {
+				page = shmem_alloc_and_acct_page(gfp, inode,
+								 index, false);
+			}
 		}
 		if (IS_ERR(page)) {
 			int retry = 5;
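For reviewers who want to see the memory-saving effect, here is a rough
test sketch (not part of the patch): it compares the process's resident
page count after touching the mapping read-only with the count after
writing it. resident_pages() is a local helper reading
/proc/self/statm, and exact RSS accounting of the shared zero page may
vary with kernel version and config.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MFD_ZEROPAGE
#define MFD_ZEROPAGE 0x0008U	/* value proposed by this patch */
#endif

static long resident_pages(void)
{
	long size, resident = -1;
	FILE *f = fopen("/proc/self/statm", "r");

	if (f) {
		if (fscanf(f, "%ld %ld", &size, &resident) != 2)
			resident = -1;
		fclose(f);
	}
	return resident;
}

int main(void)
{
	const size_t len = 64 << 20;	/* 64 MiB */
	int fd = memfd_create("zp-test", MFD_ZEROPAGE);
	volatile char sum = 0;
	char *p;
	size_t i;

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	long before = resident_pages();
	for (i = 0; i < len; i += 4096)
		sum += p[i];	/* read faults: zero page, ~no RSS growth */
	long after_read = resident_pages();
	for (i = 0; i < len; i += 4096)
		p[i] = 1;	/* write faults: COW allocates real pages */
	long after_write = resident_pages();

	printf("resident pages: before=%ld after_read=%ld after_write=%ld\n",
	       before, after_read, after_write);
	return 0;
}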