@@ -2249,6 +2249,7 @@ struct super_operations {
#define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
+#define S_ZEROPAGE	(1 << 17) /* Reads map the zero page (memfd) */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2291,6 +2292,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode) ((inode)->i_flags & S_VERITY)
+#define IS_ZEROPAGE(inode) ((inode)->i_flags & S_ZEROPAGE)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
@@ -8,6 +8,7 @@
#define MFD_CLOEXEC 0x0001U
#define MFD_ALLOW_SEALING 0x0002U
#define MFD_HUGETLB 0x0004U
+#define MFD_ZEROPAGE 0x0008U
/*
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
@@ -245,7 +245,7 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_ZEROPAGE)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
@@ -301,8 +301,12 @@ SYSCALL_DEFINE2(memfd_create,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
-	} else
+	} else {
 		file = shmem_file_setup(name, 0, VM_NORESERVE);
+		/* Only tag the inode once shmem setup has succeeded. */
+		if (!IS_ERR(file) && (flags & MFD_ZEROPAGE))
+			file_inode(file)->i_flags |= S_ZEROPAGE;
+	}
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_fd;
@@ -3208,6 +3208,35 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
return ret;
}
+static vm_fault_t do_shared_fault(struct vm_fault *vmf);
+
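+/*
+ * Break COW on the shared zero page: zap the read-only zero-page PTE
+ * and re-fault so that do_shared_fault() installs a freshly allocated,
+ * writable shmem page in its place.
+ */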
+static vm_fault_t wp_zero_shared(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+ vm_fault_t ret;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ ret = do_shared_fault(vmf);
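+	/*
+	 * ptep_clear_flush_notify() already invalidated this range, so
+	 * _only_end() avoids notifying secondary MMUs a second time.
+	 */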
+ mmu_notifier_invalidate_range_only_end(&range);
+ return ret;
+}
+
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -3254,8 +3283,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-				     (VM_WRITE|VM_SHARED))
-			return wp_pfn_shared(vmf);
+				     (VM_WRITE|VM_SHARED)) {
+			/* MFD_ZEROPAGE may have mapped the zero page: COW it. */
+			if (unlikely(vma->vm_file &&
+				     IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+				     is_zero_pfn(pte_pfn(*vmf->pte))))
+				return wp_zero_shared(vmf);
+			return wp_pfn_shared(vmf);
+		}
pte_unmap_unlock(vmf->pte, vmf->ptl);
return wp_page_copy(vmf);
@@ -3970,12 +4005,17 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	else if (unlikely(vma->vm_file && IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+			  is_zero_pfn(page_to_pfn(page))))
+		/* Read fault on a zero-page memfd: map it read-only, special. */
+		entry = pte_mkspecial(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
lru_cache_add_inactive_or_unevictable(page, vma);
- } else {
+	} else if (likely(!vma->vm_file || !IS_ZEROPAGE(file_inode(vma->vm_file)) ||
+			  !is_zero_pfn(page_to_pfn(page)))) {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
@@ -1899,8 +1899,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
alloc_nohuge:
- page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
+			/* Back read faults with the shared zero page. */
+			if (IS_ZEROPAGE(inode) && vmf &&
+			    !(vmf->flags & FAULT_FLAG_WRITE)) {
+				page = ZERO_PAGE(0);
+				goto out;
+			}
+			page = shmem_alloc_and_acct_page(gfp, inode,
+							 index, false);
}
if (IS_ERR(page)) {
int retry = 5;
Map the shared zero page when reading an MFD_ZEROPAGE memfd, and COW it
into a freshly allocated shmem page when a write occurs.

Signed-off-by: Peng Liang <liangpeng10@huawei.com>
---
 include/linux/fs.h         |  2 ++
 include/uapi/linux/memfd.h |  1 +
 mm/memfd.c                 |  8 ++++++--
 mm/memory.c                | 46 +++++++++++++++++++++++++++++++++++++++++++---
 mm/shmem.c                 | 10 ++++++++--
 5 files changed, 60 insertions(+), 7 deletions(-)
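Not part of the patch: a minimal userspace sketch of the intended usage,
assuming a kernel with this change applied. MFD_ZEROPAGE is defined
locally because released uapi headers do not carry it; the memfd name
"zp-demo" and the 4096-byte size are arbitrary.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MFD_ZEROPAGE
#define MFD_ZEROPAGE 0x0008U	/* value proposed by this patch */
#endif

int main(void)
{
	int fd = memfd_create("zp-demo", MFD_ZEROPAGE);
	char *p;

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (ftruncate(fd, 4096) < 0) {
		perror("ftruncate");
		return 1;
	}
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Read fault: should map the shared zero page, no shmem allocation. */
	printf("before write: %d\n", p[0]);

	/* Write fault: should COW into a freshly allocated shmem page. */
	p[0] = 42;
	printf("after write:  %d\n", p[0]);

	munmap(p, 4096);
	close(fd);
	return 0;
}

With the patch applied, the first printf should be satisfied entirely by
the shared zero page (no shmem page is accounted until the write), while
the store to p[0] takes the wp_zero_shared() path above.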