diff mbox series

[v3,2/2] mm,thp: Add experimental config option RO_EXEC_FILEMAP_HUGE_FAULT_THP

Message ID 20190731082513.16957-3-william.kucharski@oracle.com (mailing list archive)
State New, archived
Headers show
Series mm,thp: Add filemap_huge_fault() for THP | expand

Commit Message

William Kucharski July 31, 2019, 8:25 a.m. UTC
Add filemap_huge_fault() to attempt to satisfy page
faults on memory-mapped read-only text pages using THP when possible.

Signed-off-by: William Kucharski <william.kucharski@oracle.com>
---
 include/linux/huge_mm.h |  16 ++-
 include/linux/mm.h      |   6 +
 mm/Kconfig              |  15 ++
 mm/filemap.c            | 300 +++++++++++++++++++++++++++++++++++++++-
 mm/huge_memory.c        |   3 +
 mm/mmap.c               |  36 ++++-
 mm/rmap.c               |   8 ++
 7 files changed, 374 insertions(+), 10 deletions(-)

Comments

Kirill A . Shutemov Aug. 1, 2019, 12:36 p.m. UTC | #1
On Wed, Jul 31, 2019 at 02:25:13AM -0600, William Kucharski wrote:
> Add filemap_huge_fault() to attempt to satisfy page
> faults on memory-mapped read-only text pages using THP when possible.
> 
> Signed-off-by: William Kucharski <william.kucharski@oracle.com>
> ---
>  include/linux/huge_mm.h |  16 ++-
>  include/linux/mm.h      |   6 +
>  mm/Kconfig              |  15 ++
>  mm/filemap.c            | 300 +++++++++++++++++++++++++++++++++++++++-
>  mm/huge_memory.c        |   3 +
>  mm/mmap.c               |  36 ++++-
>  mm/rmap.c               |   8 ++
>  7 files changed, 374 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 45ede62aa85b..b1e5fd3179fd 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -79,13 +79,15 @@ extern struct kobj_attribute shmem_enabled_attr;
>  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -#define HPAGE_PMD_SHIFT PMD_SHIFT
> -#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
> -#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
> -
> -#define HPAGE_PUD_SHIFT PUD_SHIFT
> -#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
> -#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
> +#define HPAGE_PMD_SHIFT		PMD_SHIFT
> +#define HPAGE_PMD_SIZE		((1UL) << HPAGE_PMD_SHIFT)
> +#define HPAGE_PMD_OFFSET	(HPAGE_PMD_SIZE - 1)
> +#define HPAGE_PMD_MASK		(~(HPAGE_PMD_OFFSET))
> +
> +#define HPAGE_PUD_SHIFT		PUD_SHIFT
> +#define HPAGE_PUD_SIZE		((1UL) << HPAGE_PUD_SHIFT)
> +#define HPAGE_PUD_OFFSET	(HPAGE_PUD_SIZE - 1)
> +#define HPAGE_PUD_MASK		(~(HPAGE_PUD_OFFSET))

OFFSET vs MASK semantics can be confusing without reading the definition.
We don't have anything similar for base page size, right (PAGE_OFFSET is
completely different thing :P)?

>  
>  extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
>  
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0334ca97c584..ba24b515468a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2433,6 +2433,12 @@ extern void truncate_inode_pages_final(struct address_space *);
>  
>  /* generic vm_area_ops exported for stackable file systems */
>  extern vm_fault_t filemap_fault(struct vm_fault *vmf);
> +
> +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> +extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
> +			enum page_entry_size pe_size);
> +#endif
> +

No need for #ifdef here.

>  extern void filemap_map_pages(struct vm_fault *vmf,
>  		pgoff_t start_pgoff, pgoff_t end_pgoff);
>  extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 56cec636a1fc..2debaded0e4d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -736,4 +736,19 @@ config ARCH_HAS_PTE_SPECIAL
>  config ARCH_HAS_HUGEPD
>  	bool
>  
> +config RO_EXEC_FILEMAP_HUGE_FAULT_THP
> +	bool "read-only exec filemap_huge_fault THP support (EXPERIMENTAL)"
> +	depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
> +
> +	help
> +	    Introduce filemap_huge_fault() to automatically map executable
> +	    read-only pages of mapped files of suitable size and alignment
> +	    using THP if possible.
> +
> +	    This is marked experimental because it is a new feature and is
> +	    dependent upon filesystmes implementing readpages() in a way
> +	    that will recognize large THP pages and read file content to
> +	    them without polluting the pagecache with PAGESIZE pages due
> +	    to readahead.
> +
>  endmenu
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 38b46fc00855..db1d8df20367 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -199,6 +199,8 @@ static void unaccount_page_cache_page(struct address_space *mapping,
>  	nr = hpage_nr_pages(page);
>  
>  	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
> +
> +#ifndef	CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
>  	if (PageSwapBacked(page)) {
>  		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
>  		if (PageTransHuge(page))
> @@ -206,6 +208,13 @@ static void unaccount_page_cache_page(struct address_space *mapping,
>  	} else {
>  		VM_BUG_ON_PAGE(PageTransHuge(page), page);
>  	}
> +#else
> +	if (PageSwapBacked(page))
> +		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
> +
> +	if (PageTransHuge(page))
> +		__dec_node_page_state(page, NR_SHMEM_THPS);
> +#endif

Again, no need for #ifdef: the new definition should be fine for
everybody.

>  	/*
>  	 * At this point page must be either written or cleaned by
> @@ -1663,7 +1672,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
>  no_page:
>  	if (!page && (fgp_flags & FGP_CREAT)) {
>  		int err;
> -		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
> +		if ((fgp_flags & FGP_WRITE) &&
> +			mapping_cap_account_dirty(mapping))
>  			gfp_mask |= __GFP_WRITE;
>  		if (fgp_flags & FGP_NOFS)
>  			gfp_mask &= ~__GFP_FS;
> @@ -2643,6 +2653,291 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>  }
>  EXPORT_SYMBOL(filemap_fault);
>  
> +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> +/*
> + * Check for an entry in the page cache which would conflict with the address
> + * range we wish to map using a THP or is otherwise unusable to map a large
> + * cached page.
> + *
> + * The routine will return true if a usable page is found in the page cache
> + * (and *pagep will be set to the address of the cached page), or if no
> + * cached page is found (and *pagep will be set to NULL).
> + */
> +static bool
> +filemap_huge_check_pagecache_usable(struct xa_state *xas,
> +	struct page **pagep, pgoff_t hindex, pgoff_t hindex_max)
> +{
> +	struct page *page;
> +
> +	while (1) {
> +		page = xas_find(xas, hindex_max);
> +
> +		if (xas_retry(xas, page)) {
> +			xas_set(xas, hindex);
> +			continue;
> +		}
> +
> +		/*
> +		 * A found entry is unusable if:
> +		 *	+ the entry is an Xarray value, not a pointer
> +		 *	+ the entry is an internal Xarray node
> +		 *	+ the entry is not a Transparent Huge Page
> +		 *	+ the entry is not a compound page

PageCompound() and PageTransCompound() are the same thing if THP is
enabled at compile time.

PageHuge() check here is looking out of place. I don't thing we can ever
will see hugetlb pages here.

> +		 *	+ the entry is not the head of a compound page
> +		 *	+ the enbry is a page page with an order other than

Typo.

> +		 *	  HPAGE_PMD_ORDER

If you see unexpected page order in page cache, something went horribly
wrong, right?

> +		 *	+ the page's index is not what we expect it to be

Same here.

> +		 *	+ the page is not up-to-date
> +		 *	+ the page is unlocked

Confused here.

Do you expect caller to lock page before the check? If so, state it in the
comment for the function.

> +		 */
> +		if ((page) && (xa_is_value(page) || xa_is_internal(page) ||
> +			(!PageCompound(page)) || (PageHuge(page)) ||
> +			(!PageTransCompound(page)) ||
> +			page != compound_head(page) ||
> +			compound_order(page) != HPAGE_PMD_ORDER ||
> +			page->index != hindex || (!PageUptodate(page)) ||
> +			(!PageLocked(page))))
> +			return false;

Wow. That's unreadable. Can we rewrite it something like (commenting each
check):

		if (!page)
			break;

		if (xa_is_value(page) || xa_is_internal(page))
			return false;

		if (!PageCompound(page))
			return false;

		...

> +
> +		break;
> +	}
> +
> +	xas_set(xas, hindex);
> +	*pagep = page;
> +	return true;
> +}
> +
> +/**
> + * filemap_huge_fault - read in file data for page fault handling to THP
> + * @vmf:	struct vm_fault containing details of the fault
> + * @pe_size:	large page size to map, currently this must be PE_SIZE_PMD
> + *
> + * filemap_huge_fault() is invoked via the vma operations vector for a
> + * mapped memory region to read in file data to a transparent huge page during
> + * a page fault.
> + *
> + * If for any reason we can't allocate a THP, map it or add it to the page
> + * cache, VM_FAULT_FALLBACK will be returned which will cause the fault
> + * handler to try mapping the page using a PAGESIZE page, usually via
> + * filemap_fault() if so speicifed in the vma operations vector.
> + *
> + * Returns either VM_FAULT_FALLBACK or the result of calling allcc_set_pte()
> + * to map the new THP.
> + *
> + * NOTE: This routine depends upon the file system's readpage routine as
> + *       specified in the address space operations vector to recognize when it
> + *	 is being passed a large page and to read the approprate amount of data
> + *	 in full and without polluting the page cache for the large page itself
> + *	 with PAGESIZE pages to perform a buffered read or to pollute what
> + *	 would be the page cache space for any succeeding pages with PAGESIZE
> + *	 pages due to readahead.
> + *
> + *	 It is VITAL that this routine not be enabled without such filesystem
> + *	 support. As there is no way to determine how many bytes were read by
> + *	 the readpage() operation, if only a PAGESIZE page is read, this routine
> + *	 will map the THP containing only the first PAGESIZE bytes of file data
> + *	 to satisfy the fault, which is never the result desired.
> + */
> +vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
> +		enum page_entry_size pe_size)
> +{
> +	struct file *filp = vmf->vma->vm_file;
> +	struct address_space *mapping = filp->f_mapping;
> +	struct vm_area_struct *vma = vmf->vma;
> +
> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +	pgoff_t hindex = round_down(vmf->pgoff, HPAGE_PMD_NR);
> +	pgoff_t hindex_max = hindex + HPAGE_PMD_NR;
> +
> +	struct page *cached_page, *hugepage;
> +	struct page *new_page = NULL;
> +
> +	vm_fault_t ret = VM_FAULT_FALLBACK;
> +	int error;
> +
> +	XA_STATE_ORDER(xas, &mapping->i_pages, hindex, HPAGE_PMD_ORDER);
> +
> +	/*
> +	 * Return VM_FAULT_FALLBACK if:
> +	 *
> +	 *	+ pe_size != PE_SIZE_PMD
> +	 *	+ FAULT_FLAG_WRITE is set in vmf->flags
> +	 *	+ vma isn't aligned to allow a PMD mapping
> +	 *	+ PMD would extend beyond the end of the vma
> +	 */
> +	if (pe_size != PE_SIZE_PMD || (vmf->flags & FAULT_FLAG_WRITE) ||
> +		(haddr < vma->vm_start ||
> +		(haddr + HPAGE_PMD_SIZE > vma->vm_end)))
> +		return ret;

You also need to check that VMA alignment is suitable for huge pages.
See transhuge_vma_suitable().

> +
> +	xas_lock_irq(&xas);
> +
> +retry_xas_locked:
> +	if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex,
> +		hindex_max)) {

I don't see how this check will ever succeed. Who locks the page here?

> +		/* found a conflicting entry in the page cache, so fallback */
> +		goto unlock;
> +	} else if (cached_page) {
> +		/* found a valid cached page, so map it */
> +		hugepage = cached_page;
> +		goto map_huge;
> +	}
> +
> +	xas_unlock_irq(&xas);
> +
> +	/* allocate huge THP page in VMA */
> +	new_page = __page_cache_alloc(vmf->gfp_mask | __GFP_COMP |
> +		__GFP_NOWARN | __GFP_NORETRY, HPAGE_PMD_ORDER);
> +
> +	if (unlikely(!new_page))
> +		return ret;
> +
> +	if (unlikely(!(PageCompound(new_page)))) {

How can it happen?

> +		put_page(new_page);
> +		return ret;
> +	}
> +
> +	prep_transhuge_page(new_page);
> +	new_page->index = hindex;
> +	new_page->mapping = mapping;
> +
> +	__SetPageLocked(new_page);
> +
> +	/*
> +	 * The readpage() operation below is expected to fill the large
> +	 * page with data without polluting the page cache with
> +	 * PAGESIZE entries due to a buffered read and/or readahead().
> +	 *
> +	 * A filesystem's vm_operations_struct huge_fault field should
> +	 * never point to this routine without such a capability, and
> +	 * without it a call to this routine would eventually just
> +	 * fall through to the normal fault op anyway.
> +	 */
> +	error = mapping->a_ops->readpage(vmf->vma->vm_file, new_page);
> +
> +	if (unlikely(error)) {
> +		put_page(new_page);
> +		return ret;
> +	}
> +
> +	/* XXX - use wait_on_page_locked_killable() instead? */
> +	wait_on_page_locked(new_page);
> +
> +	if (!PageUptodate(new_page)) {
> +		/* EIO */
> +		new_page->mapping = NULL;
> +		put_page(new_page);
> +		return ret;
> +	}
> +
> +	do {
> +		xas_lock_irq(&xas);
> +		xas_set(&xas, hindex);
> +		xas_create_range(&xas);
> +
> +		if (!(xas_error(&xas)))
> +			break;
> +
> +		if (!xas_nomem(&xas, GFP_KERNEL)) {
> +			if (new_page) {
> +				new_page->mapping = NULL;
> +				put_page(new_page);
> +			}
> +
> +			goto unlock;
> +		}
> +
> +		xas_unlock_irq(&xas);
> +	} while (1);
> +
> +	/*
> +	 * Double check that an entry did not sneak into the page cache while
> +	 * creating Xarray entries for the new page.
> +	 */
> +	if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex,
> +		hindex_max)) {
> +		/*
> +		 * An unusable entry was found, so delete the newly allocated
> +		 * page and fallback.
> +		 */
> +		new_page->mapping = NULL;
> +		put_page(new_page);
> +		goto unlock;
> +	} else if (cached_page) {
> +		/*
> +		 * A valid large page was found in the page cache, so free the
> +		 * newly allocated page and map the cached page instead.
> +		 */
> +		new_page->mapping = NULL;
> +		put_page(new_page);
> +		new_page = NULL;
> +		hugepage = cached_page;
> +		goto map_huge;
> +	}
> +
> +	__SetPageLocked(new_page);

Again?

> +
> +	/* did it get truncated? */
> +	if (unlikely(new_page->mapping != mapping)) {

Hm. IIRC this path only reachable for just allocated page that is not
exposed to anybody yet. How can it be truncated?

> +		unlock_page(new_page);
> +		put_page(new_page);
> +		goto retry_xas_locked;
> +	}
> +
> +	hugepage = new_page;
> +
> +map_huge:
> +	/* map hugepage at the PMD level */
> +	ret = alloc_set_pte(vmf, NULL, hugepage);

It has to be

	ret = alloc_set_pte(vmf, vmf->memcg, hugepage);

right?

> +
> +	VM_BUG_ON_PAGE((!(pmd_trans_huge(*vmf->pmd))), hugepage);
> +
> +	if (likely(!(ret & VM_FAULT_ERROR))) {
> +		/*
> +		 * The alloc_set_pte() succeeded without error, so
> +		 * add the page to the page cache if it is new, and
> +		 * increment page statistics accordingly.
> +		 */

It looks backwards to me. I believe the page must be in page cache
*before* it got mapped.

I expect all sorts of weird bug due to races when the page is mapped but
not visible via syscalls.

Hm?

> +		if (new_page) {
> +			unsigned long nr;
> +
> +			xas_set(&xas, hindex);
> +
> +			for (nr = 0; nr < HPAGE_PMD_NR; nr++) {
> +#ifndef	COMPOUND_PAGES_HEAD_ONLY
> +				xas_store(&xas, new_page + nr);
> +#else
> +				xas_store(&xas, new_page);
> +#endif
> +				xas_next(&xas);
> +			}
> +
> +			count_vm_event(THP_FILE_ALLOC);
> +			__inc_node_page_state(new_page, NR_SHMEM_THPS);
> +			__mod_node_page_state(page_pgdat(new_page),
> +				NR_FILE_PAGES, HPAGE_PMD_NR);
> +			__mod_node_page_state(page_pgdat(new_page),
> +				NR_SHMEM, HPAGE_PMD_NR);
> +		}
> +
> +		vmf->address = haddr;
> +		vmf->page = hugepage;
> +
> +		page_ref_add(hugepage, HPAGE_PMD_NR);
> +		count_vm_event(THP_FILE_MAPPED);
> +	} else if (new_page) {
> +		/* there was an error mapping the new page, so release it */
> +		new_page->mapping = NULL;
> +		put_page(new_page);
> +	}
> +
> +unlock:
> +	xas_unlock_irq(&xas);
> +	return ret;
> +}
> +EXPORT_SYMBOL(filemap_huge_fault);
> +#endif
> +
>  void filemap_map_pages(struct vm_fault *vmf,
>  		pgoff_t start_pgoff, pgoff_t end_pgoff)
>  {
> @@ -2925,7 +3220,8 @@ struct page *read_cache_page(struct address_space *mapping,
>  EXPORT_SYMBOL(read_cache_page);
>  
>  /**
> - * read_cache_page_gfp - read into page cache, using specified page allocation flags.
> + * read_cache_page_gfp - read into page cache, using specified page allocation
> + *			 flags.
>   * @mapping:	the page's address_space
>   * @index:	the page index
>   * @gfp:	the page allocator flags to use if allocating
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 1334ede667a8..26d74466d1f7 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -543,8 +543,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>  
>  	if (addr)
>  		goto out;
> +
> +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP

IS_ENABLED()?

>  	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
>  		goto out;
> +#endif
>  
>  	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
>  	if (addr)
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 7e8c3e8ae75f..96ff80d2a8fb 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1391,6 +1391,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  	struct mm_struct *mm = current->mm;
>  	int pkey = 0;
>  
> +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> +	unsigned long vm_maywrite = VM_MAYWRITE;
> +#endif
> +
>  	*populate = 0;
>  
>  	if (!len)
> @@ -1429,7 +1433,33 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  	/* Obtain the address to map to. we verify (or select) it and ensure
>  	 * that it represents a valid section of the address space.
>  	 */
> -	addr = get_unmapped_area(file, addr, len, pgoff, flags);
> +
> +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> +	/*
> +	 * If THP is enabled, it's a read-only executable that is
> +	 * MAP_PRIVATE mapped, the length is larger than a PMD page
> +	 * and either it's not a MAP_FIXED mapping or the passed address is
> +	 * properly aligned for a PMD page, attempt to get an appropriate
> +	 * address at which to map a PMD-sized THP page, otherwise call the
> +	 * normal routine.
> +	 */
> +	if ((prot & PROT_READ) && (prot & PROT_EXEC) &&
> +		(!(prot & PROT_WRITE)) && (flags & MAP_PRIVATE) &&

Why require PROT_EXEC && PROT_READ. You only must ask for !PROT_WRITE.

And how do you protect against mprotect() later? Should you ask for
ro-file instead?

> +		(!(flags & MAP_FIXED)) && len >= HPAGE_PMD_SIZE &&
> +		(!(addr & HPAGE_PMD_OFFSET))) {

All size considerations are already handled by thp_get_unmapped_area(). No
need to duplicate it here.

You might want to add thp_ro_get_unmapped_area() that would check file for
RO, before going for THP-suitable mapping.

> +		addr = thp_get_unmapped_area(file, addr, len, pgoff, flags);
> +
> +		if (addr && (!(addr & HPAGE_PMD_OFFSET)))
> +			vm_maywrite = 0;

Oh. That's way too hacky. Better to ask for RO file instead.

> +		else
> +			addr = get_unmapped_area(file, addr, len, pgoff, flags);
> +	} else {
> +#endif
> +		addr = get_unmapped_area(file, addr, len, pgoff, flags);
> +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> +	}
> +#endif
> +
>  	if (offset_in_page(addr))
>  		return addr;
>  
> @@ -1451,7 +1481,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  	 * of the memory object, so we don't do any here.
>  	 */
>  	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
> +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> +			mm->def_flags | VM_MAYREAD | vm_maywrite | VM_MAYEXEC;
> +#else
>  			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
> +#endif
>  
>  	if (flags & MAP_LOCKED)
>  		if (!can_do_mlock())
> diff --git a/mm/rmap.c b/mm/rmap.c
> index e5dfe2ae6b0d..503612d3b52b 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1192,7 +1192,11 @@ void page_add_file_rmap(struct page *page, bool compound)
>  		}
>  		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
>  			goto out;
> +
> +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
>  		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
> +#endif
> +

Just remove it. Don't add more #ifdefs.

>  		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
>  	} else {
>  		if (PageTransCompound(page) && page_mapping(page)) {
> @@ -1232,7 +1236,11 @@ static void page_remove_file_rmap(struct page *page, bool compound)
>  		}
>  		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
>  			goto out;
> +
> +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
>  		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
> +#endif
> +

Ditto.

>  		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
>  	} else {
>  		if (!atomic_add_negative(-1, &page->_mapcount))
> -- 
> 2.21.0
>
William Kucharski Aug. 3, 2019, 10:27 p.m. UTC | #2
On 8/1/19 6:36 AM, Kirill A. Shutemov wrote:

>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -#define HPAGE_PMD_SHIFT PMD_SHIFT
>> -#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
>> -#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
>> -
>> -#define HPAGE_PUD_SHIFT PUD_SHIFT
>> -#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
>> -#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
>> +#define HPAGE_PMD_SHIFT		PMD_SHIFT
>> +#define HPAGE_PMD_SIZE		((1UL) << HPAGE_PMD_SHIFT)
>> +#define HPAGE_PMD_OFFSET	(HPAGE_PMD_SIZE - 1)
>> +#define HPAGE_PMD_MASK		(~(HPAGE_PMD_OFFSET))
>> +
>> +#define HPAGE_PUD_SHIFT		PUD_SHIFT
>> +#define HPAGE_PUD_SIZE		((1UL) << HPAGE_PUD_SHIFT)
>> +#define HPAGE_PUD_OFFSET	(HPAGE_PUD_SIZE - 1)
>> +#define HPAGE_PUD_MASK		(~(HPAGE_PUD_OFFSET))
> 
> OFFSET vs MASK semantics can be confusing without reading the definition.
> We don't have anything similar for base page size, right (PAGE_OFFSET is
> completely different thing :P)?

I came up with the OFFSET definitions, the MASK definitions already existed
in huge_mm.h, e.g.:

#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))

Is there different terminology you'd prefer to see me use here to clarify
this?


>> +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
>> +extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
>> +			enum page_entry_size pe_size);
>> +#endif
>> +
> 
> No need for #ifdef here.

I wanted to avoid referencing an extern that wouldn't exist if the config
option wasn't set; I can remove it.


>> +
>> +#ifndef	CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
>>   	if (PageSwapBacked(page)) {
>>   		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
>>   		if (PageTransHuge(page))
>> @@ -206,6 +208,13 @@ static void unaccount_page_cache_page(struct address_space *mapping,
>>   	} else {
>>   		VM_BUG_ON_PAGE(PageTransHuge(page), page);
>>   	}
>> +#else
>> +	if (PageSwapBacked(page))
>> +		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
>> +
>> +	if (PageTransHuge(page))
>> +		__dec_node_page_state(page, NR_SHMEM_THPS);
>> +#endif
> 
> Again, no need for #ifdef: the new definition should be fine for
> everybody.

OK, I can do that; I didn't want to unnecessarily eliminate the
VM_BUG_ON_PAGE(PageTransHuge(page)) call for everyone given this
is CONFIG experimental code.

> PageCompound() and PageTransCompound() are the same thing if THP is
> enabled at compile time.

> PageHuge() check here is looking out of place. I don't thing we can ever
> will see hugetlb pages here.

What I'm trying to do is sanity check that what the cache contains is a THP 
page. I added the PageHuge() check simply because PageTransCompound() is true 
for both THP and hugetlbfs pages, and there's no routine that returns true JUST 
for THP pages; perhaps there should be?

>> +		 *	+ the enbry is a page page with an order other than
> 
> Typo.

Thanks, fixed.

> 
>> +		 *	  HPAGE_PMD_ORDER
> 
> If you see unexpected page order in page cache, something went horribly
> wrong, right?

This routine's main function other than validation is to make sure the page 
cache has not been polluted between when we go out to read the large page and 
when the page is added to the cache (more on that coming up.) For example, the 
way I was able to tell readahead() was polluting future possible THP mappings is 
because after a buffered read I would typically see 52 (the readahead size) 
PAGESIZE pages for the next 2M range in the page cache.

>> +		 *	+ the page's index is not what we expect it to be
> 
> Same here.

Same rationale.

> 
>> +		 *	+ the page is not up-to-date
>> +		 *	+ the page is unlocked
> 
> Confused here.

These should never be true, but I wanted to double check for them anyway. I can 
eliminate the checks if we are satisfied these states can "never" happen for a 
cached page.

> 
> Do you expect caller to lock page before the check? If so, state it in the
> comment for the function.

It's my understanding that pages in the page cache should be locked, so I wanted 
to check for that.

This routine is validating entries we find in the page cache to see whether they 
are conflicts or valid cached THP pages.

> Wow. That's unreadable. Can we rewrite it something like (commenting each
> check):

I can definitely break it down into multiple checks; it is a bit dense, thus the 
comment but you're correct, it will read better if broken down more.


> You also need to check that VMA alignment is suitable for huge pages.
> See transhuge_vma_suitable().

I don't really care if the start of the VMA is suitable, just whether I can map
the current faulting page with a THP. As far as I know, there's nothing wrong
with mapping all the pages before the VMA hits a properly aligned bound with
PAGESIZE pages and then aligned chunks in the middle with THP.

>> +	if (unlikely(!(PageCompound(new_page)))) {
> 
> How can it happen?

That check was already removed for a pending v4, thanks. I wasn't sure if
__page_cache_alloc() could ever erroneously return a non-compound page so
I wanted to check for it.

>> +	__SetPageLocked(new_page);
> 
> Again?

This is the page that content was just read to; readpage() will unlock the page
when it is done with I/O, but the page needs to be locked before it's inserted
into the page cache.

>> +	/* did it get truncated? */
>> +	if (unlikely(new_page->mapping != mapping)) {
> 
> Hm. IIRC this path only reachable for just allocated page that is not
> exposed to anybody yet. How can it be truncated?

Matthew advised I duplicate the similar routine from filemap_fault(), but that 
may be because of the normal way pages get added to the cache, which I may need 
to modify my code to do.

>> +	ret = alloc_set_pte(vmf, NULL, hugepage);
> 
> It has to be
> 
> 	ret = alloc_set_pte(vmf, vmf->memcg, hugepage);
> 
> right?

I can make that change; originally alloc_set_pte() didn't use the second 
parameter at all when mapping a read-only page.

Even now, if the page isn't writable, it would only be dereferenced by a
VM_BUG_ON_PAGE() call if it's COW.

> It looks backwards to me. I believe the page must be in page cache
> *before* it got mapped.
> 
> I expect all sorts of weird bug due to races when the page is mapped but
> not visible via syscalls.

You may be correct.

My original thinking on this was that as a THP is going to be rarer and more 
valuable to the system, I didn't want to add it to the page cache until its 
contents had been fully read and it was mapped. Talking with Matthew it seems I 
may need to change to do things the same way as PAGESIZE pages, where the page 
is added to the cache prior to the readpage() call and we rely upon PageUptodate 
to see if the reads were successful.

My thinking had been if any part of reading a large page and mapping it had 
failed, the code could just put_page() the newly allocated page and fallback to 
mapping the page with PAGESIZE pages rather than add a THP to the cache.


>> +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> 
> IS_ENABLED()?
> 
>>   	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
>>   		goto out;
>> +#endif

This code short-circuits the address generation routine if the memory isn't DAX,
and if this code is enabled I need it not to goto out but rather fall through to
__thp_get_unmapped_area().

>> +	if ((prot & PROT_READ) && (prot & PROT_EXEC) &&
>> +		(!(prot & PROT_WRITE)) && (flags & MAP_PRIVATE) &&
> 
> Why require PROT_EXEC && PROT_READ. You only must ask for !PROT_WRITE.
> 
> And how do you protect against mprotect() later? Should you ask for
> ro-file instead?

My original goal was to only map program TEXT with THP, which means only
RO EXEC code, not just any non-writable address space.

If mprotect() is called, wouldn't the pages be COWed to PAGESIZE pages the
first time the area was written to? I may be way off on this assumption.

> All size considerations are already handled by thp_get_unmapped_area(). No
> need to duplicate it here.

Thanks, I'll remove them.

> You might want to add thp_ro_get_unmapped_area() that would check file for
> RO, before going for THP-suitable mapping.

Once again, the question is whether we want to make this just RO or RO + EXEC
to maintain my goal of just mapping program TEXT via THP. I'm willing to hear 
arguments either way.

> 
>> +		addr = thp_get_unmapped_area(file, addr, len, pgoff, flags);
>> +
>> +		if (addr && (!(addr & HPAGE_PMD_OFFSET)))
>> +			vm_maywrite = 0;
> 
> Oh. That's way too hacky. Better to ask for RO file instead.

I did that because the existing code just blindly sets VM_MAYWRITE and I 
obviously didn't want to, so making it a variable allowed me to shut it off if 
it was a THP mapping.


>> +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
>>   		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
>> +#endif
>> +
> 
> Just remove it. Don't add more #ifdefs.

OK; once again I didn't want to remove the existing VM_BUG_ON_PAGE() call
because this was an experimental config for now.


>> +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
>>   		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
>> +#endif
>> +
> 
> Ditto.

Same rationale.

Thanks for looking this over; I'm curious as to what others think about the need 
for an RO file and the issue of when the large page gets added to the page cache.

     -- Bill
Kirill A . Shutemov Aug. 5, 2019, 1:28 p.m. UTC | #3
On Sat, Aug 03, 2019 at 04:27:51PM -0600, William Kucharski wrote:
> 
> 
> On 8/1/19 6:36 AM, Kirill A. Shutemov wrote:
> 
> > >   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > > -#define HPAGE_PMD_SHIFT PMD_SHIFT
> > > -#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
> > > -#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
> > > -
> > > -#define HPAGE_PUD_SHIFT PUD_SHIFT
> > > -#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
> > > -#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
> > > +#define HPAGE_PMD_SHIFT		PMD_SHIFT
> > > +#define HPAGE_PMD_SIZE		((1UL) << HPAGE_PMD_SHIFT)
> > > +#define HPAGE_PMD_OFFSET	(HPAGE_PMD_SIZE - 1)
> > > +#define HPAGE_PMD_MASK		(~(HPAGE_PMD_OFFSET))
> > > +
> > > +#define HPAGE_PUD_SHIFT		PUD_SHIFT
> > > +#define HPAGE_PUD_SIZE		((1UL) << HPAGE_PUD_SHIFT)
> > > +#define HPAGE_PUD_OFFSET	(HPAGE_PUD_SIZE - 1)
> > > +#define HPAGE_PUD_MASK		(~(HPAGE_PUD_OFFSET))
> > 
> > OFFSET vs MASK semantics can be confusing without reading the definition.
> > We don't have anything similar for base page size, right (PAGE_OFFSET is
> > completely different thing :P)?
> 
> I came up with the OFFSET definitions, the MASK definitions already existed
> in huge_mm.h, e.g.:
> 
> #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
> 
> Is there different terminology you'd prefer to see me use here to clarify
> this?

My point is that maybe we should just use ~HPAGE_P?D_MASK in code. The new
HPAGE_P?D_OFFSET doesn't add much for readability in my opinion.

> > > +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> > > +extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
> > > +			enum page_entry_size pe_size);
> > > +#endif
> > > +
> > 
> > No need for #ifdef here.
> 
> I wanted to avoid referencing an extern that wouldn't exist if the config
> option wasn't set; I can remove it.
> 
> 
> > > +
> > > +#ifndef	CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> > >   	if (PageSwapBacked(page)) {
> > >   		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
> > >   		if (PageTransHuge(page))
> > > @@ -206,6 +208,13 @@ static void unaccount_page_cache_page(struct address_space *mapping,
> > >   	} else {
> > >   		VM_BUG_ON_PAGE(PageTransHuge(page), page);
> > >   	}
> > > +#else
> > > +	if (PageSwapBacked(page))
> > > +		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
> > > +
> > > +	if (PageTransHuge(page))
> > > +		__dec_node_page_state(page, NR_SHMEM_THPS);
> > > +#endif
> > 
> > Again, no need for #ifdef: the new definition should be fine for
> > everybody.
> 
> OK, I can do that; I didn't want to unnecessarily eliminate the
> VM_BUG_ON_PAGE(PageTransHuge(page)) call for everyone given this
> is CONFIG experimental code.

If you bring the feature, you bring the feature. Just drop these VM_BUGs.

> > PageCompound() and PageTransCompound() are the same thing if THP is
> > enabled at compile time.
> 
> > PageHuge() check here is looking out of place. I don't thing we can ever
> > will see hugetlb pages here.
> 
> What I'm trying to do is sanity check that what the cache contains is a THP
> page. I added the PageHuge() check simply because PageTransCompound() is
> true for both THP and hugetlbfs pages, and there's no routine that returns
> true JUST for THP pages; perhaps there should be?

I'm not sure. It will be costly comparing to PageTransCompound/Huge as we
need to check more than flags.

To exclude hugetlb pages here, use VM_BUG_ON_PAGE(PageHuge(page), page).
It will allow to catch wrong usage of the function.

> 
> > > +		 *	+ the enbry is a page page with an order other than
> > 
> > Typo.
> 
> Thanks, fixed.
> 
> > 
> > > +		 *	  HPAGE_PMD_ORDER
> > 
> > If you see unexpected page order in page cache, something went horribly
> > wrong, right?
> 
> This routine's main function other than validation is to make sure the page
> cache has not been polluted between when we go out to read the large page
> and when the page is added to the cache (more on that coming up.) For
> example, the way I was able to tell readahead() was polluting future
> possible THP mappings is because after a buffered read I would typically see
> 52 (the readahead size) PAGESIZE pages for the next 2M range in the page
> cache.

My point is that you should only see compound pages here if they are
HPAGE_PMD_ORDER, shouldn't you? Any other order of compound page would be
unexpected to say the least.

> > > +		 *	+ the page's index is not what we expect it to be
> > 
> > Same here.
> 
> Same rationale.
> 
> > 
> > > +		 *	+ the page is not up-to-date
> > > +		 *	+ the page is unlocked
> > 
> > Confused here.
> 
> These should never be true, but I wanted to double check for them anyway. I
> can eliminate the checks if we are satisfied these states can "never" happen
> for a cached page.
> > Do you expect caller to lock page before the check? If so, state it in the
> > comment for the function.
> 
> It's my understanding that pages in the page cache should be locked, so I
> wanted to check for that.

No. They are get locked temporary for some operation, but not all the
time.

> This routine is validating entries we find in the page cache to see whether
> they are conflicts or valid cached THP pages.
> 
> > Wow. That's unreadable. Can we rewrite it something like (commenting each
> > check):
> 
> I can definitely break it down into multiple checks; it is a bit dense, thus
> the comment but you're correct, it will read better if broken down more.
> 
> 
> > You also need to check that VMA alignment is suitable for huge pages.
> > See transhuge_vma_suitable().
> 
> I don't really care if the start of the VMA is suitable, just whether I can map
> the current faulting page with a THP. As far as I know, there's nothing wrong
> with mapping all the pages before the VMA hits a properly aligned bound with
> PAGESIZE pages and then aligned chunks in the middle with THP.

You cannot map any paged as huge into wrongly aligned VMA.

THP's ->index must be aligned to HPAGE_PMD_NR, so if the combination VMA's
->vm_start and ->vm_pgoff doesn't allow for this, you must fallback to
mapping the page with PTEs. I don't see it handled properly here.

> > > +	if (unlikely(!(PageCompound(new_page)))) {
> > 
> > How can it happen?
> 
> That check was already removed for a pending v4, thanks. I wasn't sure if
> __page_cache_alloc() could ever erroneously return a non-compound page so
> I wanted to check for it.
> 
> > > +	__SetPageLocked(new_page);
> > 
> > Again?
> 
> This is the page that content was just read to; readpage() will unlock the page
> when it is done with I/O, but the page needs to be locked before it's inserted
> into the page cache.

Then you must to lock the page properly with lock_page().

__SetPageLocked() is fine for just allocated pages that was not exposed
anywhere. After ->readpage() it's not the case and it's not safe to use
__SetPageLocked() for them.

> > > +	/* did it get truncated? */
> > > +	if (unlikely(new_page->mapping != mapping)) {
> > 
> > Hm. IIRC this path only reachable for just allocated page that is not
> > exposed to anybody yet. How can it be truncated?
> 
> Matthew advised I duplicate the similar routine from filemap_fault(), but
> that may be because of the normal way pages get added to the cache, which I
> may need to modify my code to do.
> 
> > > +	ret = alloc_set_pte(vmf, NULL, hugepage);
> > 
> > It has to be
> > 
> > 	ret = alloc_set_pte(vmf, vmf->memcg, hugepage);
> > 
> > right?
> 
> I can make that change; originally alloc_set_pte() didn't use the second
> parameter at all when mapping a read-only page.
> 
> Even now, if the page isn't writable, it would only be dereferenced by a
> VM_BUG_ON_PAGE() call if it's COW.

Please do change this. It has to be future-proof.

> > It looks backwards to me. I believe the page must be in page cache
> > *before* it got mapped.
> > 
> > I expect all sorts of weird bug due to races when the page is mapped but
> > not visible via syscalls.
> 
> You may be correct.
> 
> My original thinking on this was that as a THP is going to be rarer and more
> valuable to the system, I didn't want to add it to the page cache until its
> contents had been fully read and it was mapped. Talking with Matthew it
> seems I may need to change to do things the same way as PAGESIZE pages,
> where the page is added to the cache prior to the readpage() call and we
> rely upon PageUptodate to see if the reads were successful.
> 
> My thinking had been if any part of reading a large page and mapping it had
> failed, the code could just put_page() the newly allocated page and fallback
> to mapping the page with PAGESIZE pages rather than add a THP to the cache.

I think it's must change. We should not allow inconsistent view on page
cache.

> > > +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
> > 
> > IS_ENABLED()?
> > 
> > >   	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
> > >   		goto out;
> > > +#endif
> 
> This code short-circuits the address generation routine if the memory isn't DAX,
> and if this code is enabled I need it not to goto out but rather fall through to
> __thp_get_unmapped_area().
> 
> > > +	if ((prot & PROT_READ) && (prot & PROT_EXEC) &&
> > > +		(!(prot & PROT_WRITE)) && (flags & MAP_PRIVATE) &&
> > 
> > Why require PROT_EXEC && PROT_READ. You only must ask for !PROT_WRITE.
> > 
> > And how do you protect against mprotect() later? Should you ask for
> > ro-file instead?
> 
> My original goal was to only map program TEXT with THP, which means only
> RO EXEC code, not just any non-writable address space.
> 
> If mprotect() is called, wouldn't the pages be COWed to PAGESIZE pages the
> first time the area was written to? I may be way off on this assumption.

Okay, fair enough. COW will happen for private mappings.

But for private mappings you don't need to enforce even RO. All permission
mask should be fine.

> > All size considerations are already handled by thp_get_unmapped_area(). No
> > need to duplicate it here.
> 
> Thanks, I'll remove them.
> 
> > You might want to add thp_ro_get_unmapped_area() that would check file for
> > RO, before going for THP-suitable mapping.
> 
> Once again, the question is whether we want to make this just RO or RO + EXEC
> to maintain my goal of just mapping program TEXT via THP. I'm willing to
> hear arguments either way.

It think the goal is to make feature useful and therefore we need to make
it available for widest possible set of people.

I think is should be allowed for RO (based on how file was opened, not on
PROT_*) + SHARED and for any PRIVATE mappings.

> > 
> > > +		addr = thp_get_unmapped_area(file, addr, len, pgoff, flags);
> > > +
> > > +		if (addr && (!(addr & HPAGE_PMD_OFFSET)))
> > > +			vm_maywrite = 0;
> > 
> > Oh. That's way too hacky. Better to ask for RO file instead.
> 
> I did that because the existing code just blindly sets VM_MAYWRITE and I
> obviously didn't want to, so making it a variable allowed me to shut it off
> if it was a THP mapping.

I think touching VM_MAYWRITE here is wrong. It should reflect what file
under the mapping allows.
William Kucharski Aug. 5, 2019, 3:56 p.m. UTC | #4
> On Aug 5, 2019, at 7:28 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> 
>> 
>> Is there different terminology you'd prefer to see me use here to clarify
>> this?
> 
> My point is that maybe we should just use ~HPAGE_P?D_MASK in code. The new
> HPAGE_P?D_OFFSET doesn't add much for readability in my opinion.

Fair enough, I'll make that change.

>> OK, I can do that; I didn't want to unnecessarily eliminate the
>> VM_BUG_ON_PAGE(PageTransHuge(page)) call for everyone given this
>> is CONFIG experimental code.
> 
> If you bring the feature, you bring the feature. Just drop these VM_BUGs.

OK.

> I'm not sure. It will be costly comparing to PageTransCompound/Huge as we
> need to check more than flags.
> 
> To exclude hugetlb pages here, use VM_BUG_ON_PAGE(PageHuge(page), page).
> It will allow to catch wrong usage of the function.

That will also do it and that way we will better know if it ever happens,
which of course it shouldn't.

>> This routine's main function other than validation is to make sure the page
>> cache has not been polluted between when we go out to read the large page
>> and when the page is added to the cache (more on that coming up.) For
>> example, the way I was able to tell readahead() was polluting future
>> possible THP mappings is because after a buffered read I would typically see
>> 52 (the readahead size) PAGESIZE pages for the next 2M range in the page
>> cache.
> 
> My point is that you should only see compound pages here if they are
> HPAGE_PMD_ORDER, shouldn't you? Any other order of compound page would be
> unexpected to say the least.

Yes, compound pages should only be HPAGE_PMD_ORDER.

The routine and the check will need to be updated if we ever can
allocate/cache larger pages.

>> It's my understanding that pages in the page cache should be locked, so I
>> wanted to check for that.
> 
> No. They are get locked temporary for some operation, but not all the
> time.

OK, thanks for that.

>> I don't really care if the start of the VMA is suitable, just whether I can map
>> the current faulting page with a THP. As far as I know, there's nothing wrong
>> with mapping all the pages before the VMA hits a properly aligned bound with
>> PAGESIZE pages and then aligned chunks in the middle with THP.
> 
> You cannot map any paged as huge into wrongly aligned VMA.
> 
> THP's ->index must be aligned to HPAGE_PMD_NR, so if the combination VMA's
> ->vm_start and ->vm_pgoff doesn't allow for this, you must fallback to
> mapping the page with PTEs. I don't see it handled properly here.

It was my assumption that if say a VMA started at an address say one page
before a large page alignment, you could map that page with a PAGESIZE
page but if VMA size allowed, there was a fault on the next page, and
VMA size allowed, you could map that next range with a large page, taking
taking the approach of mapping chunks of the VMA with the largest page
possible.

Is it that the start of the VMA must always align or that the entire VMA
must be properly aligned and a multiple of the PMD size (so you either map
with all large pages or none)?

>> This is the page that content was just read to; readpage() will unlock the page
>> when it is done with I/O, but the page needs to be locked before it's inserted
>> into the page cache.
> 
> Then you must to lock the page properly with lock_page().
> 
> __SetPageLocked() is fine for just allocated pages that was not exposed
> anywhere. After ->readpage() it's not the case and it's not safe to use
> __SetPageLocked() for them.

In the current code, it's assumed it is not exposed, because a single read
of a large page that does no readahead before the page is inserted into the
cache means there are no external users of the page.

Regardless, I can make this change as part of the changes I will need to to
reorder when the page is inserted into the cache.

>> I can make that change; originally alloc_set_pte() didn't use the second
>> parameter at all when mapping a read-only page.
>> 
>> Even now, if the page isn't writable, it would only be dereferenced by a
>> VM_BUG_ON_PAGE() call if it's COW.
> 
> Please do change this. It has to be future-proof.

OK, thanks.

>> My thinking had been if any part of reading a large page and mapping it had
>> failed, the code could just put_page() the newly allocated page and fallback
>> to mapping the page with PAGESIZE pages rather than add a THP to the cache.
> 
> I think it's must change. We should not allow inconsistent view on page
> cache.

Yes, I can see where it would be confusing to anyone looking at it that assumed
the page must already be in the cache before readpage() is called.

>> If mprotect() is called, wouldn't the pages be COWed to PAGESIZE pages the
>> first time the area was written to? I may be way off on this assumption.
> 
> Okay, fair enough. COW will happen for private mappings.
> 
> But for private mappings you don't need to enforce even RO. All permission
> mask should be fine.

Thanks.

>> Once again, the question is whether we want to make this just RO or RO + EXEC
>> to maintain my goal of just mapping program TEXT via THP. I'm willing to
>> hear arguments either way.
> 
> It think the goal is to make feature useful and therefore we need to make
> it available for widest possible set of people.
> 
> I think is should be allowed for RO (based on how file was opened, not on
> PROT_*) + SHARED and for any PRIVATE mappings.

That makes sense.

>> I did that because the existing code just blindly sets VM_MAYWRITE and I
>> obviously didn't want to, so making it a variable allowed me to shut it off
>> if it was a THP mapping.
> 
> I think touching VM_MAYWRITE here is wrong. It should reflect what file
> under the mapping allows.

Fair enough.

Thanks again!
    -- Bill
Kirill A . Shutemov Aug. 6, 2019, 11:12 a.m. UTC | #5
On Mon, Aug 05, 2019 at 09:56:45AM -0600, William Kucharski wrote:
> >> I don't really care if the start of the VMA is suitable, just whether I can map
> >> the current faulting page with a THP. As far as I know, there's nothing wrong
> >> with mapping all the pages before the VMA hits a properly aligned bound with
> >> PAGESIZE pages and then aligned chunks in the middle with THP.
> > 
> > You cannot map any paged as huge into wrongly aligned VMA.
> > 
> > THP's ->index must be aligned to HPAGE_PMD_NR, so if the combination VMA's
> > ->vm_start and ->vm_pgoff doesn't allow for this, you must fallback to
> > mapping the page with PTEs. I don't see it handled properly here.
> 
> It was my assumption that if say a VMA started at an address say one page
> before a large page alignment, you could map that page with a PAGESIZE
> page but if VMA size allowed, there was a fault on the next page, and
> VMA size allowed, you could map that next range with a large page, taking
> taking the approach of mapping chunks of the VMA with the largest page
> possible.
> 
> Is it that the start of the VMA must always align or that the entire VMA
> must be properly aligned and a multiple of the PMD size (so you either map
> with all large pages or none)?

IIUC, you are missing ->vm_pgoff from the picture. The newly allocated
page must land into page cache aligned on HPAGE_PMD_NR boundary. In other
word you cannout have huge page with ->index, let say, 1.

VMA is only suitable for at least one file-THP page if:

 - (vma->vm_start >> PAGE_SHIFT) % (HPAGE_PMD_NR - 1) is equal to
    vma->vm_pgoff % (HPAGE_PMD_NR - 1)

    This guarantees right alignment in the backing page cache.

 - *and* vma->vm_end - round_up(vma->vm_start, HPAGE_PMD_SIZE) is equal or
   greater than HPAGE_PMD_SIZE.

Does it make sense?

> 
> >> This is the page that content was just read to; readpage() will unlock the page
> >> when it is done with I/O, but the page needs to be locked before it's inserted
> >> into the page cache.
> > 
> > Then you must to lock the page properly with lock_page().
> > 
> > __SetPageLocked() is fine for just allocated pages that was not exposed
> > anywhere. After ->readpage() it's not the case and it's not safe to use
> > __SetPageLocked() for them.
> 
> In the current code, it's assumed it is not exposed, because a single read
> of a large page that does no readahead before the page is inserted into the
> cache means there are no external users of the page.

You've exposed the page to the filesystem once you call ->readpage().
It *may* track the page somehow after the call.
William Kucharski Aug. 7, 2019, 4:12 p.m. UTC | #6
> On Aug 6, 2019, at 5:12 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> 
> IIUC, you are missing ->vm_pgoff from the picture. The newly allocated
> page must land into page cache aligned on HPAGE_PMD_NR boundary. In other
> word you cannout have huge page with ->index, let say, 1.
> 
> VMA is only suitable for at least one file-THP page if:
> 
> - (vma->vm_start >> PAGE_SHIFT) % (HPAGE_PMD_NR - 1) is equal to
>    vma->vm_pgoff % (HPAGE_PMD_NR - 1)
> 
>    This guarantees right alignment in the backing page cache.
> 
> - *and* vma->vm_end - round_up(vma->vm_start, HPAGE_PMD_SIZE) is equal or
>   greater than HPAGE_PMD_SIZE.
> 
> Does it make sense?

It makes sense, but what I am thinking was say a vma->vm_start of 0x1ff000
and vma->vm_end of 0x400000.

Assuming x86, that can be mapped with a PAGESIZE page at 0x1ff000 then a
PMD page mapping 0x200000 - 0x400000.

That doesn't mean a vma IS or COULD ever be configured that way, so you are
correct with your comment, and I will change my check accordingly.

>> In the current code, it's assumed it is not exposed, because a single read
>> of a large page that does no readahead before the page is inserted into the
>> cache means there are no external users of the page.
> 
> You've exposed the page to the filesystem once you call ->readpage().
> It *may* track the page somehow after the call.

OK, thanks again.

I'll try to have a V4 available with these changes soon.
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 45ede62aa85b..b1e5fd3179fd 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -79,13 +79,15 @@  extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define HPAGE_PMD_SHIFT PMD_SHIFT
-#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
-#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
-
-#define HPAGE_PUD_SHIFT PUD_SHIFT
-#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
-#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
+#define HPAGE_PMD_SHIFT		PMD_SHIFT
+#define HPAGE_PMD_SIZE		((1UL) << HPAGE_PMD_SHIFT)
+#define HPAGE_PMD_OFFSET	(HPAGE_PMD_SIZE - 1)
+#define HPAGE_PMD_MASK		(~(HPAGE_PMD_OFFSET))
+
+#define HPAGE_PUD_SHIFT		PUD_SHIFT
+#define HPAGE_PUD_SIZE		((1UL) << HPAGE_PUD_SHIFT)
+#define HPAGE_PUD_OFFSET	(HPAGE_PUD_SIZE - 1)
+#define HPAGE_PUD_MASK		(~(HPAGE_PUD_OFFSET))
 
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..ba24b515468a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2433,6 +2433,12 @@  extern void truncate_inode_pages_final(struct address_space *);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern vm_fault_t filemap_fault(struct vm_fault *vmf);
+
+#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
+extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+			enum page_entry_size pe_size);
+#endif
+
 extern void filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec636a1fc..2debaded0e4d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -736,4 +736,19 @@  config ARCH_HAS_PTE_SPECIAL
 config ARCH_HAS_HUGEPD
 	bool
 
+config RO_EXEC_FILEMAP_HUGE_FAULT_THP
+	bool "read-only exec filemap_huge_fault THP support (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+
+	help
+	    Introduce filemap_huge_fault() to automatically map executable
+	    read-only pages of mapped files of suitable size and alignment
+	    using THP if possible.
+
+	    This is marked experimental because it is a new feature and is
+	    dependent upon filesystmes implementing readpages() in a way
+	    that will recognize large THP pages and read file content to
+	    them without polluting the pagecache with PAGESIZE pages due
+	    to readahead.
+
 endmenu
diff --git a/mm/filemap.c b/mm/filemap.c
index 38b46fc00855..db1d8df20367 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -199,6 +199,8 @@  static void unaccount_page_cache_page(struct address_space *mapping,
 	nr = hpage_nr_pages(page);
 
 	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+
+#ifndef	CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
 	if (PageSwapBacked(page)) {
 		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
 		if (PageTransHuge(page))
@@ -206,6 +208,13 @@  static void unaccount_page_cache_page(struct address_space *mapping,
 	} else {
 		VM_BUG_ON_PAGE(PageTransHuge(page), page);
 	}
+#else
+	if (PageSwapBacked(page))
+		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
+
+	if (PageTransHuge(page))
+		__dec_node_page_state(page, NR_SHMEM_THPS);
+#endif
 
 	/*
 	 * At this point page must be either written or cleaned by
@@ -1663,7 +1672,8 @@  struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 no_page:
 	if (!page && (fgp_flags & FGP_CREAT)) {
 		int err;
-		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+		if ((fgp_flags & FGP_WRITE) &&
+			mapping_cap_account_dirty(mapping))
 			gfp_mask |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
 			gfp_mask &= ~__GFP_FS;
@@ -2643,6 +2653,291 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
+#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
+/*
+ * Check for an entry in the page cache which would conflict with the address
+ * range we wish to map using a THP or is otherwise unusable to map a large
+ * cached page.
+ *
+ * The routine will return true if a usable page is found in the page cache
+ * (and *pagep will be set to the address of the cached page), or if no
+ * cached page is found (and *pagep will be set to NULL).
+ */
+static bool
+filemap_huge_check_pagecache_usable(struct xa_state *xas,
+	struct page **pagep, pgoff_t hindex, pgoff_t hindex_max)
+{
+	struct page *page;
+
+	while (1) {
+		page = xas_find(xas, hindex_max);
+
+		if (xas_retry(xas, page)) {
+			xas_set(xas, hindex);
+			continue;
+		}
+
+		/*
+		 * A found entry is unusable if:
+		 *	+ the entry is an Xarray value, not a pointer
+		 *	+ the entry is an internal Xarray node
+		 *	+ the entry is not a Transparent Huge Page
+		 *	+ the entry is not a compound page
+		 *	+ the entry is not the head of a compound page
+		 *	+ the enbry is a page page with an order other than
+		 *	  HPAGE_PMD_ORDER
+		 *	+ the page's index is not what we expect it to be
+		 *	+ the page is not up-to-date
+		 *	+ the page is unlocked
+		 */
+		if ((page) && (xa_is_value(page) || xa_is_internal(page) ||
+			(!PageCompound(page)) || (PageHuge(page)) ||
+			(!PageTransCompound(page)) ||
+			page != compound_head(page) ||
+			compound_order(page) != HPAGE_PMD_ORDER ||
+			page->index != hindex || (!PageUptodate(page)) ||
+			(!PageLocked(page))))
+			return false;
+
+		break;
+	}
+
+	xas_set(xas, hindex);
+	*pagep = page;
+	return true;
+}
+
+/**
+ * filemap_huge_fault - read in file data for page fault handling to THP
+ * @vmf:	struct vm_fault containing details of the fault
+ * @pe_size:	large page size to map, currently this must be PE_SIZE_PMD
+ *
+ * filemap_huge_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data to a transparent huge page during
+ * a page fault.
+ *
+ * If for any reason we can't allocate a THP, map it or add it to the page
+ * cache, VM_FAULT_FALLBACK will be returned which will cause the fault
+ * handler to try mapping the page using a PAGESIZE page, usually via
+ * filemap_fault() if so speicifed in the vma operations vector.
+ *
+ * Returns either VM_FAULT_FALLBACK or the result of calling allcc_set_pte()
+ * to map the new THP.
+ *
+ * NOTE: This routine depends upon the file system's readpage routine as
+ *       specified in the address space operations vector to recognize when it
+ *	 is being passed a large page and to read the approprate amount of data
+ *	 in full and without polluting the page cache for the large page itself
+ *	 with PAGESIZE pages to perform a buffered read or to pollute what
+ *	 would be the page cache space for any succeeding pages with PAGESIZE
+ *	 pages due to readahead.
+ *
+ *	 It is VITAL that this routine not be enabled without such filesystem
+ *	 support. As there is no way to determine how many bytes were read by
+ *	 the readpage() operation, if only a PAGESIZE page is read, this routine
+ *	 will map the THP containing only the first PAGESIZE bytes of file data
+ *	 to satisfy the fault, which is never the result desired.
+ */
+vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
+{
+	struct file *filp = vmf->vma->vm_file;
+	struct address_space *mapping = filp->f_mapping;
+	struct vm_area_struct *vma = vmf->vma;
+
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	pgoff_t hindex = round_down(vmf->pgoff, HPAGE_PMD_NR);
+	pgoff_t hindex_max = hindex + HPAGE_PMD_NR;
+
+	struct page *cached_page, *hugepage;
+	struct page *new_page = NULL;
+
+	vm_fault_t ret = VM_FAULT_FALLBACK;
+	int error;
+
+	XA_STATE_ORDER(xas, &mapping->i_pages, hindex, HPAGE_PMD_ORDER);
+
+	/*
+	 * Return VM_FAULT_FALLBACK if:
+	 *
+	 *	+ pe_size != PE_SIZE_PMD
+	 *	+ FAULT_FLAG_WRITE is set in vmf->flags
+	 *	+ vma isn't aligned to allow a PMD mapping
+	 *	+ PMD would extend beyond the end of the vma
+	 */
+	if (pe_size != PE_SIZE_PMD || (vmf->flags & FAULT_FLAG_WRITE) ||
+		(haddr < vma->vm_start ||
+		(haddr + HPAGE_PMD_SIZE > vma->vm_end)))
+		return ret;
+
+	xas_lock_irq(&xas);
+
+retry_xas_locked:
+	if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex,
+		hindex_max)) {
+		/* found a conflicting entry in the page cache, so fallback */
+		goto unlock;
+	} else if (cached_page) {
+		/* found a valid cached page, so map it */
+		hugepage = cached_page;
+		goto map_huge;
+	}
+
+	xas_unlock_irq(&xas);
+
+	/* allocate huge THP page in VMA */
+	new_page = __page_cache_alloc(vmf->gfp_mask | __GFP_COMP |
+		__GFP_NOWARN | __GFP_NORETRY, HPAGE_PMD_ORDER);
+
+	if (unlikely(!new_page))
+		return ret;
+
+	if (unlikely(!(PageCompound(new_page)))) {
+		put_page(new_page);
+		return ret;
+	}
+
+	prep_transhuge_page(new_page);
+	new_page->index = hindex;
+	new_page->mapping = mapping;
+
+	__SetPageLocked(new_page);
+
+	/*
+	 * The readpage() operation below is expected to fill the large
+	 * page with data without polluting the page cache with
+	 * PAGESIZE entries due to a buffered read and/or readahead().
+	 *
+	 * A filesystem's vm_operations_struct huge_fault field should
+	 * never point to this routine without such a capability, and
+	 * without it a call to this routine would eventually just
+	 * fall through to the normal fault op anyway.
+	 */
+	error = mapping->a_ops->readpage(vmf->vma->vm_file, new_page);
+
+	if (unlikely(error)) {
+		put_page(new_page);
+		return ret;
+	}
+
+	/* XXX - use wait_on_page_locked_killable() instead? */
+	wait_on_page_locked(new_page);
+
+	if (!PageUptodate(new_page)) {
+		/* EIO */
+		new_page->mapping = NULL;
+		put_page(new_page);
+		return ret;
+	}
+
+	do {
+		xas_lock_irq(&xas);
+		xas_set(&xas, hindex);
+		xas_create_range(&xas);
+
+		if (!(xas_error(&xas)))
+			break;
+
+		if (!xas_nomem(&xas, GFP_KERNEL)) {
+			if (new_page) {
+				new_page->mapping = NULL;
+				put_page(new_page);
+			}
+
+			goto unlock;
+		}
+
+		xas_unlock_irq(&xas);
+	} while (1);
+
+	/*
+	 * Double check that an entry did not sneak into the page cache while
+	 * creating Xarray entries for the new page.
+	 */
+	if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex,
+		hindex_max)) {
+		/*
+		 * An unusable entry was found, so delete the newly allocated
+		 * page and fallback.
+		 */
+		new_page->mapping = NULL;
+		put_page(new_page);
+		goto unlock;
+	} else if (cached_page) {
+		/*
+		 * A valid large page was found in the page cache, so free the
+		 * newly allocated page and map the cached page instead.
+		 */
+		new_page->mapping = NULL;
+		put_page(new_page);
+		new_page = NULL;
+		hugepage = cached_page;
+		goto map_huge;
+	}
+
+	__SetPageLocked(new_page);
+
+	/* did it get truncated? */
+	if (unlikely(new_page->mapping != mapping)) {
+		unlock_page(new_page);
+		put_page(new_page);
+		goto retry_xas_locked;
+	}
+
+	hugepage = new_page;
+
+map_huge:
+	/* map hugepage at the PMD level */
+	ret = alloc_set_pte(vmf, NULL, hugepage);
+
+	VM_BUG_ON_PAGE((!(pmd_trans_huge(*vmf->pmd))), hugepage);
+
+	if (likely(!(ret & VM_FAULT_ERROR))) {
+		/*
+		 * The alloc_set_pte() succeeded without error, so
+		 * add the page to the page cache if it is new, and
+		 * increment page statistics accordingly.
+		 */
+		if (new_page) {
+			unsigned long nr;
+
+			xas_set(&xas, hindex);
+
+			for (nr = 0; nr < HPAGE_PMD_NR; nr++) {
+#ifndef	COMPOUND_PAGES_HEAD_ONLY
+				xas_store(&xas, new_page + nr);
+#else
+				xas_store(&xas, new_page);
+#endif
+				xas_next(&xas);
+			}
+
+			count_vm_event(THP_FILE_ALLOC);
+			__inc_node_page_state(new_page, NR_SHMEM_THPS);
+			__mod_node_page_state(page_pgdat(new_page),
+				NR_FILE_PAGES, HPAGE_PMD_NR);
+			__mod_node_page_state(page_pgdat(new_page),
+				NR_SHMEM, HPAGE_PMD_NR);
+		}
+
+		vmf->address = haddr;
+		vmf->page = hugepage;
+
+		page_ref_add(hugepage, HPAGE_PMD_NR);
+		count_vm_event(THP_FILE_MAPPED);
+	} else if (new_page) {
+		/* there was an error mapping the new page, so release it */
+		new_page->mapping = NULL;
+		put_page(new_page);
+	}
+
+unlock:
+	xas_unlock_irq(&xas);
+	return ret;
+}
+EXPORT_SYMBOL(filemap_huge_fault);
+#endif
+
 void filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
@@ -2925,7 +3220,8 @@  struct page *read_cache_page(struct address_space *mapping,
 EXPORT_SYMBOL(read_cache_page);
 
 /**
- * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * read_cache_page_gfp - read into page cache, using specified page allocation
+ *			 flags.
  * @mapping:	the page's address_space
  * @index:	the page index
  * @gfp:	the page allocator flags to use if allocating
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1334ede667a8..26d74466d1f7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -543,8 +543,11 @@  unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 
 	if (addr)
 		goto out;
+
+#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
 	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
 		goto out;
+#endif
 
 	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
 	if (addr)
diff --git a/mm/mmap.c b/mm/mmap.c
index 7e8c3e8ae75f..96ff80d2a8fb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1391,6 +1391,10 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	int pkey = 0;
 
+#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
+	unsigned long vm_maywrite = VM_MAYWRITE;
+#endif
+
 	*populate = 0;
 
 	if (!len)
@@ -1429,7 +1433,33 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+
+#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
+	/*
+	 * If THP is enabled, it's a read-only executable that is
+	 * MAP_PRIVATE mapped, the length is larger than a PMD page
+	 * and either it's not a MAP_FIXED mapping or the passed address is
+	 * properly aligned for a PMD page, attempt to get an appropriate
+	 * address at which to map a PMD-sized THP page, otherwise call the
+	 * normal routine.
+	 */
+	if ((prot & PROT_READ) && (prot & PROT_EXEC) &&
+		(!(prot & PROT_WRITE)) && (flags & MAP_PRIVATE) &&
+		(!(flags & MAP_FIXED)) && len >= HPAGE_PMD_SIZE &&
+		(!(addr & HPAGE_PMD_OFFSET))) {
+		addr = thp_get_unmapped_area(file, addr, len, pgoff, flags);
+
+		if (addr && (!(addr & HPAGE_PMD_OFFSET)))
+			vm_maywrite = 0;
+		else
+			addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	} else {
+#endif
+		addr = get_unmapped_area(file, addr, len, pgoff, flags);
+#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
+	}
+#endif
+
 	if (offset_in_page(addr))
 		return addr;
 
@@ -1451,7 +1481,11 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 	 * of the memory object, so we don't do any here.
 	 */
 	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
+			mm->def_flags | VM_MAYREAD | vm_maywrite | VM_MAYEXEC;
+#else
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+#endif
 
 	if (flags & MAP_LOCKED)
 		if (!can_do_mlock())
diff --git a/mm/rmap.c b/mm/rmap.c
index e5dfe2ae6b0d..503612d3b52b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1192,7 +1192,11 @@  void page_add_file_rmap(struct page *page, bool compound)
 		}
 		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
 			goto out;
+
+#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+#endif
+
 		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
 	} else {
 		if (PageTransCompound(page) && page_mapping(page)) {
@@ -1232,7 +1236,11 @@  static void page_remove_file_rmap(struct page *page, bool compound)
 		}
 		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
 			goto out;
+
+#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP
 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+#endif
+
 		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
 	} else {
 		if (!atomic_add_negative(-1, &page->_mapcount))