Message ID | 20190729210933.18674-3-william.kucharski@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm,thp: Add filemap_huge_fault() for THP | expand |
On Mon, Jul 29, 2019 at 2:10 PM William Kucharski <william.kucharski@oracle.com> wrote: > > Add filemap_huge_fault() to attempt to satisfy page faults on > memory-mapped read-only text pages using THP when possible. > > Signed-off-by: William Kucharski <william.kucharski@oracle.com> [..] > +/** > + * filemap_huge_fault - read in file data for page fault handling to THP > + * @vmf: struct vm_fault containing details of the fault > + * @pe_size: large page size to map, currently this must be PE_SIZE_PMD > + * > + * filemap_huge_fault() is invoked via the vma operations vector for a > + * mapped memory region to read in file data to a transparent huge page during > + * a page fault. > + * > + * If for any reason we can't allocate a THP, map it or add it to the page > + * cache, VM_FAULT_FALLBACK will be returned which will cause the fault > + * handler to try mapping the page using a PAGESIZE page, usually via > + * filemap_fault() if so speicifed in the vma operations vector. > + * > + * Returns either VM_FAULT_FALLBACK or the result of calling allcc_set_pte() > + * to map the new THP. > + * > + * NOTE: This routine depends upon the file system's readpage routine as > + * specified in the address space operations vector to recognize when it > + * is being passed a large page and to read the approprate amount of data > + * in full and without polluting the page cache for the large page itself > + * with PAGESIZE pages to perform a buffered read or to pollute what > + * would be the page cache space for any succeeding pages with PAGESIZE > + * pages due to readahead. > + * > + * It is VITAL that this routine not be enabled without such filesystem > + * support. Rather than a hopeful comment, this wants an explicit mechanism to prevent inadvertent mismatched ->readpage() assumptions. Either a new ->readhugepage() op, or a flags field in 'struct address_space_operations' indicating that the address_space opts into being careful to handle huge page arguments. I.e. 
something like mmap_supported_flags that was added to 'struct file_operations'.
> On Jul 29, 2019, at 2:09 PM, William Kucharski <william.kucharski@oracle.com> wrote: > > Add filemap_huge_fault() to attempt to satisfy page faults on > memory-mapped read-only text pages using THP when possible. I think this 2/2 doesn't need pagecache_get_page() changes in 1/2. Maybe we can split pagecache_get_page() related changes out? > > Signed-off-by: William Kucharski <william.kucharski@oracle.com> > --- > include/linux/huge_mm.h | 16 ++- > include/linux/mm.h | 6 + > mm/Kconfig | 15 ++ > mm/filemap.c | 299 +++++++++++++++++++++++++++++++++++++++- > mm/huge_memory.c | 3 + > mm/mmap.c | 36 ++++- > mm/rmap.c | 8 ++ > 7 files changed, 373 insertions(+), 10 deletions(-) > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 45ede62aa85b..34723f7e75d0 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -79,13 +79,15 @@ extern struct kobj_attribute shmem_enabled_attr; > #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) > > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > -#define HPAGE_PMD_SHIFT PMD_SHIFT > -#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) > -#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) > - > -#define HPAGE_PUD_SHIFT PUD_SHIFT > -#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) > -#define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) > +#define HPAGE_PMD_SHIFT PMD_SHIFT > +#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) > +#define HPAGE_PMD_OFFSET (HPAGE_PMD_SIZE - 1) ^ space vs. tab difference here. > +#define HPAGE_PMD_MASK (~(HPAGE_PMD_OFFSET)) > + > +#define HPAGE_PUD_SHIFT PUD_SHIFT > +#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) > +#define HPAGE_PUD_OFFSET (HPAGE_PUD_SIZE - 1) > +#define HPAGE_PUD_MASK (~(HPAGE_PUD_OFFSET)) Should HPAGE_PMD_OFFSET and HPAGE_PUD_OFFSET include bits for PAGE_OFFSET? I guess we can just keep huge_mm.h as-is and use ~HPAGE_PMD_MASK. 
> > extern bool is_vma_temporary_stack(struct vm_area_struct *vma); > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 0334ca97c584..ba24b515468a 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2433,6 +2433,12 @@ extern void truncate_inode_pages_final(struct address_space *); > > /* generic vm_area_ops exported for stackable file systems */ > extern vm_fault_t filemap_fault(struct vm_fault *vmf); > + > +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > +extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf, > + enum page_entry_size pe_size); > +#endif > + > extern void filemap_map_pages(struct vm_fault *vmf, > pgoff_t start_pgoff, pgoff_t end_pgoff); > extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); > diff --git a/mm/Kconfig b/mm/Kconfig > index 56cec636a1fc..2debaded0e4d 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -736,4 +736,19 @@ config ARCH_HAS_PTE_SPECIAL > config ARCH_HAS_HUGEPD > bool > > +config RO_EXEC_FILEMAP_HUGE_FAULT_THP > + bool "read-only exec filemap_huge_fault THP support (EXPERIMENTAL)" > + depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM > + > + help > + Introduce filemap_huge_fault() to automatically map executable > + read-only pages of mapped files of suitable size and alignment > + using THP if possible. > + > + This is marked experimental because it is a new feature and is > + dependent upon filesystmes implementing readpages() in a way > + that will recognize large THP pages and read file content to > + them without polluting the pagecache with PAGESIZE pages due > + to readahead. 
> + > endmenu > diff --git a/mm/filemap.c b/mm/filemap.c > index a96092243fc4..4e7287db0d8e 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -199,6 +199,8 @@ static void unaccount_page_cache_page(struct address_space *mapping, > nr = hpage_nr_pages(page); > > __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); > + > +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > if (PageSwapBacked(page)) { > __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); > if (PageTransHuge(page)) > @@ -206,6 +208,13 @@ static void unaccount_page_cache_page(struct address_space *mapping, > } else { > VM_BUG_ON_PAGE(PageTransHuge(page), page); > } > +#else > + if (PageSwapBacked(page)) > + __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); > + > + if (PageTransHuge(page)) > + __dec_node_page_state(page, NR_SHMEM_THPS); > +#endif > > /* > * At this point page must be either written or cleaned by > @@ -1615,7 +1624,7 @@ EXPORT_SYMBOL(find_lock_entry); > * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do > * its own locking dance if the page is already in cache, or unlock the page > * before returning if we had to add the page to pagecache. > - * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page. > + * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page I think we haven't used FGP_PMD yet? > * > * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even > * if the GFP flags specified for FGP_CREAT are atomic. > @@ -2642,6 +2651,291 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) > } > EXPORT_SYMBOL(filemap_fault); > > +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > +/* > + * Check for an entry in the page cache which would conflict with the address > + * range we wish to map using a THP or is otherwise unusable to map a large > + * cached page. 
> + * > + * The routine will return true if a usable page is found in the page cache > + * (and *pagep will be set to the address of the cached page), or if no > + * cached page is found (and *pagep will be set to NULL). > + */ > +static bool > +filemap_huge_check_pagecache_usable(struct xa_state *xasp, > + struct page **pagep, pgoff_t hindex, pgoff_t hindex_max) We have been using name "xas" for "struct xa_state *". Let's keep using it? > +{ > + struct page *page; > + > + while (1) { > + page = xas_find(xasp, hindex_max); > + > + if (xas_retry(xasp, page)) { > + xas_set(xasp, hindex); > + continue; > + } > + > + /* > + * A found entry is unusable if: > + * + the entry is an Xarray value, not a pointer > + * + the entry is an internal Xarray node > + * + the entry is not a Transparent Huge Page > + * + the entry is not a compound page > + * + the entry is not the head of a compound page > + * + the enbry is a page page with an order other than > + * HPAGE_PMD_ORDER > + * + the page's index is not what we expect it to be > + * + the page is not up-to-date > + * + the page is unlocked > + */ > + if ((page) && (xa_is_value(page) || xa_is_internal(page) || > + (!PageCompound(page)) || (PageHuge(page)) || > + (!PageTransCompound(page)) || > + page != compound_head(page) || > + compound_order(page) != HPAGE_PMD_ORDER || > + page->index != hindex || (!PageUptodate(page)) || > + (!PageLocked(page)))) > + return false; > + > + break; > + } > + > + xas_set(xasp, hindex); > + *pagep = page; > + return true; > +} > + > +/** > + * filemap_huge_fault - read in file data for page fault handling to THP > + * @vmf: struct vm_fault containing details of the fault > + * @pe_size: large page size to map, currently this must be PE_SIZE_PMD > + * > + * filemap_huge_fault() is invoked via the vma operations vector for a > + * mapped memory region to read in file data to a transparent huge page during > + * a page fault. 
> + * > + * If for any reason we can't allocate a THP, map it or add it to the page > + * cache, VM_FAULT_FALLBACK will be returned which will cause the fault > + * handler to try mapping the page using a PAGESIZE page, usually via > + * filemap_fault() if so specified in the vma operations vector. > + * > + * Returns either VM_FAULT_FALLBACK or the result of calling alloc_set_pte() > + * to map the new THP. > + * > + * NOTE: This routine depends upon the file system's readpage routine as > + * specified in the address space operations vector to recognize when it > + * is being passed a large page and to read the appropriate amount of data > + * in full and without polluting the page cache for the large page itself > + * with PAGESIZE pages to perform a buffered read or to pollute what > + * would be the page cache space for any succeeding pages with PAGESIZE > + * pages due to readahead. > + * > + * It is VITAL that this routine not be enabled without such filesystem > + * support. As there is no way to determine how many bytes were read by > + * the readpage() operation, if only a PAGESIZE page is read, this routine > + * will map the THP containing only the first PAGESIZE bytes of file data > + * to satisfy the fault, which is never the result desired. 
> + */ > +vm_fault_t filemap_huge_fault(struct vm_fault *vmf, > + enum page_entry_size pe_size) > +{ > + struct file *filp = vmf->vma->vm_file; > + struct address_space *mapping = filp->f_mapping; > + struct vm_area_struct *vma = vmf->vma; > + > + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > + pgoff_t hindex = round_down(vmf->pgoff, HPAGE_PMD_NR); > + pgoff_t hindex_max = hindex + HPAGE_PMD_NR; > + > + struct page *cached_page, *hugepage; > + struct page *new_page = NULL; > + > + vm_fault_t ret = VM_FAULT_FALLBACK; > + int error; > + > + XA_STATE_ORDER(xas, &mapping->i_pages, hindex, HPAGE_PMD_ORDER); > + > + /* > + * Return VM_FAULT_FALLBACK if: > + * > + * + pe_size != PE_SIZE_PMD > + * + FAULT_FLAG_WRITE is set in vmf->flags > + * + vma isn't aligned to allow a PMD mapping > + * + PMD would extend beyond the end of the vma > + */ > + if (pe_size != PE_SIZE_PMD || (vmf->flags & FAULT_FLAG_WRITE) || > + (haddr < vma->vm_start || > + (haddr + HPAGE_PMD_SIZE > vma->vm_end))) > + return ret; > + > + xas_lock_irq(&xas); > + > +retry_xas_locked: > + if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex, > + hindex_max)) { > + /* found a conflicting entry in the page cache, so fallback */ > + goto unlock; > + } else if (cached_page) { > + /* found a valid cached page, so map it */ > + hugepage = cached_page; > + goto map_huge; > + } > + > + xas_unlock_irq(&xas); > + > + /* allocate huge THP page in VMA */ > + new_page = __page_cache_alloc(vmf->gfp_mask | __GFP_COMP | > + __GFP_NOWARN | __GFP_NORETRY, HPAGE_PMD_ORDER); > + > + if (unlikely(!new_page)) > + return ret; > + > + if (unlikely(!(PageCompound(new_page)))) { What condition triggers this case? 
> + put_page(new_page); > + return ret; > + } > + > + prep_transhuge_page(new_page); > + new_page->index = hindex; > + new_page->mapping = mapping; > + > + __SetPageLocked(new_page); > + > + /* > + * The readpage() operation below is expected to fill the large > + * page with data without polluting the page cache with > + * PAGESIZE entries due to a buffered read and/or readahead(). > + * > + * A filesystem's vm_operations_struct huge_fault field should > + * never point to this routine without such a capability, and > + * without it a call to this routine would eventually just > + * fall through to the normal fault op anyway. > + */ > + error = mapping->a_ops->readpage(vmf->vma->vm_file, new_page); > + > + if (unlikely(error)) { > + put_page(new_page); > + return ret; > + } > + > + /* XXX - use wait_on_page_locked_killable() instead? */ > + wait_on_page_locked(new_page); > + > + if (!PageUptodate(new_page)) { > + /* EIO */ > + new_page->mapping = NULL; > + put_page(new_page); > + return ret; > + } > + > + do { > + xas_lock_irq(&xas); > + xas_set(&xas, hindex); > + xas_create_range(&xas); > + > + if (!(xas_error(&xas))) > + break; > + > + if (!xas_nomem(&xas, GFP_KERNEL)) { > + if (new_page) { > + new_page->mapping = NULL; > + put_page(new_page); > + } > + > + goto unlock; > + } > + > + xas_unlock_irq(&xas); > + } while (1); > + > + /* > + * Double check that an entry did not sneak into the page cache while > + * creating Xarray entries for the new page. > + */ > + if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex, > + hindex_max)) { > + /* > + * An unusable entry was found, so delete the newly allocated > + * page and fallback. > + */ > + new_page->mapping = NULL; > + put_page(new_page); > + goto unlock; > + } else if (cached_page) { > + /* > + * A valid large page was found in the page cache, so free the > + * newly allocated page and map the cached page instead. 
> + */ > + new_page->mapping = NULL; > + put_page(new_page); > + new_page = NULL; > + hugepage = cached_page; > + goto map_huge; > + } > + > + __SetPageLocked(new_page); > + > + /* did it get truncated? */ > + if (unlikely(new_page->mapping != mapping)) { > + unlock_page(new_page); > + put_page(new_page); > + goto retry_xas_locked; > + } > + > + hugepage = new_page; > + > +map_huge: > + /* map hugepage at the PMD level */ > + ret = alloc_set_pte(vmf, NULL, hugepage); > + > + VM_BUG_ON_PAGE((!(pmd_trans_huge(*vmf->pmd))), hugepage); > + > + if (likely(!(ret & VM_FAULT_ERROR))) { > + /* > + * The alloc_set_pte() succeeded without error, so > + * add the page to the page cache if it is new, and > + * increment page statistics accordingly. > + */ > + if (new_page) { > + unsigned long nr; > + > + xas_set(&xas, hindex); > + > + for (nr = 0; nr < HPAGE_PMD_NR; nr++) { > +#ifndef COMPOUND_PAGES_HEAD_ONLY Where do we define COMPOUND_PAGES_HEAD_ONLY? > + xas_store(&xas, new_page + nr); > +#else > + xas_store(&xas, new_page); > +#endif > + xas_next(&xas); > + } > + > + count_vm_event(THP_FILE_ALLOC); > + __inc_node_page_state(new_page, NR_SHMEM_THPS); > + __mod_node_page_state(page_pgdat(new_page), > + NR_FILE_PAGES, HPAGE_PMD_NR); > + __mod_node_page_state(page_pgdat(new_page), > + NR_SHMEM, HPAGE_PMD_NR); > + } > + > + vmf->address = haddr; > + vmf->page = hugepage; > + > + page_ref_add(hugepage, HPAGE_PMD_NR); > + count_vm_event(THP_FILE_MAPPED); > + } else if (new_page) { > + /* there was an error mapping the new page, so release it */ > + new_page->mapping = NULL; > + put_page(new_page); > + } > + > +unlock: > + xas_unlock_irq(&xas); > + return ret; > +} > +EXPORT_SYMBOL(filemap_huge_fault); > +#endif > + > void filemap_map_pages(struct vm_fault *vmf, > pgoff_t start_pgoff, pgoff_t end_pgoff) > { > @@ -2924,7 +3218,8 @@ struct page *read_cache_page(struct address_space *mapping, > EXPORT_SYMBOL(read_cache_page); > > /** > - * read_cache_page_gfp - read into page cache, 
using specified page allocation flags. > + * read_cache_page_gfp - read into page cache, using specified page allocation > + * flags. > * @mapping: the page's address_space > * @index: the page index > * @gfp: the page allocator flags to use if allocating > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 1334ede667a8..26d74466d1f7 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -543,8 +543,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, > > if (addr) > goto out; > + > +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) > goto out; > +#endif > > addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); > if (addr) > diff --git a/mm/mmap.c b/mm/mmap.c > index 7e8c3e8ae75f..96ff80d2a8fb 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -1391,6 +1391,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > struct mm_struct *mm = current->mm; > int pkey = 0; > > +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > + unsigned long vm_maywrite = VM_MAYWRITE; > +#endif > + > *populate = 0; > > if (!len) > @@ -1429,7 +1433,33 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > /* Obtain the address to map to. we verify (or select) it and ensure > * that it represents a valid section of the address space. > */ > - addr = get_unmapped_area(file, addr, len, pgoff, flags); > + > +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > + /* > + * If THP is enabled, it's a read-only executable that is > + * MAP_PRIVATE mapped, the length is larger than a PMD page > + * and either it's not a MAP_FIXED mapping or the passed address is > + * properly aligned for a PMD page, attempt to get an appropriate > + * address at which to map a PMD-sized THP page, otherwise call the > + * normal routine. 
> + */ > + if ((prot & PROT_READ) && (prot & PROT_EXEC) && > + (!(prot & PROT_WRITE)) && (flags & MAP_PRIVATE) && > + (!(flags & MAP_FIXED)) && len >= HPAGE_PMD_SIZE && > + (!(addr & HPAGE_PMD_OFFSET))) { > + addr = thp_get_unmapped_area(file, addr, len, pgoff, flags); > + > + if (addr && (!(addr & HPAGE_PMD_OFFSET))) > + vm_maywrite = 0; > + else > + addr = get_unmapped_area(file, addr, len, pgoff, flags); > + } else { > +#endif > + addr = get_unmapped_area(file, addr, len, pgoff, flags); > +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > + } > +#endif > + > if (offset_in_page(addr)) > return addr; > > @@ -1451,7 +1481,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > * of the memory object, so we don't do any here. > */ > vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | > +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > + mm->def_flags | VM_MAYREAD | vm_maywrite | VM_MAYEXEC; > +#else > mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; > +#endif > > if (flags & MAP_LOCKED) > if (!can_do_mlock()) > diff --git a/mm/rmap.c b/mm/rmap.c > index e5dfe2ae6b0d..503612d3b52b 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -1192,7 +1192,11 @@ void page_add_file_rmap(struct page *page, bool compound) > } > if (!atomic_inc_and_test(compound_mapcount_ptr(page))) > goto out; > + > +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > VM_BUG_ON_PAGE(!PageSwapBacked(page), page); > +#endif > + > __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); > } else { > if (PageTransCompound(page) && page_mapping(page)) { > @@ -1232,7 +1236,11 @@ static void page_remove_file_rmap(struct page *page, bool compound) > } > if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) > goto out; > + > +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP > VM_BUG_ON_PAGE(!PageSwapBacked(page), page); > +#endif > + > __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); > } else { > if (!atomic_add_negative(-1, &page->_mapcount)) > -- > 2.21.0 >
On 7/29/19 4:51 PM, Song Liu wrote: > >> +#define HPAGE_PMD_OFFSET (HPAGE_PMD_SIZE - 1) > ^ space vs. tab difference here. Thanks, good catch! > >> +#define HPAGE_PMD_MASK (~(HPAGE_PMD_OFFSET)) >> + >> +#define HPAGE_PUD_SHIFT PUD_SHIFT >> +#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) >> +#define HPAGE_PUD_OFFSET (HPAGE_PUD_SIZE - 1) Saw this one, too. > Should HPAGE_PMD_OFFSET and HPAGE_PUD_OFFSET include bits for > PAGE_OFFSET? I guess we can just keep huge_mm.h as-is and use > ~HPAGE_PMD_MASK. That's what I had intended; would you rather see those macros omit the bits that are unneeded for the larger page size? >> - * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page. >> + * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page No - this came in as part of patch 1/2 and I missed dropping the period at the end of the line that caused this to be a diff, so I will put it back. :-) > We have been using name "xas" for "struct xa_state *". Let's keep using it? Thanks, done. >> + if (unlikely(!(PageCompound(new_page)))) { > > What condition triggers this case I wanted a check to make sure that __page_cache_alloc() returned a large page. I don't recall if the mechanism guarantees that when you ask for a large page, you get one, so I wanted to handle that case. If you prefer, I could make this a VM_BUG_ON_PAGE() instead, but I wanted it to fall back gracefully if it can't get a properly sized page. >> +#ifndef COMPOUND_PAGES_HEAD_ONLY > > Where do we define COMPOUND_PAGES_HEAD_ONLY? At present, we do not. I used this so I could include the code that would be needed once Matthew's "store only head pages in page cache" changes go back in, which looks like it may not be until 5.4-rc1. Matthew recommended I include this so we didn't lose track of the code change that would be needed then. I'll be talking to him today about this and the issues you raised regarding patch 1/2. Thanks for going through this!! -- Bill
> On Jul 30, 2019, at 7:11 AM, William Kucharski <william.kucharski@oracle.com> wrote: > > > > On 7/29/19 4:51 PM, Song Liu wrote: > >> >>> +#define HPAGE_PMD_OFFSET (HPAGE_PMD_SIZE - 1) >> ^ space vs. tab difference here. > > Thanks, good catch! > >>> +#define HPAGE_PMD_MASK (~(HPAGE_PMD_OFFSET)) >>> + >>> +#define HPAGE_PUD_SHIFT PUD_SHIFT >>> +#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) >>> +#define HPAGE_PUD_OFFSET (HPAGE_PUD_SIZE - 1) > > Saw this one, too. > >> Should HPAGE_PMD_OFFSET and HPAGE_PUD_OFFSET include bits for >> PAGE_OFFSET? I guess we can just keep huge_mm.h as-is and use >> ~HPAGE_PMD_MASK. > > That's what I had intended; would you rather see those macros > omit the unneeded for the larger page size bits? I think using ~HPAGE_PMD_MASK is common practice. Let's keep it that way. > >>> - * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page. >>> + * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page > > No - this came in as part of patch 1/2 and I missed dropping the period at the end of the line that caused this to be a diff, so I will put it > back. :-) > >> We have been using name "xas" for "struct xa_state *". Let's keep using it? > > Thanks, done. > >>> + if (unlikely(!(PageCompound(new_page)))) { >> What condition triggers this case > I wanted a check to make sure that __page_cacke_alloc() returned a large page. I don't recall if the mechanism guarantees that when you ask for > a large page, you get one, so I wanted to handle that case. > > If you prefer, I could make this a VM_BUG_ON_PAGE() instead, but I > wanted it to fallback gracefully if it can't get a properly sized > page. I think __page_cache_alloc() guarantees compound page. If not, it should return NULL. > >>> +#ifndef COMPOUND_PAGES_HEAD_ONLY >> Where do we define COMPOUND_PAGES_HEAD_ONLY? > > At present, we do not. 
> > I used this so I could include the code that would be needed once > Matthew's "store only head pages in page cache" changes go back in, > which looks like it may not be until 5.4-rc1. Matthew recommended I We don't have to wait until 5.4-rc1. We could develop based on this patch once it lands in mm tree. Thanks, Song
On Mon, Jul 29, 2019 at 03:47:18PM -0700, Dan Williams wrote: > On Mon, Jul 29, 2019 at 2:10 PM William Kucharski > <william.kucharski@oracle.com> wrote: > > > > Add filemap_huge_fault() to attempt to satisfy page faults on > > memory-mapped read-only text pages using THP when possible. > > > > Signed-off-by: William Kucharski <william.kucharski@oracle.com> > [..] > > +/** > > + * filemap_huge_fault - read in file data for page fault handling to THP > > + * @vmf: struct vm_fault containing details of the fault > > + * @pe_size: large page size to map, currently this must be PE_SIZE_PMD > > + * > > + * filemap_huge_fault() is invoked via the vma operations vector for a > > + * mapped memory region to read in file data to a transparent huge page during > > + * a page fault. > > + * > > + * If for any reason we can't allocate a THP, map it or add it to the page > > + * cache, VM_FAULT_FALLBACK will be returned which will cause the fault > > + * handler to try mapping the page using a PAGESIZE page, usually via > > + * filemap_fault() if so speicifed in the vma operations vector. > > + * > > + * Returns either VM_FAULT_FALLBACK or the result of calling allcc_set_pte() > > + * to map the new THP. > > + * > > + * NOTE: This routine depends upon the file system's readpage routine as > > + * specified in the address space operations vector to recognize when it > > + * is being passed a large page and to read the approprate amount of data > > + * in full and without polluting the page cache for the large page itself > > + * with PAGESIZE pages to perform a buffered read or to pollute what > > + * would be the page cache space for any succeeding pages with PAGESIZE > > + * pages due to readahead. > > + * > > + * It is VITAL that this routine not be enabled without such filesystem > > + * support. > > Rather than a hopeful comment, this wants an explicit mechanism to > prevent inadvertent mismatched ->readpage() assumptions. Filesystems have to opt in to this. 
If they add a ->huge_fault entry to their vm_operations_struct without updating their ->readpage implementation, they only have themselves to blame.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..34723f7e75d0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -79,13 +79,15 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define HPAGE_PMD_SHIFT PMD_SHIFT -#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) -#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) - -#define HPAGE_PUD_SHIFT PUD_SHIFT -#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) -#define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) +#define HPAGE_PMD_SHIFT PMD_SHIFT +#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) +#define HPAGE_PMD_OFFSET (HPAGE_PMD_SIZE - 1) +#define HPAGE_PMD_MASK (~(HPAGE_PMD_OFFSET)) + +#define HPAGE_PUD_SHIFT PUD_SHIFT +#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) +#define HPAGE_PUD_OFFSET (HPAGE_PUD_SIZE - 1) +#define HPAGE_PUD_MASK (~(HPAGE_PUD_OFFSET)) extern bool is_vma_temporary_stack(struct vm_area_struct *vma); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0334ca97c584..ba24b515468a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2433,6 +2433,12 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); + +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP +extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size); +#endif + extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); diff --git a/mm/Kconfig b/mm/Kconfig index 56cec636a1fc..2debaded0e4d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -736,4 +736,19 @@ config ARCH_HAS_PTE_SPECIAL config ARCH_HAS_HUGEPD bool +config RO_EXEC_FILEMAP_HUGE_FAULT_THP + bool "read-only exec filemap_huge_fault THP support (EXPERIMENTAL)" + depends on TRANSPARENT_HUGE_PAGECACHE && 
SHMEM + + help + Introduce filemap_huge_fault() to automatically map executable + read-only pages of mapped files of suitable size and alignment + using THP if possible. + + This is marked experimental because it is a new feature and is + dependent upon filesystmes implementing readpages() in a way + that will recognize large THP pages and read file content to + them without polluting the pagecache with PAGESIZE pages due + to readahead. + endmenu diff --git a/mm/filemap.c b/mm/filemap.c index a96092243fc4..4e7287db0d8e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -199,6 +199,8 @@ static void unaccount_page_cache_page(struct address_space *mapping, nr = hpage_nr_pages(page); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) @@ -206,6 +208,13 @@ static void unaccount_page_cache_page(struct address_space *mapping, } else { VM_BUG_ON_PAGE(PageTransHuge(page), page); } +#else + if (PageSwapBacked(page)) + __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + + if (PageTransHuge(page)) + __dec_node_page_state(page, NR_SHMEM_THPS); +#endif /* * At this point page must be either written or cleaned by @@ -1615,7 +1624,7 @@ EXPORT_SYMBOL(find_lock_entry); * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do * its own locking dance if the page is already in cache, or unlock the page * before returning if we had to add the page to pagecache. - * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page. + * - FGP_PMD: If FGP_CREAT is specified, attempt to allocate a PMD-sized page * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. 
@@ -2642,6 +2651,291 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP +/* + * Check for an entry in the page cache which would conflict with the address + * range we wish to map using a THP or is otherwise unusable to map a large + * cached page. + * + * The routine will return true if a usable page is found in the page cache + * (and *pagep will be set to the address of the cached page), or if no + * cached page is found (and *pagep will be set to NULL). + */ +static bool +filemap_huge_check_pagecache_usable(struct xa_state *xasp, + struct page **pagep, pgoff_t hindex, pgoff_t hindex_max) +{ + struct page *page; + + while (1) { + page = xas_find(xasp, hindex_max); + + if (xas_retry(xasp, page)) { + xas_set(xasp, hindex); + continue; /* restart the lookup from hindex */ + } + + /* + * A found entry is unusable if: + * + the entry is an Xarray value, not a pointer + * + the entry is an internal Xarray node + * + the entry is not a Transparent Huge Page + * + the entry is not a compound page + * + the entry is not the head of a compound page + * + the entry is a page with an order other than + * HPAGE_PMD_ORDER + * + the page's index is not what we expect it to be + * + the page is not up-to-date + * + the page is unlocked + */ + if ((page) && (xa_is_value(page) || xa_is_internal(page) || + (!PageCompound(page)) || (PageHuge(page)) || + (!PageTransCompound(page)) || + page != compound_head(page) || + compound_order(page) != HPAGE_PMD_ORDER || + page->index != hindex || (!PageUptodate(page)) || + (!PageLocked(page)))) + return false; /* *pagep is not written on this path */ + + break; + } + + xas_set(xasp, hindex); /* reset xasp to hindex before returning */ + *pagep = page; + return true; +} + +/** + * filemap_huge_fault - read in file data for page fault handling to THP + * @vmf: struct vm_fault containing details of the fault + * @pe_size: large page size to map, currently this must be PE_SIZE_PMD + * + * filemap_huge_fault() is invoked via the vma operations vector for a + * mapped memory region to 
read in file data to a transparent huge page during + * a page fault. + * + * If for any reason we can't allocate a THP, map it or add it to the page + * cache, VM_FAULT_FALLBACK will be returned which will cause the fault + * handler to try mapping the page using a PAGESIZE page, usually via + * filemap_fault() if so specified in the vma operations vector. + * + * Returns either VM_FAULT_FALLBACK or the result of calling alloc_set_pte() + * to map the new THP. + * + * NOTE: This routine depends upon the file system's readpage routine as + * specified in the address space operations vector to recognize when it + * is being passed a large page and to read the appropriate amount of data + * in full and without polluting the page cache for the large page itself + * with PAGESIZE pages to perform a buffered read, or polluting what + * would be the page cache space for any succeeding pages with PAGESIZE + * pages due to readahead. + * + * It is VITAL that this routine not be enabled without such filesystem + * support. As there is no way to determine how many bytes were read by + * the readpage() operation, if only a PAGESIZE page is read, this routine + * will map the THP containing only the first PAGESIZE bytes of file data + * to satisfy the fault, which is never the result desired. 
+ */ +vm_fault_t filemap_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + struct file *filp = vmf->vma->vm_file; + struct address_space *mapping = filp->f_mapping; + struct vm_area_struct *vma = vmf->vma; + + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; /* fault address with the bits outside HPAGE_PMD_MASK cleared */ + pgoff_t hindex = round_down(vmf->pgoff, HPAGE_PMD_NR); /* fault pgoff rounded down to a multiple of HPAGE_PMD_NR */ + pgoff_t hindex_max = hindex + HPAGE_PMD_NR; /* upper bound handed to the page cache lookup helper */ + + struct page *cached_page, *hugepage; + struct page *new_page = NULL; /* stays NULL when an already cached page is mapped */ + + vm_fault_t ret = VM_FAULT_FALLBACK; + int error; + + XA_STATE_ORDER(xas, &mapping->i_pages, hindex, HPAGE_PMD_ORDER); + + /* + * Return VM_FAULT_FALLBACK if: + * + * + pe_size != PE_SIZE_PMD + * + FAULT_FLAG_WRITE is set in vmf->flags + * + vma isn't aligned to allow a PMD mapping + * + PMD would extend beyond the end of the vma + */ + if (pe_size != PE_SIZE_PMD || (vmf->flags & FAULT_FLAG_WRITE) || + (haddr < vma->vm_start || + (haddr + HPAGE_PMD_SIZE > vma->vm_end))) + return ret; + + xas_lock_irq(&xas); + +retry_xas_locked: + if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex, + hindex_max)) { + /* found a conflicting entry in the page cache, so fallback */ + goto unlock; + } else if (cached_page) { + /* found a valid cached page, so map it */ + hugepage = cached_page; + goto map_huge; + } + + xas_unlock_irq(&xas); + + /* allocate huge THP page in VMA */ + new_page = __page_cache_alloc(vmf->gfp_mask | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY, HPAGE_PMD_ORDER); + + if (unlikely(!new_page)) + return ret; + + if (unlikely(!(PageCompound(new_page)))) { + put_page(new_page); + return ret; + } + + prep_transhuge_page(new_page); + new_page->index = hindex; + new_page->mapping = mapping; + + __SetPageLocked(new_page); + + /* + * The readpage() operation below is expected to fill the large + * page with data without polluting the page cache with + * PAGESIZE entries due to a buffered read and/or readahead(). 
+ * + * A filesystem's vm_operations_struct huge_fault field should + * never point to this routine without such a capability, and + * without it a call to this routine would eventually just + * fall through to the normal fault op anyway. + */ + error = mapping->a_ops->readpage(vmf->vma->vm_file, new_page); + + if (unlikely(error)) { + put_page(new_page); /* NOTE(review): unlike the later failure paths, new_page->mapping is not reset to NULL before this put_page() -- confirm intended */ + return ret; + } + + /* XXX - use wait_on_page_locked_killable() instead? */ + wait_on_page_locked(new_page); + + if (!PageUptodate(new_page)) { + /* EIO */ + new_page->mapping = NULL; + put_page(new_page); + return ret; + } + + do { + xas_lock_irq(&xas); + xas_set(&xas, hindex); + xas_create_range(&xas); + + if (!(xas_error(&xas))) + break; /* leaves the loop with the xas lock still held */ + + if (!xas_nomem(&xas, GFP_KERNEL)) { /* NOTE(review): called while the xas lock taken at the top of this loop is still held -- confirm a GFP_KERNEL allocation is permitted in that context */ + if (new_page) { + new_page->mapping = NULL; + put_page(new_page); + } + + goto unlock; + } + + xas_unlock_irq(&xas); + } while (1); + + /* + * Double check that an entry did not sneak into the page cache while + * creating Xarray entries for the new page. + */ + if (!filemap_huge_check_pagecache_usable(&xas, &cached_page, hindex, + hindex_max)) { + /* + * An unusable entry was found, so delete the newly allocated + * page and fallback. + */ + new_page->mapping = NULL; + put_page(new_page); + goto unlock; + } else if (cached_page) { + /* + * A valid large page was found in the page cache, so free the + * newly allocated page and map the cached page instead. + */ + new_page->mapping = NULL; + put_page(new_page); + new_page = NULL; + hugepage = cached_page; + goto map_huge; + } + + __SetPageLocked(new_page); /* second __SetPageLocked() on new_page; the first one precedes the readpage() call */ + + /* did it get truncated? 
*/ + if (unlikely(new_page->mapping != mapping)) { + unlock_page(new_page); + put_page(new_page); + goto retry_xas_locked; /* NOTE(review): new_page still holds the pointer just passed to put_page() -- confirm the retry cannot reach the new_page tests after map_huge with it */ + } + + hugepage = new_page; + +map_huge: + /* map hugepage at the PMD level; every path reaching here holds the xas lock */ + ret = alloc_set_pte(vmf, NULL, hugepage); + + VM_BUG_ON_PAGE((!(pmd_trans_huge(*vmf->pmd))), hugepage); /* evaluated before ret is checked for VM_FAULT_ERROR */ + + if (likely(!(ret & VM_FAULT_ERROR))) { + /* + * The alloc_set_pte() succeeded without error, so + * add the page to the page cache if it is new, and + * increment page statistics accordingly. + */ + if (new_page) { + unsigned long nr; + + xas_set(&xas, hindex); + + for (nr = 0; nr < HPAGE_PMD_NR; nr++) { +#ifndef COMPOUND_PAGES_HEAD_ONLY + xas_store(&xas, new_page + nr); +#else + xas_store(&xas, new_page); +#endif + xas_next(&xas); + } + + count_vm_event(THP_FILE_ALLOC); + __inc_node_page_state(new_page, NR_SHMEM_THPS); + __mod_node_page_state(page_pgdat(new_page), + NR_FILE_PAGES, HPAGE_PMD_NR); + __mod_node_page_state(page_pgdat(new_page), + NR_SHMEM, HPAGE_PMD_NR); + } + + vmf->address = haddr; + vmf->page = hugepage; + + page_ref_add(hugepage, HPAGE_PMD_NR); + count_vm_event(THP_FILE_MAPPED); + } else if (new_page) { + /* there was an error mapping the new page, so release it */ + new_page->mapping = NULL; + put_page(new_page); + } + +unlock: + xas_unlock_irq(&xas); + return ret; +} +EXPORT_SYMBOL(filemap_huge_fault); +#endif + void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { @@ -2924,7 +3218,8 @@ struct page *read_cache_page(struct address_space *mapping, EXPORT_SYMBOL(read_cache_page); /** - * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * read_cache_page_gfp - read into page cache, using specified page allocation + * flags. 
* @mapping: the page's address_space * @index: the page index * @gfp: the page allocator flags to use if allocating diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1334ede667a8..26d74466d1f7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -543,8 +543,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) goto out; + +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) goto out; +#endif addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); if (addr) diff --git a/mm/mmap.c b/mm/mmap.c index 7e8c3e8ae75f..96ff80d2a8fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1391,6 +1391,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; int pkey = 0; +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP + unsigned long vm_maywrite = VM_MAYWRITE; +#endif + *populate = 0; if (!len) @@ -1429,7 +1433,33 @@ unsigned long do_mmap(struct file *file, unsigned long addr, /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ - addr = get_unmapped_area(file, addr, len, pgoff, flags); + +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP + /* + * If THP is enabled, it's a read-only executable that is + * MAP_PRIVATE mapped, the length is at least that of a PMD page, + * it's not a MAP_FIXED mapping and the passed address is + * properly aligned for a PMD page, attempt to get an appropriate + * address at which to map a PMD-sized THP page, otherwise call the + * normal routine. 
+ */ + if ((prot & PROT_READ) && (prot & PROT_EXEC) && + (!(prot & PROT_WRITE)) && (flags & MAP_PRIVATE) && + (!(flags & MAP_FIXED)) && len >= HPAGE_PMD_SIZE && + (!(addr & HPAGE_PMD_OFFSET))) { + addr = thp_get_unmapped_area(file, addr, len, pgoff, flags); + + if (addr && (!(addr & HPAGE_PMD_OFFSET))) + vm_maywrite = 0; + else + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } else { +#endif + addr = get_unmapped_area(file, addr, len, pgoff, flags); +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP + } +#endif + if (offset_in_page(addr)) return addr; @@ -1451,7 +1481,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * of the memory object, so we don't do any here. */ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | +#ifdef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP + mm->def_flags | VM_MAYREAD | vm_maywrite | VM_MAYEXEC; +#else mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; +#endif if (flags & MAP_LOCKED) if (!can_do_mlock()) diff --git a/mm/rmap.c b/mm/rmap.c index e5dfe2ae6b0d..503612d3b52b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1192,7 +1192,11 @@ void page_add_file_rmap(struct page *page, bool compound) } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; + +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP VM_BUG_ON_PAGE(!PageSwapBacked(page), page); +#endif + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); } else { if (PageTransCompound(page) && page_mapping(page)) { @@ -1232,7 +1236,11 @@ static void page_remove_file_rmap(struct page *page, bool compound) } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) goto out; + +#ifndef CONFIG_RO_EXEC_FILEMAP_HUGE_FAULT_THP VM_BUG_ON_PAGE(!PageSwapBacked(page), page); +#endif + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount))
Add filemap_huge_fault() to attempt to satisfy page faults on memory-mapped read-only text pages using THP when possible. Signed-off-by: William Kucharski <william.kucharski@oracle.com> --- include/linux/huge_mm.h | 16 ++- include/linux/mm.h | 6 + mm/Kconfig | 15 ++ mm/filemap.c | 299 +++++++++++++++++++++++++++++++++++++++- mm/huge_memory.c | 3 + mm/mmap.c | 36 ++++- mm/rmap.c | 8 ++ 7 files changed, 373 insertions(+), 10 deletions(-)