Message ID | 20240829235415.57374fc3@imladris.surriel.com (mailing list archive)
---|---
State | New
Series | mm,tmpfs: consider end of file write in shmem_is_huge
On Thu, Aug 29, 2024 at 11:54:15PM -0400, Rik van Riel wrote:
> Take the end of a file write into consideration when deciding whether
> or not to use huge folios for tmpfs files when the tmpfs filesystem is
> mounted with huge=within_size.
>
> This allows large writes that append to the end of a file to
> automatically use large folios.
>
> Doing 4MB sequential writes without fallocate to a 16GB tmpfs file:
> - 4kB pages: 1560 MB/s
> - huge=within_size: 4720 MB/s
> - huge=always: 4720 MB/s
>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---

[snip diffstat and the xfile_load hunk]

> @@ -196,7 +196,7 @@ xfile_store(
>  		unsigned int	len;
>  		unsigned int	offset;
>
> -		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> +		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,

Technically speaking, the "0" here could be (pos + count), though for
the current xfile users this isn't likely to make much difference
because online fsck's index building only appends small amounts of data
(i.e. not larger than a PAGE_SIZE) at a time.

>  				SGP_CACHE) < 0)
>  			break;
>  		if (filemap_check_wb_err(inode->i_mapping, 0)) {
> @@ -267,7 +267,7 @@ xfile_get_folio(
>  		i_size_write(inode, pos + len);
>
>  	pflags = memalloc_nofs_save();
> -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,

This 0 could be pos + len, since the only caller is xfarray_sort, which
runs much faster when it can heapsort a large folio's worth of data at
a time.

>  			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
>  	memalloc_nofs_restore(pflags);
>  	if (error)
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 9bb2d24de709..07bebbfb16ee 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -149,7 +149,7 @@ xmbuf_map_page(
>  		return -ENOMEM;
>  	}
>
> -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
> +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);

The "0" here could be (pos + BBTOB(bp->length)) since we're likely going
to write there soon.  Granted, no current user of xmbufs actually uses a
blocksize larger than PAGE_SIZE, but in theory we could someday turn
that on.

Everything below here looks sane enough to me, but I'm not that much of
an expert on mm/ things outside of the pagecache and shmem.c.

--D

[snip the rest of the quoted patch]
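Rendered as a hunk, Darrick's (pos + count) suggestion for xfile_store()
would look roughly like this; an illustrative sketch, not a posted patch
(count is the xfile_store() byte-count parameter, and Rik's reply below
questions whether it would matter under SGP_CACHE):

-		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, pos + count, &folio,
 				SGP_CACHE) < 0)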
On Thu, 2024-08-29 at 22:52 -0700, Darrick J. Wong wrote:
> On Thu, Aug 29, 2024 at 11:54:15PM -0400, Rik van Riel wrote:
> >
> > @@ -196,7 +196,7 @@ xfile_store(
> >  		unsigned int	len;
> >  		unsigned int	offset;
> >
> > -		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> > +		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
>
> Technically speaking, the "0" here could be (pos + count), though for
> the current xfile users this isn't likely to make much difference
> because online fsck's index building only appends small amounts of
> data (i.e. not larger than a PAGE_SIZE) at a time.
>
> >  				SGP_CACHE) < 0)

With SGP_CACHE, won't shmem_get_folio simply refuse to allocate
any pages beyond the end of the inode?

	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
		return -EINVAL;

> >  			break;
> >  		if (filemap_check_wb_err(inode->i_mapping, 0)) {
> > @@ -267,7 +267,7 @@ xfile_get_folio(
> >  		i_size_write(inode, pos + len);
> >
> >  	pflags = memalloc_nofs_save();
> > -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> > +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
>
> This 0 could be pos + len, since the only caller is xfarray_sort,
> which runs much faster when it can heapsort a large folio's worth of
> data at a time.
>
> >  			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);

The same applies here.

> >  	memalloc_nofs_restore(pflags);
> >  	if (error)
> > diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> > index 9bb2d24de709..07bebbfb16ee 100644
> > --- a/fs/xfs/xfs_buf_mem.c
> > +++ b/fs/xfs/xfs_buf_mem.c
> > @@ -149,7 +149,7 @@ xmbuf_map_page(
> >  		return -ENOMEM;
> >  	}
> >
> > -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
> > +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
>
> The "0" here could be (pos + BBTOB(bp->length)) since we're likely
> going to write there soon.  Granted, no current user of xmbufs
> actually uses a blocksize larger than PAGE_SIZE, but in theory we
> could someday turn that on.
>
> Everything below here looks sane enough to me, but I'm not that much
> of an expert on mm/ things outside of the pagecache and shmem.c.

... and here.

XFS is not using an SGP flag that allows shmem_get_folio to allocate
a page beyond the end of i_size.
On Fri, Aug 30, 2024 at 09:11:32AM -0400, Rik van Riel wrote:
> On Thu, 2024-08-29 at 22:52 -0700, Darrick J. Wong wrote:
> > On Thu, Aug 29, 2024 at 11:54:15PM -0400, Rik van Riel wrote:
> > >
> > > -		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> > > +		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> >
> > Technically speaking, the "0" here could be (pos + count), though for
> > the current xfile users this isn't likely to make much difference
> > because online fsck's index building only appends small amounts of
> > data (i.e. not larger than a PAGE_SIZE) at a time.
> >
> > >  				SGP_CACHE) < 0)
>
> With SGP_CACHE, won't shmem_get_folio simply refuse to allocate
> any pages beyond the end of the inode?

Yes, though we're careful to i_size_write appropriately beforehand such
that @index is always within EOF.

--D

> 	if (sgp <= SGP_CACHE &&
> 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
> 		return -EINVAL;

[snip the rest of the quote]
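To see why the zero write_end is harmless here, the following minimal
userspace model replays the ordering Darrick describes: xfile_get_folio()
extends i_size before the SGP_CACHE lookup, so the index is always within
EOF. The constants and the scenario in main() are illustrative
assumptions, not taken from the patch.

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12

	/* Model of the SGP_CACHE end-of-file check quoted in this subthread. */
	static bool sgp_cache_refuses(unsigned long index, long long i_size)
	{
		return ((long long)index << PAGE_SHIFT) >= i_size;
	}

	int main(void)
	{
		long long pos = 3LL << PAGE_SHIFT;	/* storing into the 4th page */
		long long len = 1LL << PAGE_SHIFT;
		long long i_size = 2LL << PAGE_SHIFT;	/* file currently 2 pages */
		unsigned long index = pos >> PAGE_SHIFT;

		/* Without growing i_size first, an SGP_CACHE lookup would fail. */
		printf("before i_size_write: refused=%d\n",
		       sgp_cache_refuses(index, i_size));

		/* xfile_get_folio() does i_size_write(inode, pos + len) up front. */
		i_size = pos + len;
		printf("after  i_size_write: refused=%d\n",
		       sgp_cache_refuses(index, i_size));
		return 0;
	}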
On 2024/8/30 11:54, Rik van Riel wrote:
> Take the end of a file write into consideration when deciding whether
> or not to use huge folios for tmpfs files when the tmpfs filesystem is
> mounted with huge=within_size.
>
> This allows large writes that append to the end of a file to
> automatically use large folios.

Makes sense to me.

> Doing 4MB sequential writes without fallocate to a 16GB tmpfs file:
> - 4kB pages: 1560 MB/s
> - huge=within_size: 4720 MB/s
> - huge=always: 4720 MB/s
>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---

[snip]

> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5a77acf6ac6a..964c24fc480f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -548,7 +548,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
>
>  static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
>
> -static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
> +static bool __shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
>  			    bool shmem_huge_force, struct mm_struct *mm,
>  			    unsigned long vm_flags)
>  {
> @@ -568,7 +568,8 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
>  		return true;
>  	case SHMEM_HUGE_WITHIN_SIZE:
>  		index = round_up(index + 1, HPAGE_PMD_NR);
> -		i_size = round_up(i_size_read(inode), PAGE_SIZE);
> +		i_size = max(write_end, i_size_read(inode));
> +		i_size = round_up(i_size, PAGE_SIZE);
>  		if (i_size >> PAGE_SHIFT >= index)
>  			return true;
>  		fallthrough;

shmem_is_huge() is no longer exported and has been renamed to
shmem_huge_global_enabled() by the series [1], so you need to rebase on
the latest mm-unstable branch.

[1] https://lore.kernel.org/all/cover.1721626645.git.baolin.wang@linux.alibaba.com/T/#md2580130f990af0b1428010bfb4cc789bb865136
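To make the huge=within_size arithmetic concrete, here is a small
userspace model of the SHMEM_HUGE_WITHIN_SIZE branch as changed by this
patch; the page and PMD constants assume x86-64 with 2MB huge pages, and
the helper names are a standalone sketch rather than kernel code:

	#include <stdbool.h>
	#include <stdio.h>

	#define MODEL_PAGE_SIZE		4096ULL
	#define MODEL_PAGE_SHIFT	12
	#define MODEL_HPAGE_PMD_NR	512ULL	/* 2MB / 4kB */

	static unsigned long long round_up_to(unsigned long long val,
					      unsigned long long to)
	{
		return ((val + to - 1) / to) * to;
	}

	/* Mirrors the SHMEM_HUGE_WITHIN_SIZE case of __shmem_is_huge() above. */
	static bool within_size_allows_huge(unsigned long long index,
					    long long i_size_now, long long write_end)
	{
		unsigned long long i_size;

		index = round_up_to(index + 1, MODEL_HPAGE_PMD_NR);
		i_size = write_end > i_size_now ? write_end : i_size_now; /* max() */
		i_size = round_up_to(i_size, MODEL_PAGE_SIZE);
		return (i_size >> MODEL_PAGE_SHIFT) >= index;
	}

	int main(void)
	{
		/* A 4MB write() landing at offset 0 of an empty tmpfs file: */
		printf("write_end=0   (old behaviour): huge=%d\n",
		       within_size_allows_huge(0, 0, 0));
		printf("write_end=4MB (this patch):    huge=%d\n",
		       within_size_allows_huge(0, 0, 4LL << 20));
		return 0;
	}

With write_end=0 the rounded i_size is still 0, so the first PMD-sized
region does not qualify; passing pos + len from shmem_write_begin() makes
the same append qualify for a huge folio.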
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index d848222f802b..e6e1c1fd23cb 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -126,7 +126,7 @@ xfile_load(
 		unsigned int	len;
 		unsigned int	offset;
 
-		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 				SGP_READ) < 0)
 			break;
 		if (!folio) {
@@ -196,7 +196,7 @@ xfile_store(
 		unsigned int	len;
 		unsigned int	offset;
 
-		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 				SGP_CACHE) < 0)
 			break;
 		if (filemap_check_wb_err(inode->i_mapping, 0)) {
@@ -267,7 +267,7 @@ xfile_get_folio(
 		i_size_write(inode, pos + len);
 
 	pflags = memalloc_nofs_save();
-	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
 	memalloc_nofs_restore(pflags);
 	if (error)
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 9bb2d24de709..07bebbfb16ee 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -149,7 +149,7 @@ xmbuf_map_page(
 		return -ENOMEM;
 	}
 
-	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
 	if (error)
 		return error;
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 1d06b1e5408a..846c1ea91f50 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,13 +111,15 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 int shmem_unuse(unsigned int type);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-			  struct mm_struct *mm, unsigned long vm_flags);
+extern bool shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
+			  bool shmem_huge_force, struct mm_struct *mm,
+			  unsigned long vm_flags);
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
 				bool global_huge);
 #else
-static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index,
+					  loff_t write_end, bool shmem_huge_force,
 					  struct mm_struct *mm, unsigned long vm_flags)
 {
 	return false;
@@ -150,8 +152,8 @@ enum sgp_type {
 	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
 
-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-		enum sgp_type sgp);
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+		struct folio **foliop, enum sgp_type sgp);
 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
 		pgoff_t index, gfp_t gfp);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 67c86a5d64a6..8c09071e78cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -160,7 +160,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	 * own flags.
 	 */
 	if (!in_pf && shmem_file(vma->vm_file)) {
-		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
+		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, 0,
 						 !enforce_sysfs, vma->vm_mm, vm_flags);
 
 		if (!vma_is_anon_shmem(vma))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cdd1d8655a76..0ebabff10f97 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1866,7 +1866,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 		if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
 			xas_unlock_irq(&xas);
 			/* swap in or instantiate fallocated page */
-			if (shmem_get_folio(mapping->host, index,
+			if (shmem_get_folio(mapping->host, index, 0,
 					    &folio, SGP_NOALLOC)) {
 				result = SCAN_FAIL;
 				goto xa_unlocked;
diff --git a/mm/shmem.c b/mm/shmem.c
index 5a77acf6ac6a..964c24fc480f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -548,7 +548,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
 
 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
 
-static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
+static bool __shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
 			    bool shmem_huge_force, struct mm_struct *mm,
 			    unsigned long vm_flags)
 {
@@ -568,7 +568,8 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
 		return true;
 	case SHMEM_HUGE_WITHIN_SIZE:
 		index = round_up(index + 1, HPAGE_PMD_NR);
-		i_size = round_up(i_size_read(inode), PAGE_SIZE);
+		i_size = max(write_end, i_size_read(inode));
+		i_size = round_up(i_size, PAGE_SIZE);
 		if (i_size >> PAGE_SHIFT >= index)
 			return true;
 		fallthrough;
@@ -581,14 +582,14 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
 	}
 }
 
-bool shmem_is_huge(struct inode *inode, pgoff_t index,
+bool shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
 		   bool shmem_huge_force, struct mm_struct *mm,
 		   unsigned long vm_flags)
 {
 	if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
 		return false;
 
-	return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags);
+	return __shmem_is_huge(inode, index, write_end, shmem_huge_force, mm, vm_flags);
 }
 
 #if defined(CONFIG_SYSFS)
@@ -971,7 +972,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	 * (although in some cases this is just a waste of time).
 	 */
 	folio = NULL;
-	shmem_get_folio(inode, index, &folio, SGP_READ);
+	shmem_get_folio(inode, index, 0, &folio, SGP_READ);
 	return folio;
 }
 
@@ -1156,7 +1157,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
 			 STATX_ATTR_NODUMP);
 	generic_fillattr(idmap, request_mask, inode, stat);
 
-	if (shmem_is_huge(inode, 0, false, NULL, 0))
+	if (shmem_is_huge(inode, 0, 0, false, NULL, 0))
 		stat->blksize = HPAGE_PMD_SIZE;
 
 	if (request_mask & STATX_BTIME) {
@@ -2078,8 +2079,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
  */
 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
-		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
-		struct vm_fault *vmf, vm_fault_t *fault_type)
+		loff_t write_end, struct folio **foliop, enum sgp_type sgp,
+		gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
 {
 	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
 	struct mm_struct *fault_mm;
@@ -2158,7 +2159,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 		return 0;
 	}
 
-	huge = shmem_is_huge(inode, index, false, fault_mm,
+	huge = shmem_is_huge(inode, index, write_end, false, fault_mm,
 			     vma ? vma->vm_flags : 0);
 	/* Find hugepage orders that are allowed for anonymous shmem. */
 	if (vma && vma_is_anon_shmem(vma))
@@ -2268,6 +2269,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
  * shmem_get_folio - find, and lock a shmem folio.
  * @inode:	inode to search
  * @index:	the page index.
+ * @write_end:	end of a write, could extend inode size.
 * @foliop:	pointer to the folio if found
 * @sgp:	SGP_* flags to control behavior
 *
@@ -2287,10 +2289,10 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 * Context: May sleep.
 * Return: 0 if successful, else a negative error code.
 */
-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-		enum sgp_type sgp)
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+		struct folio **foliop, enum sgp_type sgp)
 {
-	return shmem_get_folio_gfp(inode, index, foliop, sgp,
+	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
 			mapping_gfp_mask(inode->i_mapping), NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(shmem_get_folio);
@@ -2385,7 +2387,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	}
 
 	WARN_ON_ONCE(vmf->page != NULL);
-	err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
+	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
 				  gfp, vmf, &ret);
 	if (err)
 		return vmf_error(err);
@@ -2895,7 +2897,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			return -EPERM;
 	}
 
-	ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
 	if (ret)
 		return ret;
 
@@ -2966,7 +2968,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			break;
 		}
 
-		error = shmem_get_folio(inode, index, &folio, SGP_READ);
+		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
 		if (error) {
 			if (error == -EINVAL)
 				error = 0;
@@ -3142,7 +3144,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		if (*ppos >= i_size_read(inode))
 			break;
 
-		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
 					SGP_READ);
 		if (error) {
 			if (error == -EINVAL)
@@ -3332,8 +3334,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
 			error = -ENOMEM;
 		else
-			error = shmem_get_folio(inode, index, &folio,
-						SGP_FALLOC);
+			error = shmem_get_folio(inode, index, offset + len,
+						&folio, SGP_FALLOC);
 		if (error) {
 			info->fallocend = undo_fallocend;
 			/* Remove the !uptodate folios we added */
@@ -3684,7 +3686,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	} else {
 		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &shmem_aops;
-		error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
+		error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
 		if (error)
 			goto out_remove_offset;
 		inode->i_op = &shmem_symlink_inode_operations;
@@ -3730,7 +3732,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
 			return ERR_PTR(-ECHILD);
 		}
 	} else {
-		error = shmem_get_folio(inode, 0, &folio, SGP_READ);
+		error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
 		if (error)
 			return ERR_PTR(error);
 		if (!folio)
@@ -5198,7 +5200,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
 	struct folio *folio;
 	int error;
 
-	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
+	error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
 				    gfp, NULL, NULL);
 	if (error)
 		return ERR_PTR(error);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..cb8c76f8f118 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -391,7 +391,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 	struct page *page;
 	int ret;
 
-	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
 	/* Our caller expects us to return -EFAULT if we failed to find folio */
 	if (ret == -ENOENT)
 		ret = -EFAULT;
Take the end of a file write into consideration when deciding whether
or not to use huge folios for tmpfs files when the tmpfs filesystem is
mounted with huge=within_size.

This allows large writes that append to the end of a file to
automatically use large folios.

Doing 4MB sequential writes without fallocate to a 16GB tmpfs file:
- 4kB pages: 1560 MB/s
- huge=within_size: 4720 MB/s
- huge=always: 4720 MB/s

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 fs/xfs/scrub/xfile.c     |  6 +++---
 fs/xfs/xfs_buf_mem.c     |  2 +-
 include/linux/shmem_fs.h | 12 ++++++-----
 mm/huge_memory.c         |  2 +-
 mm/khugepaged.c          |  2 +-
 mm/shmem.c               | 44 +++++++++++++++++++++-------------------
 mm/userfaultfd.c         |  2 +-
 7 files changed, 37 insertions(+), 33 deletions(-)
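For readers who want to reproduce the numbers above, a plausible driver
is a loop of 4MB write() calls into a freshly created file on a tmpfs
mount; the mount point, file name, and mount options below are
assumptions, since the patch does not include the test program:

	/*
	 * Hypothetical reproduction of the 4MB-sequential-write benchmark.
	 * Assumes something like:
	 *   mount -t tmpfs -o size=17g,huge=within_size tmpfs /mnt/tmpfs
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define CHUNK	(4 << 20)	/* 4MB per write() */
	#define TOTAL	(16LL << 30)	/* 16GB file */

	int main(void)
	{
		static char buf[CHUNK];
		long long done = 0;
		struct timespec t0, t1;
		int fd = open("/mnt/tmpfs/testfile", O_CREAT | O_TRUNC | O_WRONLY, 0600);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		memset(buf, 0x5a, sizeof(buf));

		clock_gettime(CLOCK_MONOTONIC, &t0);
		while (done < TOTAL) {
			if (write(fd, buf, CHUNK) != CHUNK) {
				perror("write");
				return 1;
			}
			done += CHUNK;
		}
		clock_gettime(CLOCK_MONOTONIC, &t1);

		double secs = (t1.tv_sec - t0.tv_sec) +
			      (t1.tv_nsec - t0.tv_nsec) / 1e9;
		printf("%.0f MB/s\n", (double)(TOTAL >> 20) / secs);

		close(fd);
		return 0;
	}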