mm,tmpfs: consider end of file write in shmem_is_huge

Message ID 20240829235415.57374fc3@imladris.surriel.com (mailing list archive)
State New
Series mm,tmpfs: consider end of file write in shmem_is_huge

Commit Message

Rik van Riel Aug. 30, 2024, 3:54 a.m. UTC
Take the end of a file write into consideration when deciding whether
or not to use huge folios for tmpfs files when the tmpfs filesystem is
mounted with huge=within_size.

This allows large writes that append to the end of a file to automatically
use large folios.
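
Concretely, the SHMEM_HUGE_WITHIN_SIZE case in __shmem_is_huge() now folds the
end of the in-flight write into the size check. Here is a self-contained sketch
of that decision (not the kernel code itself; the constants assume 4kB pages and
2MB PMD-sized huge folios, and the helper names are made up for illustration):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define HPAGE_PMD_NR	512ULL			/* 2MB PMD / 4kB pages */

static uint64_t round_up_to(uint64_t x, uint64_t align)
{
	return (x + align - 1) / align * align;
}

/* Mirrors the SHMEM_HUGE_WITHIN_SIZE case in the patch below, write_end included. */
static bool within_size_allows_huge(uint64_t index, uint64_t i_size,
				    uint64_t write_end)
{
	index = round_up_to(index + 1, HPAGE_PMD_NR);
	if (write_end > i_size)		/* the in-flight write may extend the file */
		i_size = write_end;
	i_size = round_up_to(i_size, PAGE_SIZE);
	return (i_size >> PAGE_SHIFT) >= index;
}

int main(void)
{
	/* Appending 4MB to an empty file: index 0, i_size 0, write_end 4MB. */
	assert(within_size_allows_huge(0, 0, 4ULL << 20));	/* huge folio allowed */
	/* Same write with write_end ignored (the old behaviour): 4kB pages only. */
	assert(!within_size_allows_huge(0, 0, 0));
	return 0;
}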

Doing 4MB sequential writes without fallocate to a 16GB tmpfs file:
- 4kB pages:        1560 MB/s
- huge=within_size: 4720 MB/s
- huge=always:      4720 MB/s

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 fs/xfs/scrub/xfile.c     |  6 +++---
 fs/xfs/xfs_buf_mem.c     |  2 +-
 include/linux/shmem_fs.h | 12 ++++++-----
 mm/huge_memory.c         |  2 +-
 mm/khugepaged.c          |  2 +-
 mm/shmem.c               | 44 +++++++++++++++++++++-------------------
 mm/userfaultfd.c         |  2 +-
 7 files changed, 37 insertions(+), 33 deletions(-)
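
The throughput numbers above come from plain buffered writes; a measurement in
that spirit could be driven by something like the following sketch (illustrative
only: the /mnt/tmpfs mount point, file name and exact sizes are assumptions, and
a tmpfs instance mounted with huge=within_size is expected to already exist):

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const size_t chunk = 4UL << 20;			/* one 4MB sequential write */
	unsigned long long total = 16ULL << 30;		/* 16GB file, as in the table above */
	char *buf = malloc(chunk);
	/* Assumes: mount -t tmpfs -o size=17g,huge=within_size tmpfs /mnt/tmpfs */
	int fd = open("/mnt/tmpfs/testfile", O_CREAT | O_TRUNC | O_WRONLY, 0600);

	if (!buf || fd < 0)
		return 1;
	memset(buf, 0xab, chunk);

	/* Appending writes only; no fallocate, so i_size grows with each write. */
	for (unsigned long long done = 0; done < total; done += chunk)
		if (write(fd, buf, chunk) != (ssize_t)chunk)
			return 1;

	close(fd);
	free(buf);
	return 0;
}

Timing the loop (or an equivalent dd/fio run) with huge=never, huge=within_size
and huge=always would give numbers comparable to the table above.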

Comments

Darrick J. Wong Aug. 30, 2024, 5:52 a.m. UTC | #1
On Thu, Aug 29, 2024 at 11:54:15PM -0400, Rik van Riel wrote:
> Take the end of a file write into consideration when deciding whether
> or not to use huge folios for tmpfs files when the tmpfs filesystem is
> mounted with huge=within_size.
> 
> This allows large writes that append to the end of a file to automatically
> use large folios.
> 
> Doing 4MB sequential writes without fallocate to a 16GB tmpfs file:
> - 4kB pages:        1560 MB/s
> - huge=within_size: 4720 MB/s
> - huge=always:      4720 MB/s
> 
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
>  fs/xfs/scrub/xfile.c     |  6 +++---
>  fs/xfs/xfs_buf_mem.c     |  2 +-
>  include/linux/shmem_fs.h | 12 ++++++-----
>  mm/huge_memory.c         |  2 +-
>  mm/khugepaged.c          |  2 +-
>  mm/shmem.c               | 44 +++++++++++++++++++++-------------------
>  mm/userfaultfd.c         |  2 +-
>  7 files changed, 37 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
> index d848222f802b..e6e1c1fd23cb 100644
> --- a/fs/xfs/scrub/xfile.c
> +++ b/fs/xfs/scrub/xfile.c
> @@ -126,7 +126,7 @@ xfile_load(
>  		unsigned int	len;
>  		unsigned int	offset;
>  
> -		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> +		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
>  				SGP_READ) < 0)
>  			break;
>  		if (!folio) {
> @@ -196,7 +196,7 @@ xfile_store(
>  		unsigned int	len;
>  		unsigned int	offset;
>  
> -		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> +		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,

Technically speaking, the "0" here could be (pos + count), though for
the current xfile users this isn't likely to make much difference
because online fsck's index building only appends small amounts of data
(i.e. not larger than a PAGE_SIZE) at a time.

>  				SGP_CACHE) < 0)
>  			break;
>  		if (filemap_check_wb_err(inode->i_mapping, 0)) {
> @@ -267,7 +267,7 @@ xfile_get_folio(
>  		i_size_write(inode, pos + len);
>  
>  	pflags = memalloc_nofs_save();
> -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,

This 0 could be pos + len, since the only caller is xfarray_sort, which
runs much faster when it can heapsort a large folio's worth of data at a
time.

>  			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
>  	memalloc_nofs_restore(pflags);
>  	if (error)
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 9bb2d24de709..07bebbfb16ee 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -149,7 +149,7 @@ xmbuf_map_page(
>  		return -ENOMEM;
>  	}
>  
> -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
> +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);

The "0" here could be (pos + BBTOB(bp->length)) since we're likely going
to write there soon.  Granted, no current user of xmbufs actually uses a
blocksize larger than PAGE_SIZE, but in theory we could someday turn
that on.

Everything below here looks sane enough to me, but I'm not that much of
an expert on mm/ things outside of the pagecache and shmem.c.

--D

>  	if (error)
>  		return error;
>  
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 1d06b1e5408a..846c1ea91f50 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -111,13 +111,15 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
>  int shmem_unuse(unsigned int type);
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
> -			  struct mm_struct *mm, unsigned long vm_flags);
> +extern bool shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
> +	       		  bool shmem_huge_force, struct mm_struct *mm,
> +			  unsigned long vm_flags);
>  unsigned long shmem_allowable_huge_orders(struct inode *inode,
>  				struct vm_area_struct *vma, pgoff_t index,
>  				bool global_huge);
>  #else
> -static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
> +static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index,
> +					  loff_t write_end, bool shmem_huge_force,
>  					  struct mm_struct *mm, unsigned long vm_flags)
>  {
>  	return false;
> @@ -150,8 +152,8 @@ enum sgp_type {
>  	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
>  };
>  
> -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
> -		enum sgp_type sgp);
> +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
> +		    struct folio **foliop, enum sgp_type sgp);
>  struct folio *shmem_read_folio_gfp(struct address_space *mapping,
>  		pgoff_t index, gfp_t gfp);
>  
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 67c86a5d64a6..8c09071e78cd 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -160,7 +160,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>  	 * own flags.
>  	 */
>  	if (!in_pf && shmem_file(vma->vm_file)) {
> -		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
> +		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, 0,
>  							!enforce_sysfs, vma->vm_mm, vm_flags);
>  
>  		if (!vma_is_anon_shmem(vma))
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cdd1d8655a76..0ebabff10f97 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1866,7 +1866,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>  			if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
>  				xas_unlock_irq(&xas);
>  				/* swap in or instantiate fallocated page */
> -				if (shmem_get_folio(mapping->host, index,
> +				if (shmem_get_folio(mapping->host, index, 0,
>  						&folio, SGP_NOALLOC)) {
>  					result = SCAN_FAIL;
>  					goto xa_unlocked;
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5a77acf6ac6a..964c24fc480f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -548,7 +548,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
>  
>  static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
>  
> -static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
> +static bool __shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
>  			    bool shmem_huge_force, struct mm_struct *mm,
>  			    unsigned long vm_flags)
>  {
> @@ -568,7 +568,8 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
>  		return true;
>  	case SHMEM_HUGE_WITHIN_SIZE:
>  		index = round_up(index + 1, HPAGE_PMD_NR);
> -		i_size = round_up(i_size_read(inode), PAGE_SIZE);
> +		i_size = max(write_end, i_size_read(inode));
> +		i_size = round_up(i_size, PAGE_SIZE);
>  		if (i_size >> PAGE_SHIFT >= index)
>  			return true;
>  		fallthrough;
> @@ -581,14 +582,14 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
>  	}
>  }
>  
> -bool shmem_is_huge(struct inode *inode, pgoff_t index,
> +bool shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
>  		   bool shmem_huge_force, struct mm_struct *mm,
>  		   unsigned long vm_flags)
>  {
>  	if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
>  		return false;
>  
> -	return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags);
> +	return __shmem_is_huge(inode, index, write_end, shmem_huge_force, mm, vm_flags);
>  }
>  
>  #if defined(CONFIG_SYSFS)
> @@ -971,7 +972,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
>  	 * (although in some cases this is just a waste of time).
>  	 */
>  	folio = NULL;
> -	shmem_get_folio(inode, index, &folio, SGP_READ);
> +	shmem_get_folio(inode, index, 0, &folio, SGP_READ);
>  	return folio;
>  }
>  
> @@ -1156,7 +1157,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
>  			STATX_ATTR_NODUMP);
>  	generic_fillattr(idmap, request_mask, inode, stat);
>  
> -	if (shmem_is_huge(inode, 0, false, NULL, 0))
> +	if (shmem_is_huge(inode, 0, 0, false, NULL, 0))
>  		stat->blksize = HPAGE_PMD_SIZE;
>  
>  	if (request_mask & STATX_BTIME) {
> @@ -2078,8 +2079,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>   * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
>   */
>  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
> -		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
> -		struct vm_fault *vmf, vm_fault_t *fault_type)
> +	       	loff_t write_end, struct folio **foliop, enum sgp_type sgp,
> +		gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
>  {
>  	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
>  	struct mm_struct *fault_mm;
> @@ -2158,7 +2159,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>  		return 0;
>  	}
>  
> -	huge = shmem_is_huge(inode, index, false, fault_mm,
> +	huge = shmem_is_huge(inode, index, write_end, false, fault_mm,
>  			     vma ? vma->vm_flags : 0);
>  	/* Find hugepage orders that are allowed for anonymous shmem. */
>  	if (vma && vma_is_anon_shmem(vma))
> @@ -2268,6 +2269,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>   * shmem_get_folio - find, and lock a shmem folio.
>   * @inode:	inode to search
>   * @index:	the page index.
> + * @write_end:	end of a write, could extend inode size.
>   * @foliop:	pointer to the folio if found
>   * @sgp:	SGP_* flags to control behavior
>   *
> @@ -2287,10 +2289,10 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>   * Context: May sleep.
>   * Return: 0 if successful, else a negative error code.
>   */
> -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
> -		enum sgp_type sgp)
> +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
> +		struct folio **foliop, enum sgp_type sgp)
>  {
> -	return shmem_get_folio_gfp(inode, index, foliop, sgp,
> +	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
>  			mapping_gfp_mask(inode->i_mapping), NULL, NULL);
>  }
>  EXPORT_SYMBOL_GPL(shmem_get_folio);
> @@ -2385,7 +2387,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
>  	}
>  
>  	WARN_ON_ONCE(vmf->page != NULL);
> -	err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
> +	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
>  				  gfp, vmf, &ret);
>  	if (err)
>  		return vmf_error(err);
> @@ -2895,7 +2897,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
>  			return -EPERM;
>  	}
>  
> -	ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
> +	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
>  	if (ret)
>  		return ret;
>  
> @@ -2966,7 +2968,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  				break;
>  		}
>  
> -		error = shmem_get_folio(inode, index, &folio, SGP_READ);
> +		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
>  		if (error) {
>  			if (error == -EINVAL)
>  				error = 0;
> @@ -3142,7 +3144,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
>  		if (*ppos >= i_size_read(inode))
>  			break;
>  
> -		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
> +		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
>  					SGP_READ);
>  		if (error) {
>  			if (error == -EINVAL)
> @@ -3332,8 +3334,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
>  		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
>  			error = -ENOMEM;
>  		else
> -			error = shmem_get_folio(inode, index, &folio,
> -						SGP_FALLOC);
> +			error = shmem_get_folio(inode, index, offset + len,
> +						&folio, SGP_FALLOC);
>  		if (error) {
>  			info->fallocend = undo_fallocend;
>  			/* Remove the !uptodate folios we added */
> @@ -3684,7 +3686,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
>  	} else {
>  		inode_nohighmem(inode);
>  		inode->i_mapping->a_ops = &shmem_aops;
> -		error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
> +		error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
>  		if (error)
>  			goto out_remove_offset;
>  		inode->i_op = &shmem_symlink_inode_operations;
> @@ -3730,7 +3732,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
>  			return ERR_PTR(-ECHILD);
>  		}
>  	} else {
> -		error = shmem_get_folio(inode, 0, &folio, SGP_READ);
> +		error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
>  		if (error)
>  			return ERR_PTR(error);
>  		if (!folio)
> @@ -5198,7 +5200,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
>  	struct folio *folio;
>  	int error;
>  
> -	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
> +	error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
>  				    gfp, NULL, NULL);
>  	if (error)
>  		return ERR_PTR(error);
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index e54e5c8907fa..cb8c76f8f118 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -391,7 +391,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
>  	struct page *page;
>  	int ret;
>  
> -	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
> +	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
>  	/* Our caller expects us to return -EFAULT if we failed to find folio */
>  	if (ret == -ENOENT)
>  		ret = -EFAULT;
> -- 
> 2.45.2
>
Rik van Riel Aug. 30, 2024, 1:11 p.m. UTC | #2
On Thu, 2024-08-29 at 22:52 -0700, Darrick J. Wong wrote:
> On Thu, Aug 29, 2024 at 11:54:15PM -0400, Rik van Riel wrote:
> > 
> > @@ -196,7 +196,7 @@ xfile_store(
> >  		unsigned int	len;
> >  		unsigned int	offset;
> >  
> > -		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> > +		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> 
> Technically speaking, the "0" here could be (pos + count), though for
> the current xfile users this isn't likely to make much difference
> because online fsck's index building only appends small amounts of data
> (i.e. not larger than a PAGE_SIZE) at a time.
> 
> >  				SGP_CACHE) < 0)

With SGP_CACHE, won't shmem_get_folio simply refuse to allocate
any pages beyond the end of the inode?

        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
                return -EINVAL;

> >  			break;
> >  		if (filemap_check_wb_err(inode->i_mapping, 0)) {
> > @@ -267,7 +267,7 @@ xfile_get_folio(
> >  		i_size_write(inode, pos + len);
> >  
> >  	pflags = memalloc_nofs_save();
> > -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> > +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> 
> This 0 could be pos + len, since the only caller is xfarray_sort, which
> runs much faster when it can heapsort a large folio's worth of data at a
> time.
> 
> >  			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);

The same applies here.

> >  	memalloc_nofs_restore(pflags);
> >  	if (error)
> > diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> > index 9bb2d24de709..07bebbfb16ee 100644
> > --- a/fs/xfs/xfs_buf_mem.c
> > +++ b/fs/xfs/xfs_buf_mem.c
> > @@ -149,7 +149,7 @@ xmbuf_map_page(
> >  		return -ENOMEM;
> >  	}
> >  
> > -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
> > +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
> 
> The "0" here could be (pos + BBTOB(bp->length)) since we're likely
> going
> to write there soon.  Granted, no current user of xmbufs actually
> uses a
> blocksize larger than PAGE_SIZE, but in theory we could someday turn
> that on.
> 
> Everything below here looks sane enough to me, but I'm not that much of
> an expert on mm/ things outside of the pagecache and shmem.c.

... and here.

XFS is not using an SGP flag that allows shmem_get_folio to allocate
a page beyond the end of i_size.
Darrick J. Wong Aug. 30, 2024, 11:21 p.m. UTC | #3
On Fri, Aug 30, 2024 at 09:11:32AM -0400, Rik van Riel wrote:
> On Thu, 2024-08-29 at 22:52 -0700, Darrick J. Wong wrote:
> > On Thu, Aug 29, 2024 at 11:54:15PM -0400, Rik van Riel wrote:
> > > 
> > > @@ -196,7 +196,7 @@ xfile_store(
> > >  		unsigned int	len;
> > >  		unsigned int	offset;
> > >  
> > > -		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> > > +		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> > 
> > Technically speaking, the "0" here could be (pos + count), though for
> > the current xfile users this isn't likely to make much difference
> > because online fsck's index building only appends small amounts of data
> > (i.e. not larger than a PAGE_SIZE) at a time.
> > 
> > >  				SGP_CACHE) < 0)
> 
> With SGP_CACHE, won't shmem_get_folio simply refuse to allocate
> any pages beyond the end of the inode?

Yes, though we're careful to i_size_write appropriately beforehand so
that @index is always within EOF.

--D

>         if (sgp <= SGP_CACHE &&
>             ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
>                 return -EINVAL;
> 
> > >  			break;
> > >  		if (filemap_check_wb_err(inode->i_mapping, 0)) {
> > > @@ -267,7 +267,7 @@ xfile_get_folio(
> > >  		i_size_write(inode, pos + len);
> > >  
> > >  	pflags = memalloc_nofs_save();
> > > -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> > > +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> > 
> > This 0 could be pos + len, since the only caller is xfarray_sort, which
> > runs much faster when it can heapsort a large folio's worth of data at a
> > time.
> > 
> > >  			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
> 
> The same applies here.
> 
> > >  	memalloc_nofs_restore(pflags);
> > >  	if (error)
> > > diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> > > index 9bb2d24de709..07bebbfb16ee 100644
> > > --- a/fs/xfs/xfs_buf_mem.c
> > > +++ b/fs/xfs/xfs_buf_mem.c
> > > @@ -149,7 +149,7 @@ xmbuf_map_page(
> > >  		return -ENOMEM;
> > >  	}
> > >  
> > > -	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
> > > +	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
> > 
> > The "0" here could be (pos + BBTOB(bp->length)) since we're likely
> > going
> > to write there soon.  Granted, no current user of xmbufs actually
> > uses a
> > blocksize larger than PAGE_SIZE, but in theory we could someday turn
> > that on.
> > 
> > Everything below here looks sane enough to me, but I'm not that much of
> > an expert on mm/ things outside of the pagecache and shmem.c.
> 
> ... and here.
> 
> XFS is not using an SGP flag that allows shmem_get_folio to allocate
> a page beyond the end of i_size.
> 
> -- 
> All Rights Reversed.
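
To tie the xfile subthread together: the quoted xfile_get_folio() hunk raises
i_size before calling shmem_get_folio(), so under SGP_CACHE the index is always
within EOF and a write_end of 0 leaves these callers unchanged. A rough sketch
of that ordering, reassembled from the quoted hunk (the i_size guard condition
is an assumption, as it is not shown in the quote):

	/* xfile_get_folio(), per the quoted diff: i_size is raised first ... */
	if (pos + len > i_size_read(inode))		/* guard assumed from context */
		i_size_write(inode, pos + len);

	pflags = memalloc_nofs_save();
	/* ... so the sgp <= SGP_CACHE i_size check quoted by Rik cannot trip here. */
	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
	memalloc_nofs_restore(pflags);
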
Baolin Wang Sept. 2, 2024, 8:36 a.m. UTC | #4
On 2024/8/30 11:54, Rik van Riel wrote:
> Take the end of a file write into consideration when deciding whether
> or not to use huge folios for tmpfs files when the tmpfs filesystem is
> mounted with huge=within_size.
> 
> This allows large writes that append to the end of a file to automatically
> use large folios.

Makes sense to me.

> 
> Doing 4MB sequential writes without fallocate to a 16GB tmpfs file:
> - 4kB pages:        1560 MB/s
> - huge=within_size: 4720 MB/s
> - huge=always:      4720 MB/s
> 
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
>   fs/xfs/scrub/xfile.c     |  6 +++---
>   fs/xfs/xfs_buf_mem.c     |  2 +-
>   include/linux/shmem_fs.h | 12 ++++++-----
>   mm/huge_memory.c         |  2 +-
>   mm/khugepaged.c          |  2 +-
>   mm/shmem.c               | 44 +++++++++++++++++++++-------------------
>   mm/userfaultfd.c         |  2 +-
>   7 files changed, 37 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
> index d848222f802b..e6e1c1fd23cb 100644

[snip]

> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 1d06b1e5408a..846c1ea91f50 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -111,13 +111,15 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
>   int shmem_unuse(unsigned int type);
>   
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
> -			  struct mm_struct *mm, unsigned long vm_flags);
> +extern bool shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
> +	       		  bool shmem_huge_force, struct mm_struct *mm,
> +			  unsigned long vm_flags);
>   unsigned long shmem_allowable_huge_orders(struct inode *inode,
>   				struct vm_area_struct *vma, pgoff_t index,
>   				bool global_huge);
>   #else
> -static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
> +static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index,
> +					  loff_t write_end, bool shmem_huge_force,
>   					  struct mm_struct *mm, unsigned long vm_flags)
>   {
>   	return false;
> @@ -150,8 +152,8 @@ enum sgp_type {
>   	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
>   };
>   
> -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
> -		enum sgp_type sgp);
> +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
> +		    struct folio **foliop, enum sgp_type sgp);
>   struct folio *shmem_read_folio_gfp(struct address_space *mapping,
>   		pgoff_t index, gfp_t gfp);
>   
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 67c86a5d64a6..8c09071e78cd 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -160,7 +160,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>   	 * own flags.
>   	 */
>   	if (!in_pf && shmem_file(vma->vm_file)) {
> -		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
> +		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, 0,
>   							!enforce_sysfs, vma->vm_mm, vm_flags);
>   
>   		if (!vma_is_anon_shmem(vma))
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cdd1d8655a76..0ebabff10f97 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1866,7 +1866,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>   			if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
>   				xas_unlock_irq(&xas);
>   				/* swap in or instantiate fallocated page */
> -				if (shmem_get_folio(mapping->host, index,
> +				if (shmem_get_folio(mapping->host, index, 0,
>   						&folio, SGP_NOALLOC)) {
>   					result = SCAN_FAIL;
>   					goto xa_unlocked;
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5a77acf6ac6a..964c24fc480f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -548,7 +548,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
>   
>   static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
>   
> -static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
> +static bool __shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
>   			    bool shmem_huge_force, struct mm_struct *mm,
>   			    unsigned long vm_flags)
>   {
> @@ -568,7 +568,8 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
>   		return true;
>   	case SHMEM_HUGE_WITHIN_SIZE:
>   		index = round_up(index + 1, HPAGE_PMD_NR);
> -		i_size = round_up(i_size_read(inode), PAGE_SIZE);
> +		i_size = max(write_end, i_size_read(inode));
> +		i_size = round_up(i_size, PAGE_SIZE);
>   		if (i_size >> PAGE_SHIFT >= index)
>   			return true;
>   		fallthrough;

shmem_is_huge() is no longer exported and has been renamed to
shmem_huge_global_enabled() by the series[1], so you need to rebase on
the latest mm-unstable branch.

[1] https://lore.kernel.org/all/cover.1721626645.git.baolin.wang@linux.alibaba.com/T/#md2580130f990af0b1428010bfb4cc789bb865136

Patch

diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index d848222f802b..e6e1c1fd23cb 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -126,7 +126,7 @@  xfile_load(
 		unsigned int	len;
 		unsigned int	offset;
 
-		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 				SGP_READ) < 0)
 			break;
 		if (!folio) {
@@ -196,7 +196,7 @@  xfile_store(
 		unsigned int	len;
 		unsigned int	offset;
 
-		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 				SGP_CACHE) < 0)
 			break;
 		if (filemap_check_wb_err(inode->i_mapping, 0)) {
@@ -267,7 +267,7 @@  xfile_get_folio(
 		i_size_write(inode, pos + len);
 
 	pflags = memalloc_nofs_save();
-	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
 	memalloc_nofs_restore(pflags);
 	if (error)
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 9bb2d24de709..07bebbfb16ee 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -149,7 +149,7 @@  xmbuf_map_page(
 		return -ENOMEM;
 	}
 
-	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
 	if (error)
 		return error;
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 1d06b1e5408a..846c1ea91f50 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,13 +111,15 @@  extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 int shmem_unuse(unsigned int type);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-			  struct mm_struct *mm, unsigned long vm_flags);
+extern bool shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
+	       		  bool shmem_huge_force, struct mm_struct *mm,
+			  unsigned long vm_flags);
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
 				bool global_huge);
 #else
-static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index,
+					  loff_t write_end, bool shmem_huge_force,
 					  struct mm_struct *mm, unsigned long vm_flags)
 {
 	return false;
@@ -150,8 +152,8 @@  enum sgp_type {
 	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
 
-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-		enum sgp_type sgp);
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+		    struct folio **foliop, enum sgp_type sgp);
 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
 		pgoff_t index, gfp_t gfp);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 67c86a5d64a6..8c09071e78cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -160,7 +160,7 @@  unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	 * own flags.
 	 */
 	if (!in_pf && shmem_file(vma->vm_file)) {
-		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
+		bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, 0,
 							!enforce_sysfs, vma->vm_mm, vm_flags);
 
 		if (!vma_is_anon_shmem(vma))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cdd1d8655a76..0ebabff10f97 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1866,7 +1866,7 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 			if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
 				xas_unlock_irq(&xas);
 				/* swap in or instantiate fallocated page */
-				if (shmem_get_folio(mapping->host, index,
+				if (shmem_get_folio(mapping->host, index, 0,
 						&folio, SGP_NOALLOC)) {
 					result = SCAN_FAIL;
 					goto xa_unlocked;
diff --git a/mm/shmem.c b/mm/shmem.c
index 5a77acf6ac6a..964c24fc480f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -548,7 +548,7 @@  static bool shmem_confirm_swap(struct address_space *mapping,
 
 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
 
-static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
+static bool __shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
 			    bool shmem_huge_force, struct mm_struct *mm,
 			    unsigned long vm_flags)
 {
@@ -568,7 +568,8 @@  static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
 		return true;
 	case SHMEM_HUGE_WITHIN_SIZE:
 		index = round_up(index + 1, HPAGE_PMD_NR);
-		i_size = round_up(i_size_read(inode), PAGE_SIZE);
+		i_size = max(write_end, i_size_read(inode));
+		i_size = round_up(i_size, PAGE_SIZE);
 		if (i_size >> PAGE_SHIFT >= index)
 			return true;
 		fallthrough;
@@ -581,14 +582,14 @@  static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
 	}
 }
 
-bool shmem_is_huge(struct inode *inode, pgoff_t index,
+bool shmem_is_huge(struct inode *inode, pgoff_t index, loff_t write_end,
 		   bool shmem_huge_force, struct mm_struct *mm,
 		   unsigned long vm_flags)
 {
 	if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
 		return false;
 
-	return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags);
+	return __shmem_is_huge(inode, index, write_end, shmem_huge_force, mm, vm_flags);
 }
 
 #if defined(CONFIG_SYSFS)
@@ -971,7 +972,7 @@  static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	 * (although in some cases this is just a waste of time).
 	 */
 	folio = NULL;
-	shmem_get_folio(inode, index, &folio, SGP_READ);
+	shmem_get_folio(inode, index, 0, &folio, SGP_READ);
 	return folio;
 }
 
@@ -1156,7 +1157,7 @@  static int shmem_getattr(struct mnt_idmap *idmap,
 			STATX_ATTR_NODUMP);
 	generic_fillattr(idmap, request_mask, inode, stat);
 
-	if (shmem_is_huge(inode, 0, false, NULL, 0))
+	if (shmem_is_huge(inode, 0, 0, false, NULL, 0))
 		stat->blksize = HPAGE_PMD_SIZE;
 
 	if (request_mask & STATX_BTIME) {
@@ -2078,8 +2079,8 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
  */
 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
-		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
-		struct vm_fault *vmf, vm_fault_t *fault_type)
+	       	loff_t write_end, struct folio **foliop, enum sgp_type sgp,
+		gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
 {
 	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
 	struct mm_struct *fault_mm;
@@ -2158,7 +2159,7 @@  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 		return 0;
 	}
 
-	huge = shmem_is_huge(inode, index, false, fault_mm,
+	huge = shmem_is_huge(inode, index, write_end, false, fault_mm,
 			     vma ? vma->vm_flags : 0);
 	/* Find hugepage orders that are allowed for anonymous shmem. */
 	if (vma && vma_is_anon_shmem(vma))
@@ -2268,6 +2269,7 @@  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
  * shmem_get_folio - find, and lock a shmem folio.
  * @inode:	inode to search
  * @index:	the page index.
+ * @write_end:	end of a write, could extend inode size.
  * @foliop:	pointer to the folio if found
  * @sgp:	SGP_* flags to control behavior
  *
@@ -2287,10 +2289,10 @@  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
  * Context: May sleep.
  * Return: 0 if successful, else a negative error code.
  */
-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-		enum sgp_type sgp)
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+		struct folio **foliop, enum sgp_type sgp)
 {
-	return shmem_get_folio_gfp(inode, index, foliop, sgp,
+	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
 			mapping_gfp_mask(inode->i_mapping), NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(shmem_get_folio);
@@ -2385,7 +2387,7 @@  static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	}
 
 	WARN_ON_ONCE(vmf->page != NULL);
-	err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
+	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
 				  gfp, vmf, &ret);
 	if (err)
 		return vmf_error(err);
@@ -2895,7 +2897,7 @@  shmem_write_begin(struct file *file, struct address_space *mapping,
 			return -EPERM;
 	}
 
-	ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
 	if (ret)
 		return ret;
 
@@ -2966,7 +2968,7 @@  static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 				break;
 		}
 
-		error = shmem_get_folio(inode, index, &folio, SGP_READ);
+		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
 		if (error) {
 			if (error == -EINVAL)
 				error = 0;
@@ -3142,7 +3144,7 @@  static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		if (*ppos >= i_size_read(inode))
 			break;
 
-		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
 					SGP_READ);
 		if (error) {
 			if (error == -EINVAL)
@@ -3332,8 +3334,8 @@  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
 			error = -ENOMEM;
 		else
-			error = shmem_get_folio(inode, index, &folio,
-						SGP_FALLOC);
+			error = shmem_get_folio(inode, index, offset + len,
+						&folio, SGP_FALLOC);
 		if (error) {
 			info->fallocend = undo_fallocend;
 			/* Remove the !uptodate folios we added */
@@ -3684,7 +3686,7 @@  static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	} else {
 		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &shmem_aops;
-		error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
+		error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
 		if (error)
 			goto out_remove_offset;
 		inode->i_op = &shmem_symlink_inode_operations;
@@ -3730,7 +3732,7 @@  static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
 			return ERR_PTR(-ECHILD);
 		}
 	} else {
-		error = shmem_get_folio(inode, 0, &folio, SGP_READ);
+		error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
 		if (error)
 			return ERR_PTR(error);
 		if (!folio)
@@ -5198,7 +5200,7 @@  struct folio *shmem_read_folio_gfp(struct address_space *mapping,
 	struct folio *folio;
 	int error;
 
-	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
+	error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
 				    gfp, NULL, NULL);
 	if (error)
 		return ERR_PTR(error);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..cb8c76f8f118 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -391,7 +391,7 @@  static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 	struct page *page;
 	int ret;
 
-	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
 	/* Our caller expects us to return -EFAULT if we failed to find folio */
 	if (ret == -ENOENT)
 		ret = -EFAULT;