
[7/7] btrfs: drop mmap_sem in mkwrite for btrfs

Message ID 20181018202318.9131-8-josef@toxicpanda.com
State New, archived
Series drop the mmap_sem when doing IO in the fault path

Commit Message

Josef Bacik Oct. 18, 2018, 8:23 p.m. UTC
->page_mkwrite is extremely expensive in btrfs.  We have to reserve
space, which can take 6 lifetimes, and we could possibly have to wait on
writeback on the page, another several lifetimes.  To avoid this simply
drop the mmap_sem if we didn't have the cached page and do all of our
work and return the appropriate retry error.  If we have the cached page
we know we did all the right things to set this page up and we can just
carry on.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/inode.c   | 41 +++++++++++++++++++++++++++++++++++++++--
 include/linux/mm.h | 14 ++++++++++++++
 mm/filemap.c       |  3 ++-
 3 files changed, 55 insertions(+), 3 deletions(-)

Comments

Dave Chinner Oct. 19, 2018, 3:48 a.m. UTC | #1
On Thu, Oct 18, 2018 at 04:23:18PM -0400, Josef Bacik wrote:
> ->page_mkwrite is extremely expensive in btrfs.  We have to reserve
> space, which can take 6 lifetimes, and we could possibly have to wait on
> writeback on the page, another several lifetimes.  To avoid this simply
> drop the mmap_sem if we didn't have the cached page and do all of our
> work and return the appropriate retry error.  If we have the cached page
> we know we did all the right things to set this page up and we can just
> carry on.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/inode.c   | 41 +++++++++++++++++++++++++++++++++++++++--
>  include/linux/mm.h | 14 ++++++++++++++
>  mm/filemap.c       |  3 ++-
>  3 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 3ea5339603cf..6b723d29bc0c 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -8809,7 +8809,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
>  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  {
>  	struct page *page = vmf->page;
> -	struct inode *inode = file_inode(vmf->vma->vm_file);
> +	struct file *file = vmf->vma->vm_file, *fpin;
> +	struct mm_struct *mm = vmf->vma->vm_mm;
> +	struct inode *inode = file_inode(file);
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
>  	struct btrfs_ordered_extent *ordered;
> @@ -8828,6 +8830,29 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  
>  	reserved_space = PAGE_SIZE;
>  
> +	/*
> +	 * We have our cached page from a previous mkwrite, check it to make
> +	 * sure it's still dirty and our file size matches when we ran mkwrite
> +	 * the last time.  If everything is OK then return VM_FAULT_LOCKED,
> +	 * otherwise do the mkwrite again.
> +	 */
> +	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
> +		lock_page(page);
> +		if (vmf->cached_size == i_size_read(inode) &&
> +		    PageDirty(page))
> +			return VM_FAULT_LOCKED;
> +		unlock_page(page);
> +	}

What does the file size have to do with whether we can use the
initialised page or not? The file can be extended by other
data operations (like write()) while a page fault is in progress,
so I'm not sure how or why this check makes any sense.

I also don't see anything btrfs specific here, so....

> +	/*
> +	 * mkwrite is extremely expensive, and we are holding the mmap_sem
> +	 * during this, which means we can starve out anybody trying to
> +	 * down_write(mmap_sem) for a long while, especially if we throw cgroups
> +	 * into the mix.  So just drop the mmap_sem and do all of our work,
> +	 * we'll loop back through and verify everything is ok the next time and
> +	 * hopefully avoid doing the work twice.
> +	 */
> +	fpin = maybe_unlock_mmap_for_io(vmf->vma, vmf->flags);

why can't all this be done by the caller of ->page_mkwrite?

>  	sb_start_pagefault(inode->i_sb);
>  	page_start = page_offset(page);
>  	page_end = page_start + PAGE_SIZE - 1;
> @@ -8844,7 +8869,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  	ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
>  					   reserved_space);
>  	if (!ret2) {
> -		ret2 = file_update_time(vmf->vma->vm_file);
> +		ret2 = file_update_time(file);
>  		reserved = 1;
>  	}
>  	if (ret2) {
> @@ -8943,6 +8968,14 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
>  		sb_end_pagefault(inode->i_sb);
>  		extent_changeset_free(data_reserved);
> +		if (fpin) {
> +			unlock_page(page);
> +			fput(fpin);
> +			get_page(page);
> +			vmf->cached_size = size;
> +			vmf->cached_page = page;
> +			return VM_FAULT_RETRY;
> +		}
>  		return VM_FAULT_LOCKED;

And this can all be done by the caller, too.

I'm not seeing anything btrfs specific here - maybe I'm missing
something, but this all looks like it should be in the high level
mm/ code, not in the filesystem code.

>  	}
>  
> @@ -8955,6 +8988,10 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  out_noreserve:
>  	sb_end_pagefault(inode->i_sb);
>  	extent_changeset_free(data_reserved);
> +	if (fpin) {
> +		fput(fpin);
> +		down_read(&mm->mmap_sem);
> +	}
>  	return ret;
>  }
>  
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a7305d193c71..02b420be6b06 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -370,6 +370,13 @@ struct vm_fault {
>  					 * next time we loop through the fault
>  					 * handler for faster lookup.
>  					 */
> +	loff_t cached_size;		/* ->page_mkwrite handlers may drop
> +					 * the mmap_sem to avoid starvation, in
> +					 * which case they need to save the
> +					 * i_size in order to verify the cached
> +					 * page we're using the next loop
> +					 * through hasn't changed under us.
> +					 */

You still haven't explained what the size verification is actually
required for.

>  	/* These three entries are valid only while holding ptl lock */
>  	pte_t *pte;			/* Pointer to pte entry matching
>  					 * the 'address'. NULL if the page
> @@ -1437,6 +1444,8 @@ extern vm_fault_t handle_mm_fault_cacheable(struct vm_fault *vmf);
>  extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>  			    unsigned long address, unsigned int fault_flags,
>  			    bool *unlocked);
> +extern struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma,
> +					     int flags);
>  void unmap_mapping_pages(struct address_space *mapping,
>  		pgoff_t start, pgoff_t nr, bool even_cows);
>  void unmap_mapping_range(struct address_space *mapping,
> @@ -1463,6 +1472,11 @@ static inline int fixup_user_fault(struct task_struct *tsk,
>  	BUG();
>  	return -EFAULT;
>  }
> +static inline struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma,
> +						    int flags)
> +{
> +	return NULL;
> +}
>  static inline void unmap_mapping_pages(struct address_space *mapping,
>  		pgoff_t start, pgoff_t nr, bool even_cows) { }
>  static inline void unmap_mapping_range(struct address_space *mapping,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index e9cb44bd35aa..8027f082d74f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2366,7 +2366,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>  EXPORT_SYMBOL(generic_file_read_iter);
>  
>  #ifdef CONFIG_MMU
> -static struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma, int flags)
> +struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma, int flags)
>  {
>  	if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == FAULT_FLAG_ALLOW_RETRY) {
>  		struct file *file;
> @@ -2377,6 +2377,7 @@ static struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma, int fla
>  	}
>  	return NULL;
>  }
> +EXPORT_SYMBOL_GPL(maybe_unlock_mmap_for_io);

These API mods (if this functionality remains in the filesystem
code) belong in whatever patch introduced this function.

Cheers,

Dave.
Josef Bacik Oct. 22, 2018, 5:56 p.m. UTC | #2
On Fri, Oct 19, 2018 at 02:48:47PM +1100, Dave Chinner wrote:
> On Thu, Oct 18, 2018 at 04:23:18PM -0400, Josef Bacik wrote:
> > ->page_mkwrite is extremely expensive in btrfs.  We have to reserve
> > space, which can take 6 lifetimes, and we could possibly have to wait on
> > writeback on the page, another several lifetimes.  To avoid this simply
> > drop the mmap_sem if we didn't have the cached page and do all of our
> > work and return the appropriate retry error.  If we have the cached page
> > we know we did all the right things to set this page up and we can just
> > carry on.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ---
> >  fs/btrfs/inode.c   | 41 +++++++++++++++++++++++++++++++++++++++--
> >  include/linux/mm.h | 14 ++++++++++++++
> >  mm/filemap.c       |  3 ++-
> >  3 files changed, 55 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 3ea5339603cf..6b723d29bc0c 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -8809,7 +8809,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
> >  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >  {
> >  	struct page *page = vmf->page;
> > -	struct inode *inode = file_inode(vmf->vma->vm_file);
> > +	struct file *file = vmf->vma->vm_file, *fpin;
> > +	struct mm_struct *mm = vmf->vma->vm_mm;
> > +	struct inode *inode = file_inode(file);
> >  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> >  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> >  	struct btrfs_ordered_extent *ordered;
> > @@ -8828,6 +8830,29 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >  
> >  	reserved_space = PAGE_SIZE;
> >  
> > +	/*
> > +	 * We have our cached page from a previous mkwrite, check it to make
> > +	 * sure it's still dirty and our file size matches when we ran mkwrite
> > +	 * the last time.  If everything is OK then return VM_FAULT_LOCKED,
> > +	 * otherwise do the mkwrite again.
> > +	 */
> > +	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
> > +		lock_page(page);
> > +		if (vmf->cached_size == i_size_read(inode) &&
> > +		    PageDirty(page))
> > +			return VM_FAULT_LOCKED;
> > +		unlock_page(page);
> > +	}
> 
> What does the file size have to do with whether we can use the
> initialised page or not? The file can be extended by other
> data operations (like write()) while a page fault is in progress,
> so I'm not sure how or why this check makes any sense.
> 
> I also don't see anything btrfs specific here, so....
> 

First, thanks for the review.  I've gone through and addressed everything you
mentioned; however, this one is subtle.

The problem is the vmf->vma->vm_file access.  Once we drop the mmap_sem we can
no longer safely go into vmf->vma, so I'd have to fix all the page_mkwrite()'s
to not touch vma, and add a vmf->fpin instead to mess with. Plus I didn't want
to miss some subtlety in other fs's page_mkwrite()'s and inadvertently break them.
If I break btrfs I can fix it; I'm not as good with xfs.

If you want this in the generic layer and not in the fs I can go back and add a
vmf->fpin and vmf->mm, and then fix up all the mkwrite()'s to use that instead
of vmf->vma.  I think those are the only things we care about so it wouldn't be
hard.  Does that sound reasonable to you?
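
For concreteness, those two fields would amount to something like this in
struct vm_fault (a hypothetical sketch only; the series as posted carries
just cached_page and cached_size):

	struct file *fpin;		/* hypothetical: vm_file pinned across
					 * the mmap_sem drop, so handlers never
					 * touch vmf->vma afterwards.
					 */
	struct mm_struct *mm;		/* hypothetical: saved mm so an error
					 * path can re-take mmap_sem without
					 * going through vmf->vma.
					 */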

Also, the size thing was just a paranoid way to make sure everything had stayed
exactly the same, but you're right; I'll just keep it consistent with all of our
other "is this page ok" checks.  Thanks,

Josef
Dave Chinner Oct. 22, 2018, 9:31 p.m. UTC | #3
On Mon, Oct 22, 2018 at 01:56:54PM -0400, Josef Bacik wrote:
> On Fri, Oct 19, 2018 at 02:48:47PM +1100, Dave Chinner wrote:
> > On Thu, Oct 18, 2018 at 04:23:18PM -0400, Josef Bacik wrote:
> > > ->page_mkwrite is extremely expensive in btrfs.  We have to reserve
> > > space, which can take 6 lifetimes, and we could possibly have to wait on
> > > writeback on the page, another several lifetimes.  To avoid this simply
> > > drop the mmap_sem if we didn't have the cached page and do all of our
> > > work and return the appropriate retry error.  If we have the cached page
> > > we know we did all the right things to set this page up and we can just
> > > carry on.
> > > 
> > > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > > ---
> > >  fs/btrfs/inode.c   | 41 +++++++++++++++++++++++++++++++++++++++--
> > >  include/linux/mm.h | 14 ++++++++++++++
> > >  mm/filemap.c       |  3 ++-
> > >  3 files changed, 55 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > > index 3ea5339603cf..6b723d29bc0c 100644
> > > --- a/fs/btrfs/inode.c
> > > +++ b/fs/btrfs/inode.c
> > > @@ -8809,7 +8809,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
> > >  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> > >  {
> > >  	struct page *page = vmf->page;
> > > -	struct inode *inode = file_inode(vmf->vma->vm_file);
> > > +	struct file *file = vmf->vma->vm_file, *fpin;
> > > +	struct mm_struct *mm = vmf->vma->vm_mm;
> > > +	struct inode *inode = file_inode(file);
> > >  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > >  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> > >  	struct btrfs_ordered_extent *ordered;
> > > @@ -8828,6 +8830,29 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> > >  
> > >  	reserved_space = PAGE_SIZE;
> > >  
> > > +	/*
> > > +	 * We have our cached page from a previous mkwrite, check it to make
> > > +	 * sure it's still dirty and our file size matches when we ran mkwrite
> > > +	 * the last time.  If everything is OK then return VM_FAULT_LOCKED,
> > > +	 * otherwise do the mkwrite again.
> > > +	 */
> > > +	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
> > > +		lock_page(page);
> > > +		if (vmf->cached_size == i_size_read(inode) &&
> > > +		    PageDirty(page))
> > > +			return VM_FAULT_LOCKED;
> > > +		unlock_page(page);
> > > +	}
> > 
> > What does the file size have to do with whether we can use the
> > initialised page or not? The file can be extended by other
> > data operations (like write()) while a page fault is in progress,
> > so I'm not sure how or why this check makes any sense.
> > 
> > I also don't see anything btrfs specific here, so....
> > 
> 
> First, thanks for the review.  I've gone through and addressed everything you
> mentioned; however, this one is subtle.
> 
> The problem is the vmf->vma->vm_file access.  Once we drop the mmap_sem we can
> no longer safely go into vmf->vma, so I'd have to fix all the page_mkwrite()'s
> to not touch vma, and add a vmf->fpin instead to mess with.

Adding a vmf->file pointer seems pretty easy - making /everything/
use it instead of just special casing page_mkwrite also seems like
it would be a good idea - set it up in your new init function and
it's done. Pinning and unpinning could be done unconditionally, too
- that doesn't look expensive - and it would pull a lot of the
complexity out of the patchset for the cases where unlocking the
mmap_sem is done....

> Plus I didn't want
> to miss some subtlety in other fs's page_mkwrite()'s and inadvertently break them.
> If I break btrfs I can fix it; I'm not as good with xfs.

Sure, but if you make it a generic helper then you don't have to
worry about that. i.e.

generic_page_mkwrite_nommapsem(vmf, page_mkwrite_cb)
{
	/* use cached page if valid */

	/* unlock mmap_sem */

	/* do filesystem page_mkwrite callback */
	ret = page_mkwrite_cb(vmf);

	/* handle page caching */

	/* return retry fault indicator */
}

and that allows all filesystems to easily opt in to dropping the
mmap_sem and you don't have to worry about it too much. It means
filesystems with dax fault paths (like XFS) can convert the non-dax
paths that call it immediately, and worry about the more tricky stuff
later...
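
Filled in against the helpers this series already adds, that skeleton might
look roughly like the following (a sketch only: the callback typedef is
invented here, and the cached-page check and retry/error handling just
mirror what the btrfs patch below does by hand):

typedef vm_fault_t (*page_mkwrite_cb_t)(struct vm_fault *vmf);

vm_fault_t generic_page_mkwrite_nommapsem(struct vm_fault *vmf,
					  page_mkwrite_cb_t page_mkwrite_cb)
{
	struct page *page = vmf->page;
	struct mm_struct *mm = vmf->vma->vm_mm;
	struct file *fpin;
	vm_fault_t ret;

	/* Reuse the page prepared on a previous pass if it is still good. */
	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
		lock_page(page);
		if (page->mapping == vmf->vma->vm_file->f_mapping &&
		    PageDirty(page))
			return VM_FAULT_LOCKED;
		unlock_page(page);
	}

	/*
	 * Pin the file and drop mmap_sem; the callback must not touch
	 * vmf->vma after this point.
	 */
	fpin = maybe_unlock_mmap_for_io(vmf->vma, vmf->flags);

	ret = page_mkwrite_cb(vmf);

	if (fpin) {
		fput(fpin);
		if (ret == VM_FAULT_LOCKED) {
			/* Stash the prepared page and ask for a retry. */
			unlock_page(page);
			get_page(page);
			vmf->cached_page = page;
			return VM_FAULT_RETRY;
		}
		/* On failure, re-take mmap_sem so the fault unwinds as usual. */
		down_read(&mm->mmap_sem);
	}
	return ret;
}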

> vmf->fpin and vmf->mm, and then fix up all the mkwrite()'s to use that instead

Ignoring DAX (because I haven't checked), what filesystem
page_mkwrite implementation requires access to the mm_struct? They
should all just be doing file operations on a page, not anything to
do with the way the page is mapped into the task memory....

> of vmf->vma.  I think those are the only things we care about so it wouldn't be
> hard.  Does that sound reasonable to you?

I think a little bit of tweaking, and it will be easy for
everyone to opt in to this behaviour....

Cheers,

Dave.
Jan Kara Oct. 25, 2018, 1:22 p.m. UTC | #4
On Thu 18-10-18 16:23:18, Josef Bacik wrote:
> ->page_mkwrite is extremely expensive in btrfs.  We have to reserve
> space, which can take 6 lifetimes, and we could possibly have to wait on
> writeback on the page, another several lifetimes.  To avoid this simply
> drop the mmap_sem if we didn't have the cached page and do all of our
> work and return the appropriate retry error.  If we have the cached page
> we know we did all the right things to set this page up and we can just
> carry on.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
...
> @@ -8828,6 +8830,29 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
>  
>  	reserved_space = PAGE_SIZE;
>  
> +	/*
> +	 * We have our cached page from a previous mkwrite, check it to make
> +	 * sure it's still dirty and our file size matches when we ran mkwrite
> +	 * the last time.  If everything is OK then return VM_FAULT_LOCKED,
> +	 * otherwise do the mkwrite again.
> +	 */
> +	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
> +		lock_page(page);
> +		if (vmf->cached_size == i_size_read(inode) &&
> +		    PageDirty(page))
> +			return VM_FAULT_LOCKED;
> +		unlock_page(page);
> +	}

I guess this is similar to Dave's comment: Why is i_size so special? What
makes sure that the file didn't get modified between the time you've prepared
cached_page and now, such that you need to do the preparation again?
And if indeed the metadata prepared for a page cannot change, what's so special
about it being that particular cached_page?

Maybe to phrase my objections differently: Your preparations in
btrfs_page_mkwrite() are obviously related to your filesystem metadata. So
why cannot you infer from that metadata (extent tree, whatever - I'd use
extent status tree in ext4) whether that particular file+offset is already
prepared for writing and just bail out with success in that case?
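
For btrfs that check might look something like the following (an untested
sketch against the io_tree / page_start / page_end locals already in
btrfs_page_mkwrite; whether a delalloc bit alone proves the page is fully
prepared is exactly the open question):

	lock_page(page);
	if (page->mapping == inode->i_mapping && PageDirty(page) &&
	    test_range_bit(io_tree, page_start, page_end,
			   EXTENT_DELALLOC, 1, NULL))
		return VM_FAULT_LOCKED; /* prepared by a previous mkwrite */
	unlock_page(page);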

								Honza
Josef Bacik Oct. 25, 2018, 1:58 p.m. UTC | #5
On Thu, Oct 25, 2018 at 03:22:30PM +0200, Jan Kara wrote:
> On Thu 18-10-18 16:23:18, Josef Bacik wrote:
> > ->page_mkwrite is extremely expensive in btrfs.  We have to reserve
> > space, which can take 6 lifetimes, and we could possibly have to wait on
> > writeback on the page, another several lifetimes.  To avoid this simply
> > drop the mmap_sem if we didn't have the cached page and do all of our
> > work and return the appropriate retry error.  If we have the cached page
> > we know we did all the right things to set this page up and we can just
> > carry on.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ...
> > @@ -8828,6 +8830,29 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >  
> >  	reserved_space = PAGE_SIZE;
> >  
> > +	/*
> > +	 * We have our cached page from a previous mkwrite, check it to make
> > +	 * sure it's still dirty and our file size matches when we ran mkwrite
> > +	 * the last time.  If everything is OK then return VM_FAULT_LOCKED,
> > +	 * otherwise do the mkwrite again.
> > +	 */
> > +	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
> > +		lock_page(page);
> > +		if (vmf->cached_size == i_size_read(inode) &&
> > +		    PageDirty(page))
> > +			return VM_FAULT_LOCKED;
> > +		unlock_page(page);
> > +	}
> 
> I guess this is similar to Dave's comment: Why is i_size so special? What
> makes sure that the file didn't get modified between the time you've prepared
> cached_page and now, such that you need to do the preparation again?
> And if indeed the metadata prepared for a page cannot change, what's so special
> about it being that particular cached_page?
> 
> Maybe to phrase my objections differently: Your preparations in
> btrfs_page_mkwrite() are obviously related to your filesystem metadata. So
> why cannot you infer from that metadata (extent tree, whatever - I'd use
> extent status tree in ext4) whether that particular file+offset is already
> prepared for writing and just bail out with success in that case?
> 

I was just being overly paranoid: I was afraid of the case where we would
truncate and then extend in between, but now that I actually think about it,
that would end up with the page not being on the mapping anymore, so we would catch
that case.  I've dropped this part from my current version.  I'm getting some
testing on these patches in production and I'll post them sometime next week
once I'm happy with them.  Thanks,

Josef
Jan Kara Oct. 26, 2018, 9:47 a.m. UTC | #6
On Thu 25-10-18 09:58:51, Josef Bacik wrote:
> On Thu, Oct 25, 2018 at 03:22:30PM +0200, Jan Kara wrote:
> > On Thu 18-10-18 16:23:18, Josef Bacik wrote:
> > > ->page_mkwrite is extremely expensive in btrfs.  We have to reserve
> > > space, which can take 6 lifetimes, and we could possibly have to wait on
> > > writeback on the page, another several lifetimes.  To avoid this simply
> > > drop the mmap_sem if we didn't have the cached page and do all of our
> > > work and return the appropriate retry error.  If we have the cached page
> > > we know we did all the right things to set this page up and we can just
> > > carry on.
> > > 
> > > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ...
> > > @@ -8828,6 +8830,29 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> > >  
> > >  	reserved_space = PAGE_SIZE;
> > >  
> > > +	/*
> > > +	 * We have our cached page from a previous mkwrite, check it to make
> > > +	 * sure it's still dirty and our file size matches when we ran mkwrite
> > > +	 * the last time.  If everything is OK then return VM_FAULT_LOCKED,
> > > +	 * otherwise do the mkwrite again.
> > > +	 */
> > > +	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
> > > +		lock_page(page);
> > > +		if (vmf->cached_size == i_size_read(inode) &&
> > > +		    PageDirty(page))
> > > +			return VM_FAULT_LOCKED;
> > > +		unlock_page(page);
> > > +	}
> > 
> > I guess this is similar to Dave's comment: Why is i_size so special? What
> > makes sure that the file didn't get modified between the time you've prepared
> > cached_page and now, such that you need to do the preparation again?
> > And if indeed the metadata prepared for a page cannot change, what's so special
> > about it being that particular cached_page?
> > 
> > Maybe to phrase my objections differently: Your preparations in
> > btrfs_page_mkwrite() are obviously related to your filesystem metadata. So
> > why cannot you infer from that metadata (extent tree, whatever - I'd use
> > extent status tree in ext4) whether that particular file+offset is already
> > prepared for writing and just bail out with success in that case?
> > 
> 
> I was just being overly paranoid: I was afraid of the case where we would
> truncate and then extend in between, but now that I actually think about it,
> that would end up with the page not being on the mapping anymore, so we would catch
> that case.  I've dropped this part from my current version.  I'm getting some
> testing on these patches in production and I'll post them sometime next week
> once I'm happy with them.  Thanks,

OK, but do you still need the vmf->cached_page stuff? Because I don't see
why even that is necessary...

								Honza

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..6b723d29bc0c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8809,7 +8809,9 @@  static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct file *file = vmf->vma->vm_file, *fpin;
+	struct mm_struct *mm = vmf->vma->vm_mm;
+	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -8828,6 +8830,29 @@  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 
 	reserved_space = PAGE_SIZE;
 
+	/*
+	 * We have our cached page from a previous mkwrite, check it to make
+	 * sure it's still dirty and our file size matches when we ran mkwrite
+	 * the last time.  If everything is OK then return VM_FAULT_LOCKED,
+	 * otherwise do the mkwrite again.
+	 */
+	if (vmf->flags & FAULT_FLAG_USED_CACHED) {
+		lock_page(page);
+		if (vmf->cached_size == i_size_read(inode) &&
+		    PageDirty(page))
+			return VM_FAULT_LOCKED;
+		unlock_page(page);
+	}
+
+	/*
+	 * mkwrite is extremely expensive, and we are holding the mmap_sem
+	 * during this, which means we can starve out anybody trying to
+	 * down_write(mmap_sem) for a long while, especially if we throw cgroups
+	 * into the mix.  So just drop the mmap_sem and do all of our work,
+	 * we'll loop back through and verify everything is ok the next time and
+	 * hopefully avoid doing the work twice.
+	 */
+	fpin = maybe_unlock_mmap_for_io(vmf->vma, vmf->flags);
 	sb_start_pagefault(inode->i_sb);
 	page_start = page_offset(page);
 	page_end = page_start + PAGE_SIZE - 1;
@@ -8844,7 +8869,7 @@  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 	ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
 					   reserved_space);
 	if (!ret2) {
-		ret2 = file_update_time(vmf->vma->vm_file);
+		ret2 = file_update_time(file);
 		reserved = 1;
 	}
 	if (ret2) {
@@ -8943,6 +8968,14 @@  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
 		sb_end_pagefault(inode->i_sb);
 		extent_changeset_free(data_reserved);
+		if (fpin) {
+			unlock_page(page);
+			fput(fpin);
+			get_page(page);
+			vmf->cached_size = size;
+			vmf->cached_page = page;
+			return VM_FAULT_RETRY;
+		}
 		return VM_FAULT_LOCKED;
 	}
 
@@ -8955,6 +8988,10 @@  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 out_noreserve:
 	sb_end_pagefault(inode->i_sb);
 	extent_changeset_free(data_reserved);
+	if (fpin) {
+		fput(fpin);
+		down_read(&mm->mmap_sem);
+	}
 	return ret;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7305d193c71..02b420be6b06 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -370,6 +370,13 @@  struct vm_fault {
 					 * next time we loop through the fault
 					 * handler for faster lookup.
 					 */
+	loff_t cached_size;		/* ->page_mkwrite handlers may drop
+					 * the mmap_sem to avoid starvation, in
+					 * which case they need to save the
+					 * i_size in order to verify the cached
+					 * page we're using the next loop
+					 * through hasn't changed under us.
+					 */
 	/* These three entries are valid only while holding ptl lock */
 	pte_t *pte;			/* Pointer to pte entry matching
 					 * the 'address'. NULL if the page
@@ -1437,6 +1444,8 @@  extern vm_fault_t handle_mm_fault_cacheable(struct vm_fault *vmf);
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
+extern struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma,
+					     int flags);
 void unmap_mapping_pages(struct address_space *mapping,
 		pgoff_t start, pgoff_t nr, bool even_cows);
 void unmap_mapping_range(struct address_space *mapping,
@@ -1463,6 +1472,11 @@  static inline int fixup_user_fault(struct task_struct *tsk,
 	BUG();
 	return -EFAULT;
 }
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma,
+						    int flags)
+{
+	return NULL;
+}
 static inline void unmap_mapping_pages(struct address_space *mapping,
 		pgoff_t start, pgoff_t nr, bool even_cows) { }
 static inline void unmap_mapping_range(struct address_space *mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index e9cb44bd35aa..8027f082d74f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2366,7 +2366,7 @@  generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 EXPORT_SYMBOL(generic_file_read_iter);
 
 #ifdef CONFIG_MMU
-static struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma, int flags)
+struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma, int flags)
 {
 	if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == FAULT_FLAG_ALLOW_RETRY) {
 		struct file *file;
@@ -2377,6 +2377,7 @@  static struct file *maybe_unlock_mmap_for_io(struct vm_area_struct *vma, int fla
 	}
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(maybe_unlock_mmap_for_io);
 
 /**
  * page_cache_read - adds requested page to the page cache if not already there