diff mbox series

[6/8] btrfs: Use shared inode lock for direct writes within EOF

Message ID 20200622162017.21773-7-rgoldwyn@suse.de (mailing list archive)
State New, archived
Headers show
Series Rearrange inode locking | expand

Commit Message

Goldwyn Rodrigues June 22, 2020, 4:20 p.m. UTC
From: Goldwyn Rodrigues <rgoldwyn@suse.com>

This is to parallelize direct writes within EOF or with direct I/O
reads. This covers the race with truncate() accidentally increasing the
filesize.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/file.c | 25 +++++++------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

Comments

David Sterba June 30, 2020, 8:42 a.m. UTC | #1
On Mon, Jun 22, 2020 at 11:20:15AM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> This is to parallelize direct writes within EOF or with direct I/O
> reads. This covers the race with truncate() accidentally increasing the
> filesize.

Please describe the race condition in more detail and how the DIO/EOF
parallelization is supposed to work.

> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/file.c | 25 +++++++------------------
>  1 file changed, 7 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aa6be931620b..c446a4aeb867 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1957,12 +1957,18 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  	loff_t endbyte;
>  	int err;
>  	size_t count = 0;
> -	bool relock = false;
>  	int flags = IOMAP_DIOF_PGINVALID_FAIL;
>  	int ilock_flags = 0;
>  
>  	if (iocb->ki_flags & IOCB_NOWAIT)
>  		ilock_flags |= BTRFS_ILOCK_TRY;
> +	/*
> +	 * If the write DIO within EOF,  use a shared lock
> +	 */
> +	if (pos + count <= i_size_read(inode))

Inode size is now read outside of the inode lock, so it could change
until the lock is taken a few lines below.

> +		ilock_flags |= BTRFS_ILOCK_SHARED;
> +	else if (iocb->ki_flags & IOCB_NOWAIT)
> +		return -EAGAIN;
>  
>  	err = btrfs_inode_lock(inode, ilock_flags);

Is it necessary to revalidate that 'pos + count < i_size' still holds
when the lock was taken as SHARED?
Filipe Manana June 30, 2020, 9:37 a.m. UTC | #2
On Mon, Jun 22, 2020 at 5:22 PM Goldwyn Rodrigues <rgoldwyn@suse.de> wrote:
>
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> This is to parallelize direct writes within EOF or with direct I/O
> reads. This covers the race with truncate() accidentally increasing the
> filesize.
>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/file.c | 25 +++++++------------------
>  1 file changed, 7 insertions(+), 18 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aa6be931620b..c446a4aeb867 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1957,12 +1957,18 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
>         loff_t endbyte;
>         int err;
>         size_t count = 0;
> -       bool relock = false;
>         int flags = IOMAP_DIOF_PGINVALID_FAIL;
>         int ilock_flags = 0;
>
>         if (iocb->ki_flags & IOCB_NOWAIT)
>                 ilock_flags |= BTRFS_ILOCK_TRY;
> +       /*
> +        * If the write DIO within EOF,  use a shared lock
> +        */
> +       if (pos + count <= i_size_read(inode))
> +               ilock_flags |= BTRFS_ILOCK_SHARED;
> +       else if (iocb->ki_flags & IOCB_NOWAIT)
> +               return -EAGAIN;

In the next iteration, please rebase the patchset on a more recent misc-next.

That hunk returning -EAGAIN is buggy and was removed a couple weeks
ago in a patchset fixing several bugs with NOWAIT writes.

Thanks.

>
>         err = btrfs_inode_lock(inode, ilock_flags);
>         if (err < 0)
> @@ -1975,20 +1981,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
>         if (check_direct_IO(fs_info, from, pos))
>                 goto buffered;
>
> -       count = iov_iter_count(from);
> -       /*
> -        * If the write DIO is beyond the EOF, we need update the isize, but it
> -        * is protected by i_mutex. So we can not unlock the i_mutex at this
> -        * case.
> -        */
> -       if (pos + count <= inode->i_size) {
> -               inode_unlock(inode);
> -               relock = true;
> -       } else if (iocb->ki_flags & IOCB_NOWAIT) {
> -               err = -EAGAIN;
> -               goto out;
> -       }
> -
>         if (is_sync_kiocb(iocb))
>                 flags |= IOMAP_DIOF_WAIT_FOR_COMPLETION;
>
> @@ -1997,9 +1989,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
>                                flags);
>         up_read(&BTRFS_I(inode)->dio_sem);
>
> -       if (relock)
> -               inode_lock(inode);
> -
>         if (written < 0 || !iov_iter_count(from)) {
>                 err = written;
>                 goto error;
> --
> 2.25.0
>
Goldwyn Rodrigues June 30, 2020, 2:20 p.m. UTC | #3
On 10:37 30/06, Filipe Manana wrote:
> On Mon, Jun 22, 2020 at 5:22 PM Goldwyn Rodrigues <rgoldwyn@suse.de> wrote:
> >
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> >
> > This is to parallelize direct writes within EOF or with direct I/O
> > reads. This covers the race with truncate() accidentally increasing the
> > filesize.
> >
> > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > ---
> >  fs/btrfs/file.c | 25 +++++++------------------
> >  1 file changed, 7 insertions(+), 18 deletions(-)
> >
> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> > index aa6be931620b..c446a4aeb867 100644
> > --- a/fs/btrfs/file.c
> > +++ b/fs/btrfs/file.c
> > @@ -1957,12 +1957,18 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
> >         loff_t endbyte;
> >         int err;
> >         size_t count = 0;
> > -       bool relock = false;
> >         int flags = IOMAP_DIOF_PGINVALID_FAIL;
> >         int ilock_flags = 0;
> >
> >         if (iocb->ki_flags & IOCB_NOWAIT)
> >                 ilock_flags |= BTRFS_ILOCK_TRY;
> > +       /*
> > +        * If the write DIO within EOF,  use a shared lock
> > +        */
> > +       if (pos + count <= i_size_read(inode))
> > +               ilock_flags |= BTRFS_ILOCK_SHARED;
> > +       else if (iocb->ki_flags & IOCB_NOWAIT)
> > +               return -EAGAIN;
> 
> In the next iteration, please rebase the patchset on a more recent misc-next.
> 
> That hunk returning -EAGAIN is buggy and was removed a couple weeks
> ago in a patchset fixing several bugs with NOWAIT writes.
> 

I worked against the vanilla and it is already ported in the git tree
since your patch was added in the meantime.
Goldwyn Rodrigues June 30, 2020, 2:24 p.m. UTC | #4
On 10:42 30/06, David Sterba wrote:
> On Mon, Jun 22, 2020 at 11:20:15AM -0500, Goldwyn Rodrigues wrote:
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > 
> > This is to parallelize direct writes within EOF or with direct I/O
> > reads. This covers the race with truncate() accidentally increasing the
> > filesize.
> 
> Please describe the race condition in more detail and how the DIO/EOF
> parallelization is supposed to work.
> 
> > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > ---
> >  fs/btrfs/file.c | 25 +++++++------------------
> >  1 file changed, 7 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> > index aa6be931620b..c446a4aeb867 100644
> > --- a/fs/btrfs/file.c
> > +++ b/fs/btrfs/file.c
> > @@ -1957,12 +1957,18 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
> >  	loff_t endbyte;
> >  	int err;
> >  	size_t count = 0;
> > -	bool relock = false;
> >  	int flags = IOMAP_DIOF_PGINVALID_FAIL;
> >  	int ilock_flags = 0;
> >  
> >  	if (iocb->ki_flags & IOCB_NOWAIT)
> >  		ilock_flags |= BTRFS_ILOCK_TRY;
> > +	/*
> > +	 * If the write DIO within EOF,  use a shared lock
> > +	 */
> > +	if (pos + count <= i_size_read(inode))
> 
> Inode size is now read outside of the inode lock, so it could change
> until the lock is taken a few lines below.

Good catch. This should be re-checked in btrfs_write_check().
Thanks.

> 
> > +		ilock_flags |= BTRFS_ILOCK_SHARED;
> > +	else if (iocb->ki_flags & IOCB_NOWAIT)
> > +		return -EAGAIN;
> >  
> >  	err = btrfs_inode_lock(inode, ilock_flags);
> 
> Is it necessary to revalidate that 'pos + count < i_size' still holds
> when the lock was taken as SHARED?

Yes, and convert the lock, if not.
diff mbox series

Patch

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aa6be931620b..c446a4aeb867 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1957,12 +1957,18 @@  static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	loff_t endbyte;
 	int err;
 	size_t count = 0;
-	bool relock = false;
 	int flags = IOMAP_DIOF_PGINVALID_FAIL;
 	int ilock_flags = 0;
 
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		ilock_flags |= BTRFS_ILOCK_TRY;
+	/*
+	 * If the write DIO within EOF,  use a shared lock
+	 */
+	if (pos + count <= i_size_read(inode))
+		ilock_flags |= BTRFS_ILOCK_SHARED;
+	else if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EAGAIN;
 
 	err = btrfs_inode_lock(inode, ilock_flags);
 	if (err < 0)
@@ -1975,20 +1981,6 @@  static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	if (check_direct_IO(fs_info, from, pos))
 		goto buffered;
 
-	count = iov_iter_count(from);
-	/*
-	 * If the write DIO is beyond the EOF, we need update the isize, but it
-	 * is protected by i_mutex. So we can not unlock the i_mutex at this
-	 * case.
-	 */
-	if (pos + count <= inode->i_size) {
-		inode_unlock(inode);
-		relock = true;
-	} else if (iocb->ki_flags & IOCB_NOWAIT) {
-		err = -EAGAIN;
-		goto out;
-	}
-
 	if (is_sync_kiocb(iocb))
 		flags |= IOMAP_DIOF_WAIT_FOR_COMPLETION;
 
@@ -1997,9 +1989,6 @@  static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 			       flags);
 	up_read(&BTRFS_I(inode)->dio_sem);
 
-	if (relock)
-		inode_lock(inode);
-
 	if (written < 0 || !iov_iter_count(from)) {
 		err = written;
 		goto error;