diff mbox series

[v3,1/1] FUSE: Allow non-extending parallel direct writes on the same file.

Message ID 20220520043443.17439-2-dharamhans87@gmail.com (mailing list archive)
State New, archived
Headers show
Series FUSE: Allow non-extending parallel direct writes | expand

Commit Message

Dharmendra Singh May 20, 2022, 4:34 a.m. UTC
From: Dharmendra Singh <dsingh@ddn.com>

In general, as of now, in FUSE, direct writes on the same file are
serialized over inode lock i.e we hold inode lock for the full duration
of the write request. I could not found in fuse code a comment which
clearly explains why this exclusive lock is taken for direct writes.
Our guess is some USER space fuse implementations might be relying
on this lock for seralization and also it protects for the issues
arising due to file size assumption or write failures.  This patch
relaxes this exclusive lock in some cases of direct writes.

With these changes, we allows non-extending parallel direct writes
on the same file with the help of a flag called FOPEN_PARALLEL_WRITES.
If this flag is set on the file (flag is passed from libfuse to fuse
kernel as part of file open/create), we do not take exclusive lock instead
use shared lock so that all non-extending writes can run in parallel.

Best practise would be to enable parallel direct writes of all kinds
including extending writes as well but we see some issues such as
when one write completes and other fails, how we should truncate(if
needed) the file if underlying file system does not support holes
(For file systems which supports holes, there might be a possibility
of enabling parallel writes for all cases).

FUSE implementations which rely on this inode lock for serialisation
can continue to do so and this is default behaviour i.e no parallel
direct writes.

Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
Signed-off-by: Bernd Schubert <bschubert@ddn.com>
---
 fs/fuse/file.c            | 33 ++++++++++++++++++++++++++++++---
 include/uapi/linux/fuse.h |  2 ++
 2 files changed, 32 insertions(+), 3 deletions(-)

Comments

Vivek Goyal May 25, 2022, 6:41 p.m. UTC | #1
On Fri, May 20, 2022 at 10:04:43AM +0530, Dharmendra Singh wrote:
> From: Dharmendra Singh <dsingh@ddn.com>
> 
> In general, as of now, in FUSE, direct writes on the same file are
> serialized over inode lock i.e we hold inode lock for the full duration
> of the write request. I could not found in fuse code a comment which
> clearly explains why this exclusive lock is taken for direct writes.
> Our guess is some USER space fuse implementations might be relying
> on this lock for seralization and also it protects for the issues
> arising due to file size assumption or write failures.  This patch
> relaxes this exclusive lock in some cases of direct writes.
> 
> With these changes, we allows non-extending parallel direct writes
> on the same file with the help of a flag called FOPEN_PARALLEL_WRITES.
> If this flag is set on the file (flag is passed from libfuse to fuse
> kernel as part of file open/create), we do not take exclusive lock instead
> use shared lock so that all non-extending writes can run in parallel.
> 
> Best practise would be to enable parallel direct writes of all kinds
> including extending writes as well but we see some issues such as
> when one write completes and other fails, how we should truncate(if
> needed) the file if underlying file system does not support holes
> (For file systems which supports holes, there might be a possibility
> of enabling parallel writes for all cases).
> 
> FUSE implementations which rely on this inode lock for serialisation
> can continue to do so and this is default behaviour i.e no parallel
> direct writes.
> 
> Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
> ---
>  fs/fuse/file.c            | 33 ++++++++++++++++++++++++++++++---
>  include/uapi/linux/fuse.h |  2 ++
>  2 files changed, 32 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 829094451774..1a93fd80a6ce 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1541,14 +1541,37 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	return res;
>  }
>  
> +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
> +					       struct iov_iter *iter)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	return (iocb->ki_flags & IOCB_APPEND ||
> +		iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode));
> +}

Hi Dharmendra,

I have a question. What makes i_size stable. This is being read outside
the inode_lock(). Can it race with truncate. I mean we checked
i_size and decided to take shared lock. In the mean time another thread
truncated the file and now our decision to take shared lock is wrong
as file will be extended due to direct write?

Thanks
Vivek

> +
>  static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct file *file = iocb->ki_filp;
> +	struct fuse_file *ff = file->private_data;
>  	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
>  	ssize_t res;
> +	bool exclusive_lock = !(ff->open_flags & FOPEN_PARALLEL_WRITES) ||
> +			       fuse_direct_write_extending_i_size(iocb, from);
> +
> +	/*
> +	 * Take exclusive lock if
> +	 * - parallel writes are disabled.
> +	 * - parallel writes are enabled and i_size is being extended
> +	 * Take shared lock if
> +	 * - parallel writes are enabled but i_size does not extend.
> +	 */
> +	if (exclusive_lock)
> +		inode_lock(inode);
> +	else
> +		inode_lock_shared(inode);
>  
> -	/* Don't allow parallel writes to the same file */
> -	inode_lock(inode);
>  	res = generic_write_checks(iocb, from);
>  	if (res > 0) {
>  		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
> @@ -1559,7 +1582,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  			fuse_write_update_attr(inode, iocb->ki_pos, res);
>  		}
>  	}
> -	inode_unlock(inode);
> +	if (exclusive_lock)
> +		inode_unlock(inode);
> +	else
> +		inode_unlock_shared(inode);
>  
>  	return res;
>  }
> @@ -2901,6 +2927,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>  
>  	if (iov_iter_rw(iter) == WRITE) {
>  		fuse_write_update_attr(inode, pos, ret);
> +		/* For extending writes we already hold exclusive lock */
>  		if (ret < 0 && offset + count > i_size)
>  			fuse_do_truncate(file);
>  	}
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index d6ccee961891..ee5379d41906 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -301,6 +301,7 @@ struct fuse_file_lock {
>   * FOPEN_CACHE_DIR: allow caching this directory
>   * FOPEN_STREAM: the file is stream-like (no file position at all)
>   * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
> + * FOPEN_PARALLEL_WRITES: Allow concurrent writes on the same inode
>   */
>  #define FOPEN_DIRECT_IO		(1 << 0)
>  #define FOPEN_KEEP_CACHE	(1 << 1)
> @@ -308,6 +309,7 @@ struct fuse_file_lock {
>  #define FOPEN_CACHE_DIR		(1 << 3)
>  #define FOPEN_STREAM		(1 << 4)
>  #define FOPEN_NOFLUSH		(1 << 5)
> +#define FOPEN_PARALLEL_WRITES	(1 << 6)
>  
>  /**
>   * INIT request/reply flags
> -- 
> 2.17.1
>
Bernd Schubert May 25, 2022, 7:06 p.m. UTC | #2
On 5/25/22 20:41, Vivek Goyal wrote:
> On Fri, May 20, 2022 at 10:04:43AM +0530, Dharmendra Singh wrote:
>> From: Dharmendra Singh <dsingh@ddn.com>
>>
>> In general, as of now, in FUSE, direct writes on the same file are
>> serialized over inode lock i.e we hold inode lock for the full duration
>> of the write request. I could not found in fuse code a comment which
>> clearly explains why this exclusive lock is taken for direct writes.
>> Our guess is some USER space fuse implementations might be relying
>> on this lock for seralization and also it protects for the issues
>> arising due to file size assumption or write failures.  This patch
>> relaxes this exclusive lock in some cases of direct writes.
>>
>> With these changes, we allows non-extending parallel direct writes
>> on the same file with the help of a flag called FOPEN_PARALLEL_WRITES.
>> If this flag is set on the file (flag is passed from libfuse to fuse
>> kernel as part of file open/create), we do not take exclusive lock instead
>> use shared lock so that all non-extending writes can run in parallel.
>>
>> Best practise would be to enable parallel direct writes of all kinds
>> including extending writes as well but we see some issues such as
>> when one write completes and other fails, how we should truncate(if
>> needed) the file if underlying file system does not support holes
>> (For file systems which supports holes, there might be a possibility
>> of enabling parallel writes for all cases).
>>
>> FUSE implementations which rely on this inode lock for serialisation
>> can continue to do so and this is default behaviour i.e no parallel
>> direct writes.
>>
>> Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
>> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
>> ---
>>   fs/fuse/file.c            | 33 ++++++++++++++++++++++++++++++---
>>   include/uapi/linux/fuse.h |  2 ++
>>   2 files changed, 32 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index 829094451774..1a93fd80a6ce 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -1541,14 +1541,37 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>   	return res;
>>   }
>>   
>> +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
>> +					       struct iov_iter *iter)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +
>> +	return (iocb->ki_flags & IOCB_APPEND ||
>> +		iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode));
>> +}
> 
> Hi Dharmendra,
> 
> I have a question. What makes i_size stable. This is being read outside
> the inode_lock(). Can it race with truncate. I mean we checked
> i_size and decided to take shared lock. In the mean time another thread
> truncated the file and now our decision to take shared lock is wrong
> as file will be extended due to direct write?

Oh right, good catch! I guess we need to take a shared lock first, read 
the size and if it is an extending lock need to unlock/switch to an 
excluding lock. Theoretically could be a loop, but I guess that would be 
overkill.


Thanks for your review!

Cheers,
Bernd
Vivek Goyal May 25, 2022, 7:12 p.m. UTC | #3
On Wed, May 25, 2022 at 09:06:37PM +0200, Bernd Schubert wrote:
> 
> 
> On 5/25/22 20:41, Vivek Goyal wrote:
> > On Fri, May 20, 2022 at 10:04:43AM +0530, Dharmendra Singh wrote:
> > > From: Dharmendra Singh <dsingh@ddn.com>
> > > 
> > > In general, as of now, in FUSE, direct writes on the same file are
> > > serialized over inode lock i.e we hold inode lock for the full duration
> > > of the write request. I could not found in fuse code a comment which
> > > clearly explains why this exclusive lock is taken for direct writes.
> > > Our guess is some USER space fuse implementations might be relying
> > > on this lock for seralization and also it protects for the issues
> > > arising due to file size assumption or write failures.  This patch
> > > relaxes this exclusive lock in some cases of direct writes.
> > > 
> > > With these changes, we allows non-extending parallel direct writes
> > > on the same file with the help of a flag called FOPEN_PARALLEL_WRITES.
> > > If this flag is set on the file (flag is passed from libfuse to fuse
> > > kernel as part of file open/create), we do not take exclusive lock instead
> > > use shared lock so that all non-extending writes can run in parallel.
> > > 
> > > Best practise would be to enable parallel direct writes of all kinds
> > > including extending writes as well but we see some issues such as
> > > when one write completes and other fails, how we should truncate(if
> > > needed) the file if underlying file system does not support holes
> > > (For file systems which supports holes, there might be a possibility
> > > of enabling parallel writes for all cases).
> > > 
> > > FUSE implementations which rely on this inode lock for serialisation
> > > can continue to do so and this is default behaviour i.e no parallel
> > > direct writes.
> > > 
> > > Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
> > > Signed-off-by: Bernd Schubert <bschubert@ddn.com>
> > > ---
> > >   fs/fuse/file.c            | 33 ++++++++++++++++++++++++++++++---
> > >   include/uapi/linux/fuse.h |  2 ++
> > >   2 files changed, 32 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> > > index 829094451774..1a93fd80a6ce 100644
> > > --- a/fs/fuse/file.c
> > > +++ b/fs/fuse/file.c
> > > @@ -1541,14 +1541,37 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >   	return res;
> > >   }
> > > +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
> > > +					       struct iov_iter *iter)
> > > +{
> > > +	struct inode *inode = file_inode(iocb->ki_filp);
> > > +
> > > +	return (iocb->ki_flags & IOCB_APPEND ||
> > > +		iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode));
> > > +}
> > 
> > Hi Dharmendra,
> > 
> > I have a question. What makes i_size stable. This is being read outside
> > the inode_lock(). Can it race with truncate. I mean we checked
> > i_size and decided to take shared lock. In the mean time another thread
> > truncated the file and now our decision to take shared lock is wrong
> > as file will be extended due to direct write?
> 
> Oh right, good catch! I guess we need to take a shared lock first, read the
> size and if it is an extending lock need to unlock/switch to an excluding
> lock. Theoretically could be a loop, but I guess that would be overkill.

Another possibility is that we need to check again after taking shared
lock if our decision to take shared lock was correct or not. If in
unlikely case something changed, then drop shared lock and go back to
exclusive lock, or drop the shared lock and re-evaluate which lock to take.

Thanks
Vivek

> 
> 
> Thanks for your review!
> 
> Cheers,
> Bernd
>
Vivek Goyal May 25, 2022, 8:31 p.m. UTC | #4
On Fri, May 20, 2022 at 10:04:43AM +0530, Dharmendra Singh wrote:
> From: Dharmendra Singh <dsingh@ddn.com>
> 
> In general, as of now, in FUSE, direct writes on the same file are
> serialized over inode lock i.e we hold inode lock for the full duration
> of the write request. I could not found in fuse code a comment which
> clearly explains why this exclusive lock is taken for direct writes.
> Our guess is some USER space fuse implementations might be relying
> on this lock for seralization and also it protects for the issues
> arising due to file size assumption or write failures.  This patch
> relaxes this exclusive lock in some cases of direct writes.

I have this question as well. My understanding was that in general,
reads can do shared lock while writes have to take exclusive lock.
And I assumed that extends to both buffered as well as direct
writes.

I would also like to understand what's the fundamental restriction
and why O_DIRECT is special that this restriction does not apply.

Is any other file system doing this as well?

If fuse server dir is shared with other fuse clients, it is possible
that i_size in this client is stale. Will that be a problem. I guess
if that's the problem then, even a single write will be a problem
because two fuse clients might be trying to write.

Just trying to make sure that it is safe to allow parallel direct
writes.

Thanks
Vivek

> 
> With these changes, we allows non-extending parallel direct writes
> on the same file with the help of a flag called FOPEN_PARALLEL_WRITES.
> If this flag is set on the file (flag is passed from libfuse to fuse
> kernel as part of file open/create), we do not take exclusive lock instead
> use shared lock so that all non-extending writes can run in parallel.
> 
> Best practise would be to enable parallel direct writes of all kinds
> including extending writes as well but we see some issues such as
> when one write completes and other fails, how we should truncate(if
> needed) the file if underlying file system does not support holes
> (For file systems which supports holes, there might be a possibility
> of enabling parallel writes for all cases).
> 
> FUSE implementations which rely on this inode lock for serialisation
> can continue to do so and this is default behaviour i.e no parallel
> direct writes.
> 
> Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
> ---
>  fs/fuse/file.c            | 33 ++++++++++++++++++++++++++++++---
>  include/uapi/linux/fuse.h |  2 ++
>  2 files changed, 32 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 829094451774..1a93fd80a6ce 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1541,14 +1541,37 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	return res;
>  }
>  
> +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
> +					       struct iov_iter *iter)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	return (iocb->ki_flags & IOCB_APPEND ||
> +		iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode));
> +}
> +
>  static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct file *file = iocb->ki_filp;
> +	struct fuse_file *ff = file->private_data;
>  	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
>  	ssize_t res;
> +	bool exclusive_lock = !(ff->open_flags & FOPEN_PARALLEL_WRITES) ||
> +			       fuse_direct_write_extending_i_size(iocb, from);
> +
> +	/*
> +	 * Take exclusive lock if
> +	 * - parallel writes are disabled.
> +	 * - parallel writes are enabled and i_size is being extended
> +	 * Take shared lock if
> +	 * - parallel writes are enabled but i_size does not extend.
> +	 */
> +	if (exclusive_lock)
> +		inode_lock(inode);
> +	else
> +		inode_lock_shared(inode);
>  
> -	/* Don't allow parallel writes to the same file */
> -	inode_lock(inode);
>  	res = generic_write_checks(iocb, from);
>  	if (res > 0) {
>  		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
> @@ -1559,7 +1582,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  			fuse_write_update_attr(inode, iocb->ki_pos, res);
>  		}
>  	}
> -	inode_unlock(inode);
> +	if (exclusive_lock)
> +		inode_unlock(inode);
> +	else
> +		inode_unlock_shared(inode);
>  
>  	return res;
>  }
> @@ -2901,6 +2927,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>  
>  	if (iov_iter_rw(iter) == WRITE) {
>  		fuse_write_update_attr(inode, pos, ret);
> +		/* For extending writes we already hold exclusive lock */
>  		if (ret < 0 && offset + count > i_size)
>  			fuse_do_truncate(file);
>  	}
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index d6ccee961891..ee5379d41906 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -301,6 +301,7 @@ struct fuse_file_lock {
>   * FOPEN_CACHE_DIR: allow caching this directory
>   * FOPEN_STREAM: the file is stream-like (no file position at all)
>   * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
> + * FOPEN_PARALLEL_WRITES: Allow concurrent writes on the same inode
>   */
>  #define FOPEN_DIRECT_IO		(1 << 0)
>  #define FOPEN_KEEP_CACHE	(1 << 1)
> @@ -308,6 +309,7 @@ struct fuse_file_lock {
>  #define FOPEN_CACHE_DIR		(1 << 3)
>  #define FOPEN_STREAM		(1 << 4)
>  #define FOPEN_NOFLUSH		(1 << 5)
> +#define FOPEN_PARALLEL_WRITES	(1 << 6)
>  
>  /**
>   * INIT request/reply flags
> -- 
> 2.17.1
>
Bernd Schubert May 25, 2022, 8:49 p.m. UTC | #5
On 5/25/22 22:31, Vivek Goyal wrote:
> On Fri, May 20, 2022 at 10:04:43AM +0530, Dharmendra Singh wrote:
>> From: Dharmendra Singh <dsingh@ddn.com>
>>
>> In general, as of now, in FUSE, direct writes on the same file are
>> serialized over inode lock i.e we hold inode lock for the full duration
>> of the write request. I could not found in fuse code a comment which
>> clearly explains why this exclusive lock is taken for direct writes.
>> Our guess is some USER space fuse implementations might be relying
>> on this lock for seralization and also it protects for the issues
>> arising due to file size assumption or write failures.  This patch
>> relaxes this exclusive lock in some cases of direct writes.
> 
> I have this question as well. My understanding was that in general,
> reads can do shared lock while writes have to take exclusive lock.
> And I assumed that extends to both buffered as well as direct
> writes.
> 
> I would also like to understand what's the fundamental restriction
> and why O_DIRECT is special that this restriction does not apply.
> 
> Is any other file system doing this as well?
> 
> If fuse server dir is shared with other fuse clients, it is possible
> that i_size in this client is stale. Will that be a problem. I guess
> if that's the problem then, even a single write will be a problem
> because two fuse clients might be trying to write.
> 
> Just trying to make sure that it is safe to allow parallel direct
> writes.

I think missing in this series is to add a comment when this lock is 
needed at all. Our network file system is log structured - any parallel 
writes to the same file from different remote clients are handled 
through addition of fragments on the network server side - lockless safe 
due to byte level accuracy. With the exception of conflicting writes - 
last client wins - application is then doing 'silly' things - locking 
would not help either. And random parallel writes from the same 
(network) client are even an ideal case for us, as that is handled 
through shared blocks for different fragments (file offset + len). So 
for us shared writes are totally safe.

When Dharmendra and I discussed about the lock we came up with a few 
write error handling cases where that lock might be needed - I guess 
that should be added as comment.
Vivek Goyal May 26, 2022, 6:14 p.m. UTC | #6
On Wed, May 25, 2022 at 10:49:49PM +0200, Bernd Schubert wrote:
> 
> 
> On 5/25/22 22:31, Vivek Goyal wrote:
> > On Fri, May 20, 2022 at 10:04:43AM +0530, Dharmendra Singh wrote:
> > > From: Dharmendra Singh <dsingh@ddn.com>
> > > 
> > > In general, as of now, in FUSE, direct writes on the same file are
> > > serialized over inode lock i.e we hold inode lock for the full duration
> > > of the write request. I could not found in fuse code a comment which
> > > clearly explains why this exclusive lock is taken for direct writes.
> > > Our guess is some USER space fuse implementations might be relying
> > > on this lock for seralization and also it protects for the issues
> > > arising due to file size assumption or write failures.  This patch
> > > relaxes this exclusive lock in some cases of direct writes.
> > 
> > I have this question as well. My understanding was that in general,
> > reads can do shared lock while writes have to take exclusive lock.
> > And I assumed that extends to both buffered as well as direct
> > writes.
> > 
> > I would also like to understand what's the fundamental restriction
> > and why O_DIRECT is special that this restriction does not apply.
> > 
> > Is any other file system doing this as well?
> > 
> > If fuse server dir is shared with other fuse clients, it is possible
> > that i_size in this client is stale. Will that be a problem. I guess
> > if that's the problem then, even a single write will be a problem
> > because two fuse clients might be trying to write.
> > 
> > Just trying to make sure that it is safe to allow parallel direct
> > writes.
> 
> I think missing in this series is to add a comment when this lock is needed
> at all. Our network file system is log structured - any parallel writes to
> the same file from different remote clients are handled through addition of
> fragments on the network server side - lockless safe due to byte level
> accuracy. With the exception of conflicting writes - last client wins -
> application is then doing 'silly' things - locking would not help either.
> And random parallel writes from the same (network) client are even an ideal
> case for us, as that is handled through shared blocks for different
> fragments (file offset + len). So for us shared writes are totally safe.
> 
> When Dharmendra and I discussed about the lock we came up with a few write
> error handling cases where that lock might be needed - I guess that should
> be added as comment.

Right, please add the changelogs to make thought process clear.

So there are no restrictions on the fuse client side from parallelism
point of view on direct write path? 

Why file extending writes are not safe? Is this a restriction from
fuse client point of view or just being safe from server point of view.
If fuse user space is talking to another filesystem, I guess then it
is not problem because that filesystem will take care of locking as
needed.

I see ext4 is allowing parallel direct writes for certain cases. And
where they can't allow it, they have documented it in comments.
(ext4_dio_write_checks()). I think we need similar rationalization,
especially from fuse client's perspective and have comments in code
and specify when it is ok to have parallel direct writes and when it
is not ok and why. This will help people when they are looking at
the code later.

Thanks
Vivek
diff mbox series

Patch

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 829094451774..1a93fd80a6ce 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1541,14 +1541,37 @@  static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return res;
 }
 
+static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
+					       struct iov_iter *iter)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	return (iocb->ki_flags & IOCB_APPEND ||
+		iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode));
+}
+
 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 	ssize_t res;
+	bool exclusive_lock = !(ff->open_flags & FOPEN_PARALLEL_WRITES) ||
+			       fuse_direct_write_extending_i_size(iocb, from);
+
+	/*
+	 * Take exclusive lock if
+	 * - parallel writes are disabled.
+	 * - parallel writes are enabled and i_size is being extended
+	 * Take shared lock if
+	 * - parallel writes are enabled but i_size does not extend.
+	 */
+	if (exclusive_lock)
+		inode_lock(inode);
+	else
+		inode_lock_shared(inode);
 
-	/* Don't allow parallel writes to the same file */
-	inode_lock(inode);
 	res = generic_write_checks(iocb, from);
 	if (res > 0) {
 		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
@@ -1559,7 +1582,10 @@  static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			fuse_write_update_attr(inode, iocb->ki_pos, res);
 		}
 	}
-	inode_unlock(inode);
+	if (exclusive_lock)
+		inode_unlock(inode);
+	else
+		inode_unlock_shared(inode);
 
 	return res;
 }
@@ -2901,6 +2927,7 @@  fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 	if (iov_iter_rw(iter) == WRITE) {
 		fuse_write_update_attr(inode, pos, ret);
+		/* For extending writes we already hold exclusive lock */
 		if (ret < 0 && offset + count > i_size)
 			fuse_do_truncate(file);
 	}
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index d6ccee961891..ee5379d41906 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -301,6 +301,7 @@  struct fuse_file_lock {
  * FOPEN_CACHE_DIR: allow caching this directory
  * FOPEN_STREAM: the file is stream-like (no file position at all)
  * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
+ * FOPEN_PARALLEL_WRITES: Allow concurrent writes on the same inode
  */
 #define FOPEN_DIRECT_IO		(1 << 0)
 #define FOPEN_KEEP_CACHE	(1 << 1)
@@ -308,6 +309,7 @@  struct fuse_file_lock {
 #define FOPEN_CACHE_DIR		(1 << 3)
 #define FOPEN_STREAM		(1 << 4)
 #define FOPEN_NOFLUSH		(1 << 5)
+#define FOPEN_PARALLEL_WRITES	(1 << 6)
 
 /**
  * INIT request/reply flags