diff mbox series

fuse: make foffset alignment opt-in for optimum backend performance

Message ID 20240705100449.60891-1-jefflexu@linux.alibaba.com (mailing list archive)
State New
Headers show
Series fuse: make foffset alignment opt-in for optimum backend performance | expand

Commit Message

Jingbo Xu July 5, 2024, 10:04 a.m. UTC
Sometimes the file offset alignment needs to be opt-in to achieve the
optimum performance at the backend store.

For example when ErasureCode [1] is used at the backend store, the
optimum write performance is achieved when the WRITE request is aligned
with the stripe size of ErasureCode.  Otherwise a non-aligned WRITE
request needs to be split at the stripe size boundary.  It is quite
costly to handle these split partial requests, as firstly the whole
stripe to which the split partial request belongs needs to be read out,
then overwrite the read stripe buffer with the request, and finally write
the whole stripe back to the persistent storage.

Thus the backend store can suffer severe performance degradation when
WRITE requests can not fit into one stripe exactly.  The write performance
can be 10x slower when the request is 256KB in size given 4MB stripe size.
Also there can be 50% performance degradation in theory if the request
is not stripe boundary aligned.

Besides, the conveyed test indicates that, the non-alignment issue
becomes more severe when decreasing fuse's max_ratio, maybe partly
because the background writeback now is more likely to run parallelly
with the dirtier.

fuse's max_ratio	ratio of aligned WRITE requests
----------------	-------------------------------
70			99.9%
40			74%
20			45%
10			20%

With the patched version, which makes the alignment constraint opt-in
when constructing WRITE requests, the ratio of aligned WRITE requests
increases to 98% (previously 20%) when fuse's max_ratio is 10.

[1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>

Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
---
 fs/fuse/file.c            | 4 ++++
 fs/fuse/fuse_i.h          | 6 ++++++
 fs/fuse/inode.c           | 9 +++++++++
 include/uapi/linux/fuse.h | 9 ++++++++-
 4 files changed, 27 insertions(+), 1 deletion(-)

Comments

Bernd Schubert July 5, 2024, 11:50 a.m. UTC | #1
On 7/5/24 12:04, Jingbo Xu wrote:
> Sometimes the file offset alignment needs to be opt-in to achieve the
> optimum performance at the backend store.
> 
> For example when ErasureCode [1] is used at the backend store, the
> optimum write performance is achieved when the WRITE request is aligned
> with the stripe size of ErasureCode.  Otherwise a non-aligned WRITE
> request needs to be split at the stripe size boundary.  It is quite
> costly to handle these split partial requests, as firstly the whole
> stripe to which the split partial request belongs needs to be read out,
> then overwrite the read stripe buffer with the request, and finally write
> the whole stripe back to the persistent storage.
> 
> Thus the backend store can suffer severe performance degradation when
> WRITE requests can not fit into one stripe exactly.  The write performance
> can be 10x slower when the request is 256KB in size given 4MB stripe size.
> Also there can be 50% performance degradation in theory if the request
> is not stripe boundary aligned.
> 
> Besides, the conveyed test indicates that, the non-alignment issue
> becomes more severe when decreasing fuse's max_ratio, maybe partly
> because the background writeback now is more likely to run parallelly
> with the dirtier.
> 
> fuse's max_ratio	ratio of aligned WRITE requests
> ----------------	-------------------------------
> 70			99.9%
> 40			74%
> 20			45%
> 10			20%
> 
> With the patched version, which makes the alignment constraint opt-in
> when constructing WRITE requests, the ratio of aligned WRITE requests
> increases to 98% (previously 20%) when fuse's max_ratio is 10.
> 
> [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4
> Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
> 
> Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
> ---
>  fs/fuse/file.c            | 4 ++++
>  fs/fuse/fuse_i.h          | 6 ++++++
>  fs/fuse/inode.c           | 9 +++++++++
>  include/uapi/linux/fuse.h | 9 ++++++++-
>  4 files changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index f39456c65ed7..f9b477016c2e 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -2246,6 +2246,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
>  	if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
>  		return true;
>  
> +	/* Reached alignment */
> +	if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages))
> +		return true;

I link the idea, but couldn't it just do that with 
fc->max_pages? I'm asking because fc->opt_alignment_pages
takes another uint32_t in fuse_init_out and there is not so much
space left anymore.

> +
>  	return false;
>  }
>  
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index f23919610313..5963571b394c 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -860,6 +860,9 @@ struct fuse_conn {
>  	/** Passthrough support for read/write IO */
>  	unsigned int passthrough:1;
>  
> +	/* Foffset alignment required for optimum performance */
> +	unsigned int opt_alignment:1;
> +
>  	/** Maximum stack depth for passthrough backing files */
>  	int max_stack_depth;
>  
> @@ -917,6 +920,9 @@ struct fuse_conn {
>  	/** IDR for backing files ids */
>  	struct idr backing_files_map;
>  #endif
> +
> +	/* The foffset alignment in PAGE_SIZE */
> +	unsigned int opt_alignment_pages;
>  };
>  
>  /*
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 99e44ea7d875..9266b22cce8e 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -1331,6 +1331,15 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
>  			}
>  			if (flags & FUSE_NO_EXPORT_SUPPORT)
>  				fm->sb->s_export_op = &fuse_export_fid_operations;
> +
> +			/* fallback to default if opt_alignment <= PAGE_SHIFT */
> +			if (flags & FUSE_OPT_ALIGNMENT) {
> +				if (arg->opt_alignment > PAGE_SHIFT) {
> +					fc->opt_alignment = 1;
> +					fc->opt_alignment_pages = 1 <<
> +						(arg->opt_alignment - PAGE_SHIFT);

opt_alignment is the number of bits required for alignment? Not 
very user friendly, from my point of view would be better to have
this in a byte or kb unit.


Thanks,
Bernd


> +				}
> +			}
>  		} else {
>  			ra_pages = fc->max_read / PAGE_SIZE;
>  			fc->no_lock = 1;
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index d08b99d60f6f..2c6ad1577591 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -217,6 +217,9 @@
>   *  - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag
>   *  - add FUSE_NO_EXPORT_SUPPORT init flag
>   *  - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag
> + *
> + *  7.41
> + *  - add opt_alignment to fuse_init_out, add FUSE_OPT_ALIGNMENT init flag
>   */
>  
>  #ifndef _LINUX_FUSE_H
> @@ -421,6 +424,8 @@ struct fuse_file_lock {
>   * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support
>   * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit
>   *		    of the request ID indicates resend requests
> + * FUSE_OPT_ALIGNMENT: init_out.opt_alignment contains log2(byte alignment) for
> + *		       foffset alignment for optimum write performance
>   */
>  #define FUSE_ASYNC_READ		(1 << 0)
>  #define FUSE_POSIX_LOCKS	(1 << 1)
> @@ -463,6 +468,7 @@ struct fuse_file_lock {
>  #define FUSE_PASSTHROUGH	(1ULL << 37)
>  #define FUSE_NO_EXPORT_SUPPORT	(1ULL << 38)
>  #define FUSE_HAS_RESEND		(1ULL << 39)
> +#define FUSE_OPT_ALIGNMENT	(1ULL << 40)
>  
>  /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */
>  #define FUSE_DIRECT_IO_RELAX	FUSE_DIRECT_IO_ALLOW_MMAP
> @@ -893,7 +899,8 @@ struct fuse_init_out {
>  	uint16_t	map_alignment;
>  	uint32_t	flags2;
>  	uint32_t	max_stack_depth;
> -	uint32_t	unused[6];
> +	uint32_t	opt_alignment;
> +	uint32_t	unused[5];
>  };
>  
>  #define CUSE_INIT_INFO_MAX 4096
Jingbo Xu July 5, 2024, noon UTC | #2
Hi Bernd,

Thanks for the comment.


On 7/5/24 7:50 PM, Bernd Schubert wrote:
> 
> 
> On 7/5/24 12:04, Jingbo Xu wrote:
>> Sometimes the file offset alignment needs to be opt-in to achieve the
>> optimum performance at the backend store.
>>
>> For example when ErasureCode [1] is used at the backend store, the
>> optimum write performance is achieved when the WRITE request is aligned
>> with the stripe size of ErasureCode.  Otherwise a non-aligned WRITE
>> request needs to be split at the stripe size boundary.  It is quite
>> costly to handle these split partial requests, as firstly the whole
>> stripe to which the split partial request belongs needs to be read out,
>> then overwrite the read stripe buffer with the request, and finally write
>> the whole stripe back to the persistent storage.
>>
>> Thus the backend store can suffer severe performance degradation when
>> WRITE requests can not fit into one stripe exactly.  The write performance
>> can be 10x slower when the request is 256KB in size given 4MB stripe size.
>> Also there can be 50% performance degradation in theory if the request
>> is not stripe boundary aligned.
>>
>> Besides, the conveyed test indicates that, the non-alignment issue
>> becomes more severe when decreasing fuse's max_ratio, maybe partly
>> because the background writeback now is more likely to run parallelly
>> with the dirtier.
>>
>> fuse's max_ratio	ratio of aligned WRITE requests
>> ----------------	-------------------------------
>> 70			99.9%
>> 40			74%
>> 20			45%
>> 10			20%
>>
>> With the patched version, which makes the alignment constraint opt-in
>> when constructing WRITE requests, the ratio of aligned WRITE requests
>> increases to 98% (previously 20%) when fuse's max_ratio is 10.
>>
>> [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4
>> Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
>>
>> Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
>> ---
>>  fs/fuse/file.c            | 4 ++++
>>  fs/fuse/fuse_i.h          | 6 ++++++
>>  fs/fuse/inode.c           | 9 +++++++++
>>  include/uapi/linux/fuse.h | 9 ++++++++-
>>  4 files changed, 27 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index f39456c65ed7..f9b477016c2e 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -2246,6 +2246,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
>>  	if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
>>  		return true;
>>  
>> +	/* Reached alignment */
>> +	if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages))
>> +		return true;
> 
> I link the idea, but couldn't it just do that with 
> fc->max_pages? I'm asking because fc->opt_alignment_pages
> takes another uint32_t in fuse_init_out and there is not so much
> space left anymore.

I'm okay with resuing max_pages as the alignment constraint.  They are
the same in our internal scenarios.  But I'm not sure if it is the case
in other scenarios.



>>  /*
>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
>> index 99e44ea7d875..9266b22cce8e 100644
>> --- a/fs/fuse/inode.c
>> +++ b/fs/fuse/inode.c
>> @@ -1331,6 +1331,15 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
>>  			}
>>  			if (flags & FUSE_NO_EXPORT_SUPPORT)
>>  				fm->sb->s_export_op = &fuse_export_fid_operations;
>> +
>> +			/* fallback to default if opt_alignment <= PAGE_SHIFT */
>> +			if (flags & FUSE_OPT_ALIGNMENT) {
>> +				if (arg->opt_alignment > PAGE_SHIFT) {
>> +					fc->opt_alignment = 1;
>> +					fc->opt_alignment_pages = 1 <<
>> +						(arg->opt_alignment - PAGE_SHIFT);
> 
> opt_alignment is the number of bits required for alignment? Not 
> very user friendly, from my point of view would be better to have
> this in a byte or kb unit.
> 

Actually I referred to fuse_init_out.map_alignment, which is also
log2(byte alignment).  Anyway I'm okay making it a more understandable name.
Miklos Szeredi Aug. 29, 2024, 7:51 a.m. UTC | #3
On Fri, 5 Jul 2024 at 14:00, Jingbo Xu <jefflexu@linux.alibaba.com> wrote:
> I'm okay with resuing max_pages as the alignment constraint.  They are
> the same in our internal scenarios.  But I'm not sure if it is the case
> in other scenarios.

max_pages < alignment makes little sense.

max_pages = n * alignment could make sense, i.e. allow writes that are
whole multiples of the alignment.

I'm not against adding a separate alignment, but it could be just
uint8_t to take up less space in init_out.   We could have done that
with max_stack_depth too.   Oh well...

Thanks,
Miklos
Jingbo Xu Aug. 30, 2024, 2:10 a.m. UTC | #4
On 8/29/24 3:51 PM, Miklos Szeredi wrote:
> On Fri, 5 Jul 2024 at 14:00, Jingbo Xu <jefflexu@linux.alibaba.com> wrote:
>> I'm okay with resuing max_pages as the alignment constraint.  They are
>> the same in our internal scenarios.  But I'm not sure if it is the case
>> in other scenarios.
> 
> max_pages < alignment makes little sense.
> 
> max_pages = n * alignment could make sense, i.e. allow writes that are
> whole multiples of the alignment.

Agreed.

> 
> I'm not against adding a separate alignment, but it could be just
> uint8_t to take up less space in init_out.   We could have done that
> with max_stack_depth too.   Oh well...

Make sense, as the new added fuse_init_out.opt_alignment is already
log2(byte alignment).  I think uint8_t is already adequate in this case.
(Actually I'm going to rename @opt_alignment field of fuse_init_out to
something like @log_opt_align to indicate it's actually a log2() value
as Berned previously suggested)

Besides, I'm not sure if it's worth adding a new init flag, i.e.
FUSE_OPT_ALIGNMENT, as the init flag bits are continually consumed.
Maybe we could stipulate that a zero log_opt_align indicates no
alignment constraint (the default behavior), while a non-zero
log_opt_align indicates an alignment constraint.  However IIUC the user
daemon may or may not zero the unused fields of fuse_init_out.  Thus if
a fuse server not supporting opt_alignment doesn't zero
fuse_init_out.unused, then the kernel side will enforce an alignment
constraint unexpectedly.
diff mbox series

Patch

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f39456c65ed7..f9b477016c2e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2246,6 +2246,10 @@  static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
 	if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
 		return true;
 
+	/* Reached alignment */
+	if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages))
+		return true;
+
 	return false;
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f23919610313..5963571b394c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -860,6 +860,9 @@  struct fuse_conn {
 	/** Passthrough support for read/write IO */
 	unsigned int passthrough:1;
 
+	/* Foffset alignment required for optimum performance */
+	unsigned int opt_alignment:1;
+
 	/** Maximum stack depth for passthrough backing files */
 	int max_stack_depth;
 
@@ -917,6 +920,9 @@  struct fuse_conn {
 	/** IDR for backing files ids */
 	struct idr backing_files_map;
 #endif
+
+	/* The foffset alignment in PAGE_SIZE */
+	unsigned int opt_alignment_pages;
 };
 
 /*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 99e44ea7d875..9266b22cce8e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1331,6 +1331,15 @@  static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 			}
 			if (flags & FUSE_NO_EXPORT_SUPPORT)
 				fm->sb->s_export_op = &fuse_export_fid_operations;
+
+			/* fallback to default if opt_alignment <= PAGE_SHIFT */
+			if (flags & FUSE_OPT_ALIGNMENT) {
+				if (arg->opt_alignment > PAGE_SHIFT) {
+					fc->opt_alignment = 1;
+					fc->opt_alignment_pages = 1 <<
+						(arg->opt_alignment - PAGE_SHIFT);
+				}
+			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index d08b99d60f6f..2c6ad1577591 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -217,6 +217,9 @@ 
  *  - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag
  *  - add FUSE_NO_EXPORT_SUPPORT init flag
  *  - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag
+ *
+ *  7.41
+ *  - add opt_alignment to fuse_init_out, add FUSE_OPT_ALIGNMENT init flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -421,6 +424,8 @@  struct fuse_file_lock {
  * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support
  * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit
  *		    of the request ID indicates resend requests
+ * FUSE_OPT_ALIGNMENT: init_out.opt_alignment contains log2(byte alignment) for
+ *		       foffset alignment for optimum write performance
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -463,6 +468,7 @@  struct fuse_file_lock {
 #define FUSE_PASSTHROUGH	(1ULL << 37)
 #define FUSE_NO_EXPORT_SUPPORT	(1ULL << 38)
 #define FUSE_HAS_RESEND		(1ULL << 39)
+#define FUSE_OPT_ALIGNMENT	(1ULL << 40)
 
 /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */
 #define FUSE_DIRECT_IO_RELAX	FUSE_DIRECT_IO_ALLOW_MMAP
@@ -893,7 +899,8 @@  struct fuse_init_out {
 	uint16_t	map_alignment;
 	uint32_t	flags2;
 	uint32_t	max_stack_depth;
-	uint32_t	unused[6];
+	uint32_t	opt_alignment;
+	uint32_t	unused[5];
 };
 
 #define CUSE_INIT_INFO_MAX 4096