Message ID | 20240705100449.60891-1-jefflexu@linux.alibaba.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | fuse: make foffset alignment opt-in for optimum backend performance | expand |
On 7/5/24 12:04, Jingbo Xu wrote: > Sometimes the file offset alignment needs to be opt-in to achieve the > optimum performance at the backend store. > > For example when ErasureCode [1] is used at the backend store, the > optimum write performance is achieved when the WRITE request is aligned > with the stripe size of ErasureCode. Otherwise a non-aligned WRITE > request needs to be split at the stripe size boundary. It is quite > costly to handle these split partial requests, as firstly the whole > stripe to which the split partial request belongs needs to be read out, > then overwrite the read stripe buffer with the request, and finally write > the whole stripe back to the persistent storage. > > Thus the backend store can suffer severe performance degradation when > WRITE requests can not fit into one stripe exactly. The write performance > can be 10x slower when the request is 256KB in size given 4MB stripe size. > Also there can be 50% performance degradation in theory if the request > is not stripe boundary aligned. > > Besides, the conveyed test indicates that, the non-alignment issue > becomes more severe when decreasing fuse's max_ratio, maybe partly > because the background writeback now is more likely to run parallelly > with the dirtier. > > fuse's max_ratio ratio of aligned WRITE requests > ---------------- ------------------------------- > 70 99.9% > 40 74% > 20 45% > 10 20% > > With the patched version, which makes the alignment constraint opt-in > when constructing WRITE requests, the ratio of aligned WRITE requests > increases to 98% (previously 20%) when fuse's max_ratio is 10. > > [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4 > Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com> > > Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com> > --- > fs/fuse/file.c | 4 ++++ > fs/fuse/fuse_i.h | 6 ++++++ > fs/fuse/inode.c | 9 +++++++++ > include/uapi/linux/fuse.h | 9 ++++++++- > 4 files changed, 27 insertions(+), 1 deletion(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index f39456c65ed7..f9b477016c2e 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -2246,6 +2246,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, > if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) > return true; > > + /* Reached alignment */ > + if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages)) > + return true; I link the idea, but couldn't it just do that with fc->max_pages? I'm asking because fc->opt_alignment_pages takes another uint32_t in fuse_init_out and there is not so much space left anymore. > + > return false; > } > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index f23919610313..5963571b394c 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -860,6 +860,9 @@ struct fuse_conn { > /** Passthrough support for read/write IO */ > unsigned int passthrough:1; > > + /* Foffset alignment required for optimum performance */ > + unsigned int opt_alignment:1; > + > /** Maximum stack depth for passthrough backing files */ > int max_stack_depth; > > @@ -917,6 +920,9 @@ struct fuse_conn { > /** IDR for backing files ids */ > struct idr backing_files_map; > #endif > + > + /* The foffset alignment in PAGE_SIZE */ > + unsigned int opt_alignment_pages; > }; > > /* > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 99e44ea7d875..9266b22cce8e 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -1331,6 +1331,15 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > } > if (flags & FUSE_NO_EXPORT_SUPPORT) > fm->sb->s_export_op = &fuse_export_fid_operations; > + > + /* fallback to default if opt_alignment <= PAGE_SHIFT */ > + if (flags & FUSE_OPT_ALIGNMENT) { > + if (arg->opt_alignment > PAGE_SHIFT) { > + fc->opt_alignment = 1; > + fc->opt_alignment_pages = 1 << > + (arg->opt_alignment - PAGE_SHIFT); opt_alignment is the number of bits required for alignment? Not very user friendly, from my point of view would be better to have this in a byte or kb unit. Thanks, Bernd > + } > + } > } else { > ra_pages = fc->max_read / PAGE_SIZE; > fc->no_lock = 1; > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index d08b99d60f6f..2c6ad1577591 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -217,6 +217,9 @@ > * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag > * - add FUSE_NO_EXPORT_SUPPORT init flag > * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag > + * > + * 7.41 > + * - add opt_alignment to fuse_init_out, add FUSE_OPT_ALIGNMENT init flag > */ > > #ifndef _LINUX_FUSE_H > @@ -421,6 +424,8 @@ struct fuse_file_lock { > * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support > * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit > * of the request ID indicates resend requests > + * FUSE_OPT_ALIGNMENT: init_out.opt_alignment contains log2(byte alignment) for > + * foffset alignment for optimum write performance > */ > #define FUSE_ASYNC_READ (1 << 0) > #define FUSE_POSIX_LOCKS (1 << 1) > @@ -463,6 +468,7 @@ struct fuse_file_lock { > #define FUSE_PASSTHROUGH (1ULL << 37) > #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) > #define FUSE_HAS_RESEND (1ULL << 39) > +#define FUSE_OPT_ALIGNMENT (1ULL << 40) > > /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ > #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP > @@ -893,7 +899,8 @@ struct fuse_init_out { > uint16_t map_alignment; > uint32_t flags2; > uint32_t max_stack_depth; > - uint32_t unused[6]; > + uint32_t opt_alignment; > + uint32_t unused[5]; > }; > > #define CUSE_INIT_INFO_MAX 4096
Hi Bernd, Thanks for the comment. On 7/5/24 7:50 PM, Bernd Schubert wrote: > > > On 7/5/24 12:04, Jingbo Xu wrote: >> Sometimes the file offset alignment needs to be opt-in to achieve the >> optimum performance at the backend store. >> >> For example when ErasureCode [1] is used at the backend store, the >> optimum write performance is achieved when the WRITE request is aligned >> with the stripe size of ErasureCode. Otherwise a non-aligned WRITE >> request needs to be split at the stripe size boundary. It is quite >> costly to handle these split partial requests, as firstly the whole >> stripe to which the split partial request belongs needs to be read out, >> then overwrite the read stripe buffer with the request, and finally write >> the whole stripe back to the persistent storage. >> >> Thus the backend store can suffer severe performance degradation when >> WRITE requests can not fit into one stripe exactly. The write performance >> can be 10x slower when the request is 256KB in size given 4MB stripe size. >> Also there can be 50% performance degradation in theory if the request >> is not stripe boundary aligned. >> >> Besides, the conveyed test indicates that, the non-alignment issue >> becomes more severe when decreasing fuse's max_ratio, maybe partly >> because the background writeback now is more likely to run parallelly >> with the dirtier. >> >> fuse's max_ratio ratio of aligned WRITE requests >> ---------------- ------------------------------- >> 70 99.9% >> 40 74% >> 20 45% >> 10 20% >> >> With the patched version, which makes the alignment constraint opt-in >> when constructing WRITE requests, the ratio of aligned WRITE requests >> increases to 98% (previously 20%) when fuse's max_ratio is 10. >> >> [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4 >> Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com> >> >> Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com> >> --- >> fs/fuse/file.c | 4 ++++ >> fs/fuse/fuse_i.h | 6 ++++++ >> fs/fuse/inode.c | 9 +++++++++ >> include/uapi/linux/fuse.h | 9 ++++++++- >> 4 files changed, 27 insertions(+), 1 deletion(-) >> >> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >> index f39456c65ed7..f9b477016c2e 100644 >> --- a/fs/fuse/file.c >> +++ b/fs/fuse/file.c >> @@ -2246,6 +2246,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, >> if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) >> return true; >> >> + /* Reached alignment */ >> + if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages)) >> + return true; > > I link the idea, but couldn't it just do that with > fc->max_pages? I'm asking because fc->opt_alignment_pages > takes another uint32_t in fuse_init_out and there is not so much > space left anymore. I'm okay with resuing max_pages as the alignment constraint. They are the same in our internal scenarios. But I'm not sure if it is the case in other scenarios. >> /* >> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c >> index 99e44ea7d875..9266b22cce8e 100644 >> --- a/fs/fuse/inode.c >> +++ b/fs/fuse/inode.c >> @@ -1331,6 +1331,15 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, >> } >> if (flags & FUSE_NO_EXPORT_SUPPORT) >> fm->sb->s_export_op = &fuse_export_fid_operations; >> + >> + /* fallback to default if opt_alignment <= PAGE_SHIFT */ >> + if (flags & FUSE_OPT_ALIGNMENT) { >> + if (arg->opt_alignment > PAGE_SHIFT) { >> + fc->opt_alignment = 1; >> + fc->opt_alignment_pages = 1 << >> + (arg->opt_alignment - PAGE_SHIFT); > > opt_alignment is the number of bits required for alignment? Not > very user friendly, from my point of view would be better to have > this in a byte or kb unit. > Actually I referred to fuse_init_out.map_alignment, which is also log2(byte alignment). Anyway I'm okay making it a more understandable name.
On Fri, 5 Jul 2024 at 14:00, Jingbo Xu <jefflexu@linux.alibaba.com> wrote: > I'm okay with resuing max_pages as the alignment constraint. They are > the same in our internal scenarios. But I'm not sure if it is the case > in other scenarios. max_pages < alignment makes little sense. max_pages = n * alignment could make sense, i.e. allow writes that are whole multiples of the alignment. I'm not against adding a separate alignment, but it could be just uint8_t to take up less space in init_out. We could have done that with max_stack_depth too. Oh well... Thanks, Miklos
On 8/29/24 3:51 PM, Miklos Szeredi wrote: > On Fri, 5 Jul 2024 at 14:00, Jingbo Xu <jefflexu@linux.alibaba.com> wrote: >> I'm okay with resuing max_pages as the alignment constraint. They are >> the same in our internal scenarios. But I'm not sure if it is the case >> in other scenarios. > > max_pages < alignment makes little sense. > > max_pages = n * alignment could make sense, i.e. allow writes that are > whole multiples of the alignment. Agreed. > > I'm not against adding a separate alignment, but it could be just > uint8_t to take up less space in init_out. We could have done that > with max_stack_depth too. Oh well... Make sense, as the new added fuse_init_out.opt_alignment is already log2(byte alignment). I think uint8_t is already adequate in this case. (Actually I'm going to rename @opt_alignment field of fuse_init_out to something like @log_opt_align to indicate it's actually a log2() value as Berned previously suggested) Besides, I'm not sure if it's worth adding a new init flag, i.e. FUSE_OPT_ALIGNMENT, as the init flag bits are continually consumed. Maybe we could stipulate that a zero log_opt_align indicates no alignment constraint (the default behavior), while a non-zero log_opt_align indicates an alignment constraint. However IIUC the user daemon may or may not zero the unused fields of fuse_init_out. Thus if a fuse server not supporting opt_alignment doesn't zero fuse_init_out.unused, then the kernel side will enforce an alignment constraint unexpectedly.
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f39456c65ed7..f9b477016c2e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2246,6 +2246,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) return true; + /* Reached alignment */ + if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages)) + return true; + return false; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f23919610313..5963571b394c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -860,6 +860,9 @@ struct fuse_conn { /** Passthrough support for read/write IO */ unsigned int passthrough:1; + /* Foffset alignment required for optimum performance */ + unsigned int opt_alignment:1; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -917,6 +920,9 @@ struct fuse_conn { /** IDR for backing files ids */ struct idr backing_files_map; #endif + + /* The foffset alignment in PAGE_SIZE */ + unsigned int opt_alignment_pages; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 99e44ea7d875..9266b22cce8e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1331,6 +1331,15 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; + + /* fallback to default if opt_alignment <= PAGE_SHIFT */ + if (flags & FUSE_OPT_ALIGNMENT) { + if (arg->opt_alignment > PAGE_SHIFT) { + fc->opt_alignment = 1; + fc->opt_alignment_pages = 1 << + (arg->opt_alignment - PAGE_SHIFT); + } + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d08b99d60f6f..2c6ad1577591 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -217,6 +217,9 @@ * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag * - add FUSE_NO_EXPORT_SUPPORT init flag * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag + * + * 7.41 + * - add opt_alignment to fuse_init_out, add FUSE_OPT_ALIGNMENT init flag */ #ifndef _LINUX_FUSE_H @@ -421,6 +424,8 @@ struct fuse_file_lock { * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit * of the request ID indicates resend requests + * FUSE_OPT_ALIGNMENT: init_out.opt_alignment contains log2(byte alignment) for + * foffset alignment for optimum write performance */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -463,6 +468,7 @@ struct fuse_file_lock { #define FUSE_PASSTHROUGH (1ULL << 37) #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) #define FUSE_HAS_RESEND (1ULL << 39) +#define FUSE_OPT_ALIGNMENT (1ULL << 40) /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP @@ -893,7 +899,8 @@ struct fuse_init_out { uint16_t map_alignment; uint32_t flags2; uint32_t max_stack_depth; - uint32_t unused[6]; + uint32_t opt_alignment; + uint32_t unused[5]; }; #define CUSE_INIT_INFO_MAX 4096