Message ID | 1574129643-14664-3-git-send-email-jiufei.xue@linux.alibaba.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ovl: implement async IO routines | expand |
On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote: > > A performance regression is observed since linux v4.19 when we do aio > test using fio with iodepth 128 on overlayfs. And we found that queue > depth of the device is always 1 which is unexpected. > > After investigation, it is found that commit 16914e6fc7 > (“ovl: add ovl_read_iter()”) and commit 2a92e07edc > (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit > requests to real filesystem. Async IOs are converted to sync IOs here > and cause performance regression. > > So implement async IO for stacked reading and writing. > > Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com> > --- > fs/overlayfs/file.c | 97 +++++++++++++++++++++++++++++++++++++++++------- > fs/overlayfs/overlayfs.h | 2 + > fs/overlayfs/super.c | 12 +++++- > 3 files changed, 95 insertions(+), 16 deletions(-) > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c > index e235a63..07d94e7 100644 > --- a/fs/overlayfs/file.c > +++ b/fs/overlayfs/file.c > @@ -11,6 +11,14 @@ > #include <linux/uaccess.h> > #include "overlayfs.h" > > +struct ovl_aio_req { > + struct kiocb iocb; > + struct kiocb *orig_iocb; > + struct fd fd; > +}; > + > +static struct kmem_cache *ovl_aio_request_cachep; > + > static char ovl_whatisit(struct inode *inode, struct inode *realinode) > { > if (realinode != ovl_inode_upper(inode)) > @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) > return flags; > } > > +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) > +{ > + struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); > + struct kiocb *orig_iocb = aio_req->orig_iocb; > + > + if (iocb->ki_flags & IOCB_WRITE) > + file_end_write(iocb->ki_filp); > + > + orig_iocb->ki_pos = iocb->ki_pos; > + orig_iocb->ki_complete(orig_iocb, res, res2); > + > + fdput(aio_req->fd); > + kmem_cache_free(ovl_aio_request_cachep, aio_req); > +} > + > static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) > { > struct file *file = iocb->ki_filp; > @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) > return ret; > > old_cred = ovl_override_creds(file_inode(file)->i_sb); > - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, > - ovl_iocb_to_rwf(iocb)); > + if (is_sync_kiocb(iocb)) { > + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, > + ovl_iocb_to_rwf(iocb)); > + ovl_file_accessed(file); > + fdput(real); > + } else { > + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, > + GFP_NOFS); > + aio_req->fd = real; > + aio_req->orig_iocb = iocb; > + kiocb_clone(&aio_req->iocb, iocb, real.file); > + aio_req->iocb.ki_complete = ovl_aio_rw_complete; > + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); > + ovl_file_accessed(file); That should be done in completion/error > + if (ret != -EIOCBQUEUED) { > + iocb->ki_pos = aio_req->iocb.ki_pos; > + fdput(real); > + kmem_cache_free(ovl_aio_request_cachep, aio_req); > + } Suggest cleanup helper for completion/error > + } > revert_creds(old_cred); > > - ovl_file_accessed(file); > - > - fdput(real); > - > return ret; > } > > @@ -275,16 +312,32 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) > > old_cred = ovl_override_creds(file_inode(file)->i_sb); > file_start_write(real.file); > - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, > - ovl_iocb_to_rwf(iocb)); > - file_end_write(real.file); > + if (is_sync_kiocb(iocb)) { > + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, > + ovl_iocb_to_rwf(iocb)); > + file_end_write(real.file); > + /* Update size */ > + ovl_copyattr(ovl_inode_real(inode), inode); > + fdput(real); > + } else { > + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, > + GFP_NOFS); > + aio_req->fd = real; > + aio_req->orig_iocb = iocb; > + kiocb_clone(&aio_req->iocb, iocb, real.file); > + aio_req->iocb.ki_complete = ovl_aio_rw_complete; > + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); > + /* Update size */ > + ovl_copyattr(ovl_inode_real(inode), inode); That should be in completion/error Thanks, Amir.
Hi Amir, On 2019/11/19 下午12:22, Amir Goldstein wrote: > On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote: >> >> A performance regression is observed since linux v4.19 when we do aio >> test using fio with iodepth 128 on overlayfs. And we found that queue >> depth of the device is always 1 which is unexpected. >> >> After investigation, it is found that commit 16914e6fc7 >> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc >> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit >> requests to real filesystem. Async IOs are converted to sync IOs here >> and cause performance regression. >> >> So implement async IO for stacked reading and writing. >> >> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com> >> --- >> fs/overlayfs/file.c | 97 +++++++++++++++++++++++++++++++++++++++++------- >> fs/overlayfs/overlayfs.h | 2 + >> fs/overlayfs/super.c | 12 +++++- >> 3 files changed, 95 insertions(+), 16 deletions(-) >> >> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c >> index e235a63..07d94e7 100644 >> --- a/fs/overlayfs/file.c >> +++ b/fs/overlayfs/file.c >> @@ -11,6 +11,14 @@ >> #include <linux/uaccess.h> >> #include "overlayfs.h" >> >> +struct ovl_aio_req { >> + struct kiocb iocb; >> + struct kiocb *orig_iocb; >> + struct fd fd; >> +}; >> + >> +static struct kmem_cache *ovl_aio_request_cachep; >> + >> static char ovl_whatisit(struct inode *inode, struct inode *realinode) >> { >> if (realinode != ovl_inode_upper(inode)) >> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) >> return flags; >> } >> >> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) >> +{ >> + struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); >> + struct kiocb *orig_iocb = aio_req->orig_iocb; >> + >> + if (iocb->ki_flags & IOCB_WRITE) >> + file_end_write(iocb->ki_filp); >> + >> + orig_iocb->ki_pos = iocb->ki_pos; >> + orig_iocb->ki_complete(orig_iocb, res, res2); >> + >> + fdput(aio_req->fd); >> + kmem_cache_free(ovl_aio_request_cachep, aio_req); >> +} >> + >> static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) >> { >> struct file *file = iocb->ki_filp; >> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) >> return ret; >> >> old_cred = ovl_override_creds(file_inode(file)->i_sb); >> - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, >> - ovl_iocb_to_rwf(iocb)); >> + if (is_sync_kiocb(iocb)) { >> + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, >> + ovl_iocb_to_rwf(iocb)); >> + ovl_file_accessed(file); >> + fdput(real); >> + } else { >> + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, >> + GFP_NOFS); >> + aio_req->fd = real; >> + aio_req->orig_iocb = iocb; >> + kiocb_clone(&aio_req->iocb, iocb, real.file); >> + aio_req->iocb.ki_complete = ovl_aio_rw_complete; >> + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); >> + ovl_file_accessed(file); > > That should be done in completion/error > Refer to function generic_file_read_iter(), in direct IO path, file_accessed() is done before IO submission, so I think ovl_file_accessed() should be done here no matter completion/error or IO is queued. >> + if (ret != -EIOCBQUEUED) { >> + iocb->ki_pos = aio_req->iocb.ki_pos; >> + fdput(real); >> + kmem_cache_free(ovl_aio_request_cachep, aio_req); >> + } > > Suggest cleanup helper for completion/error > Yes, that will be more clearly. I will add a cleanup helper in version 2. > >> + } >> revert_creds(old_cred); >> >> - ovl_file_accessed(file); >> - >> - fdput(real); >> - >> return ret; >> } >> >> @@ -275,16 +312,32 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) >> >> old_cred = ovl_override_creds(file_inode(file)->i_sb); >> file_start_write(real.file); >> - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, >> - ovl_iocb_to_rwf(iocb)); >> - file_end_write(real.file); >> + if (is_sync_kiocb(iocb)) { >> + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, >> + ovl_iocb_to_rwf(iocb)); >> + file_end_write(real.file); >> + /* Update size */ >> + ovl_copyattr(ovl_inode_real(inode), inode); >> + fdput(real); >> + } else { >> + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, >> + GFP_NOFS); >> + aio_req->fd = real; >> + aio_req->orig_iocb = iocb; >> + kiocb_clone(&aio_req->iocb, iocb, real.file); >> + aio_req->iocb.ki_complete = ovl_aio_rw_complete; >> + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); >> + /* Update size */ >> + ovl_copyattr(ovl_inode_real(inode), inode); > > That should be in completion/error > Yes, I will move it to the newly added cleanup helper. Thanks, Jiufei > Thanks, > Amir. >
On Tue, Nov 19, 2019 at 10:37 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote: > > Hi Amir, > > On 2019/11/19 下午12:22, Amir Goldstein wrote: > > On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote: > >> > >> A performance regression is observed since linux v4.19 when we do aio > >> test using fio with iodepth 128 on overlayfs. And we found that queue > >> depth of the device is always 1 which is unexpected. > >> > >> After investigation, it is found that commit 16914e6fc7 > >> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc > >> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit > >> requests to real filesystem. Async IOs are converted to sync IOs here > >> and cause performance regression. > >> > >> So implement async IO for stacked reading and writing. > >> > >> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com> > >> --- > >> fs/overlayfs/file.c | 97 +++++++++++++++++++++++++++++++++++++++++------- > >> fs/overlayfs/overlayfs.h | 2 + > >> fs/overlayfs/super.c | 12 +++++- > >> 3 files changed, 95 insertions(+), 16 deletions(-) > >> > >> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c > >> index e235a63..07d94e7 100644 > >> --- a/fs/overlayfs/file.c > >> +++ b/fs/overlayfs/file.c > >> @@ -11,6 +11,14 @@ > >> #include <linux/uaccess.h> > >> #include "overlayfs.h" > >> > >> +struct ovl_aio_req { > >> + struct kiocb iocb; > >> + struct kiocb *orig_iocb; > >> + struct fd fd; > >> +}; > >> + > >> +static struct kmem_cache *ovl_aio_request_cachep; > >> + > >> static char ovl_whatisit(struct inode *inode, struct inode *realinode) > >> { > >> if (realinode != ovl_inode_upper(inode)) > >> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) > >> return flags; > >> } > >> > >> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) > >> +{ > >> + struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); > >> + struct kiocb *orig_iocb = aio_req->orig_iocb; > >> + > >> + if (iocb->ki_flags & IOCB_WRITE) > >> + file_end_write(iocb->ki_filp); > >> + > >> + orig_iocb->ki_pos = iocb->ki_pos; > >> + orig_iocb->ki_complete(orig_iocb, res, res2); > >> + > >> + fdput(aio_req->fd); > >> + kmem_cache_free(ovl_aio_request_cachep, aio_req); > >> +} > >> + > >> static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) > >> { > >> struct file *file = iocb->ki_filp; > >> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) > >> return ret; > >> > >> old_cred = ovl_override_creds(file_inode(file)->i_sb); > >> - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, > >> - ovl_iocb_to_rwf(iocb)); > >> + if (is_sync_kiocb(iocb)) { > >> + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, > >> + ovl_iocb_to_rwf(iocb)); > >> + ovl_file_accessed(file); > >> + fdput(real); > >> + } else { > >> + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, > >> + GFP_NOFS); > >> + aio_req->fd = real; > >> + aio_req->orig_iocb = iocb; > >> + kiocb_clone(&aio_req->iocb, iocb, real.file); > >> + aio_req->iocb.ki_complete = ovl_aio_rw_complete; > >> + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); > >> + ovl_file_accessed(file); > > > > That should be done in completion/error > > > > Refer to function generic_file_read_iter(), in direct IO path, > file_accessed() is done before IO submission, so I think ovl_file_accessed() > should be done here no matter completion/error or IO is queued. Mmm, it doesn't matter much if atime is updated before or after, but ovl_file_accessed() does not only update atime, it also copies ctime which could have been modified as a result of the io, so I think it is safer to put it in the cleanup hook. Thanks, Amir.
Hi Amir, On 2019/11/19 下午5:38, Amir Goldstein wrote: > On Tue, Nov 19, 2019 at 10:37 AM Jiufei Xue > <jiufei.xue@linux.alibaba.com> wrote: >> >> Hi Amir, >> >> On 2019/11/19 下午12:22, Amir Goldstein wrote: >>> On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote: >>>> >>>> A performance regression is observed since linux v4.19 when we do aio >>>> test using fio with iodepth 128 on overlayfs. And we found that queue >>>> depth of the device is always 1 which is unexpected. >>>> >>>> After investigation, it is found that commit 16914e6fc7 >>>> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc >>>> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit >>>> requests to real filesystem. Async IOs are converted to sync IOs here >>>> and cause performance regression. >>>> >>>> So implement async IO for stacked reading and writing. >>>> >>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com> >>>> --- >>>> fs/overlayfs/file.c | 97 +++++++++++++++++++++++++++++++++++++++++------- >>>> fs/overlayfs/overlayfs.h | 2 + >>>> fs/overlayfs/super.c | 12 +++++- >>>> 3 files changed, 95 insertions(+), 16 deletions(-) >>>> >>>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c >>>> index e235a63..07d94e7 100644 >>>> --- a/fs/overlayfs/file.c >>>> +++ b/fs/overlayfs/file.c >>>> @@ -11,6 +11,14 @@ >>>> #include <linux/uaccess.h> >>>> #include "overlayfs.h" >>>> >>>> +struct ovl_aio_req { >>>> + struct kiocb iocb; >>>> + struct kiocb *orig_iocb; >>>> + struct fd fd; >>>> +}; >>>> + >>>> +static struct kmem_cache *ovl_aio_request_cachep; >>>> + >>>> static char ovl_whatisit(struct inode *inode, struct inode *realinode) >>>> { >>>> if (realinode != ovl_inode_upper(inode)) >>>> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) >>>> return flags; >>>> } >>>> >>>> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) >>>> +{ >>>> + struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); >>>> + struct kiocb *orig_iocb = aio_req->orig_iocb; >>>> + >>>> + if (iocb->ki_flags & IOCB_WRITE) >>>> + file_end_write(iocb->ki_filp); >>>> + >>>> + orig_iocb->ki_pos = iocb->ki_pos; >>>> + orig_iocb->ki_complete(orig_iocb, res, res2); >>>> + >>>> + fdput(aio_req->fd); >>>> + kmem_cache_free(ovl_aio_request_cachep, aio_req); >>>> +} >>>> + >>>> static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) >>>> { >>>> struct file *file = iocb->ki_filp; >>>> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) >>>> return ret; >>>> >>>> old_cred = ovl_override_creds(file_inode(file)->i_sb); >>>> - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, >>>> - ovl_iocb_to_rwf(iocb)); >>>> + if (is_sync_kiocb(iocb)) { >>>> + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, >>>> + ovl_iocb_to_rwf(iocb)); >>>> + ovl_file_accessed(file); >>>> + fdput(real); >>>> + } else { >>>> + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, >>>> + GFP_NOFS); >>>> + aio_req->fd = real; >>>> + aio_req->orig_iocb = iocb; >>>> + kiocb_clone(&aio_req->iocb, iocb, real.file); >>>> + aio_req->iocb.ki_complete = ovl_aio_rw_complete; >>>> + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); >>>> + ovl_file_accessed(file); >>> >>> That should be done in completion/error >>> >> >> Refer to function generic_file_read_iter(), in direct IO path, >> file_accessed() is done before IO submission, so I think ovl_file_accessed() >> should be done here no matter completion/error or IO is queued. > > Mmm, it doesn't matter much if atime is updated before or after, > but ovl_file_accessed() does not only update atime, it also copies > ctime which could have been modified as a result of the io, so > I think it is safer to put it in the cleanup hook. > Can you give a more detailed description that a read op will modify ctime as a result of the io? I found that it will trigger BUG_ON(irqs_disabled()) while calling ovl_file_accessed() on async IO return path. The calltrace is pasted below: ovl_file_accessed -> touch_atime -> ovl_update_time -> generic_update_time -> __mark_inode_dirty -> ext4_dirty_inode -> __ext4_get_inode_loc -> __find_get_block -> lookup_bh_lru -> check_irqs_on So I need more detail to find how to fix this issue. Thanks, Jiufei. > Thanks, > Amir. >
> >> > >> Refer to function generic_file_read_iter(), in direct IO path, > >> file_accessed() is done before IO submission, so I think ovl_file_accessed() > >> should be done here no matter completion/error or IO is queued. > > > > Mmm, it doesn't matter much if atime is updated before or after, > > but ovl_file_accessed() does not only update atime, it also copies > > ctime which could have been modified as a result of the io, so > > I think it is safer to put it in the cleanup hook. > > > > Can you give a more detailed description that a read op will modify > ctime as a result of the io? > > I found that it will trigger BUG_ON(irqs_disabled()) while > calling ovl_file_accessed() on async IO return path. The calltrace > is pasted below: > > ovl_file_accessed > -> touch_atime > -> ovl_update_time > -> generic_update_time > -> __mark_inode_dirty > -> ext4_dirty_inode > -> __ext4_get_inode_loc > -> __find_get_block > -> lookup_bh_lru > -> check_irqs_on > > So I need more detail to find how to fix this issue. > It's not important. Please ignore. I din't know there was an issue with placing touch_atime() in completion context. Thanks, Amir.
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index e235a63..07d94e7 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -11,6 +11,14 @@ #include <linux/uaccess.h> #include "overlayfs.h" +struct ovl_aio_req { + struct kiocb iocb; + struct kiocb *orig_iocb; + struct fd fd; +}; + +static struct kmem_cache *ovl_aio_request_cachep; + static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) return flags; } +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) +{ + struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); + struct kiocb *orig_iocb = aio_req->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + file_end_write(iocb->ki_filp); + + orig_iocb->ki_pos = iocb->ki_pos; + orig_iocb->ki_complete(orig_iocb, res, res2); + + fdput(aio_req->fd); + kmem_cache_free(ovl_aio_request_cachep, aio_req); +} + static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + ovl_file_accessed(file); + fdput(real); + } else { + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, + GFP_NOFS); + aio_req->fd = real; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + ovl_file_accessed(file); + if (ret != -EIOCBQUEUED) { + iocb->ki_pos = aio_req->iocb.ki_pos; + fdput(real); + kmem_cache_free(ovl_aio_request_cachep, aio_req); + } + } revert_creds(old_cred); - ovl_file_accessed(file); - - fdput(real); - return ret; } @@ -275,16 +312,32 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); - file_end_write(real.file); + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + fdput(real); + } else { + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, + GFP_NOFS); + aio_req->fd = real; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + if (ret != -EIOCBQUEUED) { + iocb->ki_pos = aio_req->iocb.ki_pos; + file_end_write(real.file); + fdput(real); + kmem_cache_free(ovl_aio_request_cachep, aio_req); + } + } revert_creds(old_cred); - /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); - - fdput(real); - out_unlock: inode_unlock(inode); @@ -651,3 +704,19 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; + +int __init ovl_init_aio_request_cache(void) +{ + ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", + sizeof(struct ovl_aio_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ovl_aio_request_cachep) + return -ENOMEM; + + return 0; +} + +void ovl_exit_aio_request_cache(void) +{ + kmem_cache_destroy(ovl_aio_request_cachep); +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf..afd1631 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -416,6 +416,8 @@ struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, /* file.c */ extern const struct file_operations ovl_file_operations; +int __init ovl_init_aio_request_cache(void); +void ovl_exit_aio_request_cache(void); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index afbcb11..83cef1f 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1739,9 +1739,17 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; + err = ovl_init_aio_request_cache(); + if (err) { + kmem_cache_destroy(ovl_inode_cachep); + return -ENOMEM; + } + err = register_filesystem(&ovl_fs_type); - if (err) + if (err) { kmem_cache_destroy(ovl_inode_cachep); + ovl_exit_aio_request_cache(); + } return err; } @@ -1756,7 +1764,7 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - + ovl_exit_aio_request_cache(); } module_init(ovl_init);
A performance regression is observed since linux v4.19 when we do aio test using fio with iodepth 128 on overlayfs. And we found that queue depth of the device is always 1 which is unexpected. After investigation, it is found that commit 16914e6fc7 (“ovl: add ovl_read_iter()”) and commit 2a92e07edc (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit requests to real filesystem. Async IOs are converted to sync IOs here and cause performance regression. So implement async IO for stacked reading and writing. Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com> --- fs/overlayfs/file.c | 97 +++++++++++++++++++++++++++++++++++++++++------- fs/overlayfs/overlayfs.h | 2 + fs/overlayfs/super.c | 12 +++++- 3 files changed, 95 insertions(+), 16 deletions(-)