
[2/2] ovl: implement async IO routines

Message ID 1574129643-14664-3-git-send-email-jiufei.xue@linux.alibaba.com (mailing list archive)
State New, archived
Series ovl: implement async IO routines

Commit Message

Jiufei Xue Nov. 19, 2019, 2:14 a.m. UTC
A performance regression has been observed since Linux v4.19 when
running an AIO test on overlayfs with fio at iodepth 128: the queue
depth of the underlying device stays at 1, which is unexpected.

After investigation, it was found that commit 16914e6fc7
("ovl: add ovl_read_iter()") and commit 2a92e07edc
("ovl: add ovl_write_iter()") use do_iter_readv_writev() to submit
requests to the real filesystem. Async IOs are converted to sync IOs
there, which causes the performance regression.

So implement async IO for stacked reading and writing.

Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
---
 fs/overlayfs/file.c      | 97 +++++++++++++++++++++++++++++++++++++++++-------
 fs/overlayfs/overlayfs.h |  2 +
 fs/overlayfs/super.c     | 12 +++++-
 3 files changed, 95 insertions(+), 16 deletions(-)
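
For reference, a fio job along the following lines exercises the reported
setup; apart from ioengine=libaio and iodepth=128, every parameter below
(including the mount point) is an assumption, not the exact job from the
report:

  [aio-qd-test]
  ; hypothetical reproduction job, not the original one
  ; assumed overlayfs mount point
  directory=/mnt/overlay
  ioengine=libaio
  direct=1
  rw=randread
  bs=4k
  size=1G
  iodepth=128
  numjobs=1

Before this series, iostat on the backing device shows an effective queue
depth of about 1 while such a job runs; with async IO passed through to
the real filesystem, it is expected to rise well above 1.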

Comments

Amir Goldstein Nov. 19, 2019, 4:22 a.m. UTC | #1
On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote:
>
> A performance regression is observed since linux v4.19 when we do aio
> test using fio with iodepth 128 on overlayfs. And we found that queue
> depth of the device is always 1 which is unexpected.
>
> After investigation, it is found that commit 16914e6fc7
> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc
> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit
> requests to real filesystem. Async IOs are converted to sync IOs here
> and cause performance regression.
>
> So implement async IO for stacked reading and writing.
>
> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
> ---
>  fs/overlayfs/file.c      | 97 +++++++++++++++++++++++++++++++++++++++++-------
>  fs/overlayfs/overlayfs.h |  2 +
>  fs/overlayfs/super.c     | 12 +++++-
>  3 files changed, 95 insertions(+), 16 deletions(-)
>
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index e235a63..07d94e7 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -11,6 +11,14 @@
>  #include <linux/uaccess.h>
>  #include "overlayfs.h"
>
> +struct ovl_aio_req {
> +       struct kiocb iocb;
> +       struct kiocb *orig_iocb;
> +       struct fd fd;
> +};
> +
> +static struct kmem_cache *ovl_aio_request_cachep;
> +
>  static char ovl_whatisit(struct inode *inode, struct inode *realinode)
>  {
>         if (realinode != ovl_inode_upper(inode))
> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
>         return flags;
>  }
>
> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
> +{
> +       struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb);
> +       struct kiocb *orig_iocb = aio_req->orig_iocb;
> +
> +       if (iocb->ki_flags & IOCB_WRITE)
> +               file_end_write(iocb->ki_filp);
> +
> +       orig_iocb->ki_pos = iocb->ki_pos;
> +       orig_iocb->ki_complete(orig_iocb, res, res2);
> +
> +       fdput(aio_req->fd);
> +       kmem_cache_free(ovl_aio_request_cachep, aio_req);
> +}
> +
>  static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>  {
>         struct file *file = iocb->ki_filp;
> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>                 return ret;
>
>         old_cred = ovl_override_creds(file_inode(file)->i_sb);
> -       ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
> -                           ovl_iocb_to_rwf(iocb));
> +       if (is_sync_kiocb(iocb)) {
> +               ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
> +                                   ovl_iocb_to_rwf(iocb));
> +               ovl_file_accessed(file);
> +               fdput(real);
> +       } else {
> +               struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
> +                                                              GFP_NOFS);
> +               aio_req->fd = real;
> +               aio_req->orig_iocb = iocb;
> +               kiocb_clone(&aio_req->iocb, iocb, real.file);
> +               aio_req->iocb.ki_complete = ovl_aio_rw_complete;
> +               ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
> +               ovl_file_accessed(file);

That should be done in completion/error

> +               if (ret != -EIOCBQUEUED) {
> +                       iocb->ki_pos = aio_req->iocb.ki_pos;
> +                       fdput(real);
> +                       kmem_cache_free(ovl_aio_request_cachep, aio_req);
> +               }

Suggest cleanup helper for completion/error


> +       }
>         revert_creds(old_cred);
>
> -       ovl_file_accessed(file);
> -
> -       fdput(real);
> -
>         return ret;
>  }
>
> @@ -275,16 +312,32 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
>
>         old_cred = ovl_override_creds(file_inode(file)->i_sb);
>         file_start_write(real.file);
> -       ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
> -                            ovl_iocb_to_rwf(iocb));
> -       file_end_write(real.file);
> +       if (is_sync_kiocb(iocb)) {
> +               ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
> +                                    ovl_iocb_to_rwf(iocb));
> +               file_end_write(real.file);
> +               /* Update size */
> +               ovl_copyattr(ovl_inode_real(inode), inode);
> +               fdput(real);
> +       } else {
> +               struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
> +                                                              GFP_NOFS);
> +               aio_req->fd = real;
> +               aio_req->orig_iocb = iocb;
> +               kiocb_clone(&aio_req->iocb, iocb, real.file);
> +               aio_req->iocb.ki_complete = ovl_aio_rw_complete;
> +               ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
> +               /* Update size */
> +               ovl_copyattr(ovl_inode_real(inode), inode);

That should be in completion/error

Thanks,
Amir.
Jiufei Xue Nov. 19, 2019, 8:37 a.m. UTC | #2
Hi Amir,

On 2019/11/19 12:22 PM, Amir Goldstein wrote:
> On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote:
>>
>> A performance regression is observed since linux v4.19 when we do aio
>> test using fio with iodepth 128 on overlayfs. And we found that queue
>> depth of the device is always 1 which is unexpected.
>>
>> After investigation, it is found that commit 16914e6fc7
>> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc
>> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit
>> requests to real filesystem. Async IOs are converted to sync IOs here
>> and cause performance regression.
>>
>> So implement async IO for stacked reading and writing.
>>
>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>> ---
>>  fs/overlayfs/file.c      | 97 +++++++++++++++++++++++++++++++++++++++++-------
>>  fs/overlayfs/overlayfs.h |  2 +
>>  fs/overlayfs/super.c     | 12 +++++-
>>  3 files changed, 95 insertions(+), 16 deletions(-)
>>
>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>> index e235a63..07d94e7 100644
>> --- a/fs/overlayfs/file.c
>> +++ b/fs/overlayfs/file.c
>> @@ -11,6 +11,14 @@
>>  #include <linux/uaccess.h>
>>  #include "overlayfs.h"
>>
>> +struct ovl_aio_req {
>> +       struct kiocb iocb;
>> +       struct kiocb *orig_iocb;
>> +       struct fd fd;
>> +};
>> +
>> +static struct kmem_cache *ovl_aio_request_cachep;
>> +
>>  static char ovl_whatisit(struct inode *inode, struct inode *realinode)
>>  {
>>         if (realinode != ovl_inode_upper(inode))
>> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
>>         return flags;
>>  }
>>
>> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
>> +{
>> +       struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb);
>> +       struct kiocb *orig_iocb = aio_req->orig_iocb;
>> +
>> +       if (iocb->ki_flags & IOCB_WRITE)
>> +               file_end_write(iocb->ki_filp);
>> +
>> +       orig_iocb->ki_pos = iocb->ki_pos;
>> +       orig_iocb->ki_complete(orig_iocb, res, res2);
>> +
>> +       fdput(aio_req->fd);
>> +       kmem_cache_free(ovl_aio_request_cachep, aio_req);
>> +}
>> +
>>  static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>>  {
>>         struct file *file = iocb->ki_filp;
>> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>>                 return ret;
>>
>>         old_cred = ovl_override_creds(file_inode(file)->i_sb);
>> -       ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
>> -                           ovl_iocb_to_rwf(iocb));
>> +       if (is_sync_kiocb(iocb)) {
>> +               ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
>> +                                   ovl_iocb_to_rwf(iocb));
>> +               ovl_file_accessed(file);
>> +               fdput(real);
>> +       } else {
>> +               struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
>> +                                                              GFP_NOFS);
>> +               aio_req->fd = real;
>> +               aio_req->orig_iocb = iocb;
>> +               kiocb_clone(&aio_req->iocb, iocb, real.file);
>> +               aio_req->iocb.ki_complete = ovl_aio_rw_complete;
>> +               ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
>> +               ovl_file_accessed(file);
> 
> That should be done in completion/error
>

Referring to generic_file_read_iter(): in the direct IO path,
file_accessed() is called before IO submission, so I think
ovl_file_accessed() should be done here regardless of whether the IO
completes, fails, or is queued.

>> +               if (ret != -EIOCBQUEUED) {
>> +                       iocb->ki_pos = aio_req->iocb.ki_pos;
>> +                       fdput(real);
>> +                       kmem_cache_free(ovl_aio_request_cachep, aio_req);
>> +               }
> 
> Suggest cleanup helper for completion/error
> 
Yes, that will be clearer. I will add a cleanup helper in version 2.

> 
>> +       }
>>         revert_creds(old_cred);
>>
>> -       ovl_file_accessed(file);
>> -
>> -       fdput(real);
>> -
>>         return ret;
>>  }
>>
>> @@ -275,16 +312,32 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
>>
>>         old_cred = ovl_override_creds(file_inode(file)->i_sb);
>>         file_start_write(real.file);
>> -       ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
>> -                            ovl_iocb_to_rwf(iocb));
>> -       file_end_write(real.file);
>> +       if (is_sync_kiocb(iocb)) {
>> +               ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
>> +                                    ovl_iocb_to_rwf(iocb));
>> +               file_end_write(real.file);
>> +               /* Update size */
>> +               ovl_copyattr(ovl_inode_real(inode), inode);
>> +               fdput(real);
>> +       } else {
>> +               struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
>> +                                                              GFP_NOFS);
>> +               aio_req->fd = real;
>> +               aio_req->orig_iocb = iocb;
>> +               kiocb_clone(&aio_req->iocb, iocb, real.file);
>> +               aio_req->iocb.ki_complete = ovl_aio_rw_complete;
>> +               ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
>> +               /* Update size */
>> +               ovl_copyattr(ovl_inode_real(inode), inode);
> 
> That should be in completion/error
> 
Yes, I will move it to the newly added cleanup helper.
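
Roughly, the helper could look like the sketch below. The name and the
exact details are illustrative only and may differ in v2; the idea is
that it is shared by ovl_aio_rw_complete() and the submission error
path, and takes over file_end_write() and ovl_copyattr() for writes:

static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
{
        struct kiocb *iocb = &aio_req->iocb;
        struct kiocb *orig_iocb = aio_req->orig_iocb;

        /* illustrative sketch only, not the actual v2 code */
        if (iocb->ki_flags & IOCB_WRITE) {
                struct inode *inode = file_inode(orig_iocb->ki_filp);

                /* the write was submitted against the real file */
                file_end_write(iocb->ki_filp);
                /* Update size */
                ovl_copyattr(ovl_inode_real(inode), inode);
        }

        orig_iocb->ki_pos = iocb->ki_pos;
        fdput(aio_req->fd);
        kmem_cache_free(ovl_aio_request_cachep, aio_req);
}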

Thanks,
Jiufei

> Thanks,
> Amir.
>
Amir Goldstein Nov. 19, 2019, 9:38 a.m. UTC | #3
On Tue, Nov 19, 2019 at 10:37 AM Jiufei Xue
<jiufei.xue@linux.alibaba.com> wrote:
>
> Hi Amir,
>
> On 2019/11/19 12:22 PM, Amir Goldstein wrote:
> > On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote:
> >>
> >> A performance regression is observed since linux v4.19 when we do aio
> >> test using fio with iodepth 128 on overlayfs. And we found that queue
> >> depth of the device is always 1 which is unexpected.
> >>
> >> After investigation, it is found that commit 16914e6fc7
> >> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc
> >> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit
> >> requests to real filesystem. Async IOs are converted to sync IOs here
> >> and cause performance regression.
> >>
> >> So implement async IO for stacked reading and writing.
> >>
> >> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
> >> ---
> >>  fs/overlayfs/file.c      | 97 +++++++++++++++++++++++++++++++++++++++++-------
> >>  fs/overlayfs/overlayfs.h |  2 +
> >>  fs/overlayfs/super.c     | 12 +++++-
> >>  3 files changed, 95 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> >> index e235a63..07d94e7 100644
> >> --- a/fs/overlayfs/file.c
> >> +++ b/fs/overlayfs/file.c
> >> @@ -11,6 +11,14 @@
> >>  #include <linux/uaccess.h>
> >>  #include "overlayfs.h"
> >>
> >> +struct ovl_aio_req {
> >> +       struct kiocb iocb;
> >> +       struct kiocb *orig_iocb;
> >> +       struct fd fd;
> >> +};
> >> +
> >> +static struct kmem_cache *ovl_aio_request_cachep;
> >> +
> >>  static char ovl_whatisit(struct inode *inode, struct inode *realinode)
> >>  {
> >>         if (realinode != ovl_inode_upper(inode))
> >> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
> >>         return flags;
> >>  }
> >>
> >> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
> >> +{
> >> +       struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb);
> >> +       struct kiocb *orig_iocb = aio_req->orig_iocb;
> >> +
> >> +       if (iocb->ki_flags & IOCB_WRITE)
> >> +               file_end_write(iocb->ki_filp);
> >> +
> >> +       orig_iocb->ki_pos = iocb->ki_pos;
> >> +       orig_iocb->ki_complete(orig_iocb, res, res2);
> >> +
> >> +       fdput(aio_req->fd);
> >> +       kmem_cache_free(ovl_aio_request_cachep, aio_req);
> >> +}
> >> +
> >>  static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
> >>  {
> >>         struct file *file = iocb->ki_filp;
> >> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
> >>                 return ret;
> >>
> >>         old_cred = ovl_override_creds(file_inode(file)->i_sb);
> >> -       ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
> >> -                           ovl_iocb_to_rwf(iocb));
> >> +       if (is_sync_kiocb(iocb)) {
> >> +               ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
> >> +                                   ovl_iocb_to_rwf(iocb));
> >> +               ovl_file_accessed(file);
> >> +               fdput(real);
> >> +       } else {
> >> +               struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
> >> +                                                              GFP_NOFS);
> >> +               aio_req->fd = real;
> >> +               aio_req->orig_iocb = iocb;
> >> +               kiocb_clone(&aio_req->iocb, iocb, real.file);
> >> +               aio_req->iocb.ki_complete = ovl_aio_rw_complete;
> >> +               ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
> >> +               ovl_file_accessed(file);
> >
> > That should be done in completion/error
> >
>
> Refer to function generic_file_read_iter(), in direct IO path,
> file_accessed() is done before IO submission, so I think ovl_file_accessed()
> should be done here no matter completion/error or IO is queued.

Mmm, it doesn't matter much whether atime is updated before or after,
but ovl_file_accessed() does not only update atime, it also copies
ctime, which could have been modified as a result of the IO, so
I think it is safer to put it in the cleanup hook.

Thanks,
Amir.
Jiufei Xue Nov. 19, 2019, 11:22 a.m. UTC | #4
Hi Amir,
On 2019/11/19 5:38 PM, Amir Goldstein wrote:
> On Tue, Nov 19, 2019 at 10:37 AM Jiufei Xue
> <jiufei.xue@linux.alibaba.com> wrote:
>>
>> Hi Amir,
>>
>> On 2019/11/19 12:22 PM, Amir Goldstein wrote:
>>> On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@linux.alibaba.com> wrote:
>>>>
>>>> A performance regression is observed since linux v4.19 when we do aio
>>>> test using fio with iodepth 128 on overlayfs. And we found that queue
>>>> depth of the device is always 1 which is unexpected.
>>>>
>>>> After investigation, it is found that commit 16914e6fc7
>>>> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc
>>>> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit
>>>> requests to real filesystem. Async IOs are converted to sync IOs here
>>>> and cause performance regression.
>>>>
>>>> So implement async IO for stacked reading and writing.
>>>>
>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>> ---
>>>>  fs/overlayfs/file.c      | 97 +++++++++++++++++++++++++++++++++++++++++-------
>>>>  fs/overlayfs/overlayfs.h |  2 +
>>>>  fs/overlayfs/super.c     | 12 +++++-
>>>>  3 files changed, 95 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>>>> index e235a63..07d94e7 100644
>>>> --- a/fs/overlayfs/file.c
>>>> +++ b/fs/overlayfs/file.c
>>>> @@ -11,6 +11,14 @@
>>>>  #include <linux/uaccess.h>
>>>>  #include "overlayfs.h"
>>>>
>>>> +struct ovl_aio_req {
>>>> +       struct kiocb iocb;
>>>> +       struct kiocb *orig_iocb;
>>>> +       struct fd fd;
>>>> +};
>>>> +
>>>> +static struct kmem_cache *ovl_aio_request_cachep;
>>>> +
>>>>  static char ovl_whatisit(struct inode *inode, struct inode *realinode)
>>>>  {
>>>>         if (realinode != ovl_inode_upper(inode))
>>>> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
>>>>         return flags;
>>>>  }
>>>>
>>>> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
>>>> +{
>>>> +       struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb);
>>>> +       struct kiocb *orig_iocb = aio_req->orig_iocb;
>>>> +
>>>> +       if (iocb->ki_flags & IOCB_WRITE)
>>>> +               file_end_write(iocb->ki_filp);
>>>> +
>>>> +       orig_iocb->ki_pos = iocb->ki_pos;
>>>> +       orig_iocb->ki_complete(orig_iocb, res, res2);
>>>> +
>>>> +       fdput(aio_req->fd);
>>>> +       kmem_cache_free(ovl_aio_request_cachep, aio_req);
>>>> +}
>>>> +
>>>>  static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>>>>  {
>>>>         struct file *file = iocb->ki_filp;
>>>> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>>>>                 return ret;
>>>>
>>>>         old_cred = ovl_override_creds(file_inode(file)->i_sb);
>>>> -       ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
>>>> -                           ovl_iocb_to_rwf(iocb));
>>>> +       if (is_sync_kiocb(iocb)) {
>>>> +               ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
>>>> +                                   ovl_iocb_to_rwf(iocb));
>>>> +               ovl_file_accessed(file);
>>>> +               fdput(real);
>>>> +       } else {
>>>> +               struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
>>>> +                                                              GFP_NOFS);
>>>> +               aio_req->fd = real;
>>>> +               aio_req->orig_iocb = iocb;
>>>> +               kiocb_clone(&aio_req->iocb, iocb, real.file);
>>>> +               aio_req->iocb.ki_complete = ovl_aio_rw_complete;
>>>> +               ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
>>>> +               ovl_file_accessed(file);
>>>
>>> That should be done in completion/error
>>>
>>
>> Refer to function generic_file_read_iter(), in direct IO path,
>> file_accessed() is done before IO submission, so I think ovl_file_accessed()
>> should be done here no matter completion/error or IO is queued.
> 
> Mmm, it doesn't matter much if atime is updated before or after,
> but ovl_file_accessed() does not only update atime, it also copies
> ctime which could have been modified as a result of the io, so
> I think it is safer to put it in the cleanup hook.
>

Can you describe in more detail how a read op would modify ctime as
a result of the IO?

I found that calling ovl_file_accessed() on the async IO completion
path triggers BUG_ON(irqs_disabled()). The call trace is pasted
below:

ovl_file_accessed
  -> touch_atime
    -> ovl_update_time
      -> generic_update_time
        -> __mark_inode_dirty
          -> ext4_dirty_inode
            -> __ext4_get_inode_loc
              -> __find_get_block
                -> lookup_bh_lru
                   -> check_irqs_on

So I need more detail to figure out how to fix this issue.

Thanks,
Jiufei.

> Thanks,
> Amir.
>
Amir Goldstein Nov. 19, 2019, 12:50 p.m. UTC | #5
> >>
> >> Refer to function generic_file_read_iter(), in direct IO path,
> >> file_accessed() is done before IO submission, so I think ovl_file_accessed()
> >> should be done here no matter completion/error or IO is queued.
> >
> > Mmm, it doesn't matter much if atime is updated before or after,
> > but ovl_file_accessed() does not only update atime, it also copies
> > ctime which could have been modified as a result of the io, so
> > I think it is safer to put it in the cleanup hook.
> >
>
> Can you give a more detailed description that a read op will modify
> ctime as a result of the io?
>
> I found that it will trigger BUG_ON(irqs_disabled()) while
> calling ovl_file_accessed() on async IO return path. The calltrace
> is pasted below:
>
> ovl_file_accessed
>   -> touch_atime
>     -> ovl_update_time
>       -> generic_update_time
>         -> __mark_inode_dirty
>           -> ext4_dirty_inode
>             -> __ext4_get_inode_loc
>               -> __find_get_block
>                 -> lookup_bh_lru
>                    -> check_irqs_on
>
> So I need more detail to find how to fix this issue.
>

It's not important. Please ignore. I didn't know there was an issue
with placing touch_atime() in completion context.

Thanks,
Amir.

Patch

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index e235a63..07d94e7 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -11,6 +11,14 @@ 
 #include <linux/uaccess.h>
 #include "overlayfs.h"
 
+struct ovl_aio_req {
+	struct kiocb iocb;
+	struct kiocb *orig_iocb;
+	struct fd fd;
+};
+
+static struct kmem_cache *ovl_aio_request_cachep;
+
 static char ovl_whatisit(struct inode *inode, struct inode *realinode)
 {
 	if (realinode != ovl_inode_upper(inode))
@@ -225,6 +233,21 @@  static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
 	return flags;
 }
 
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+{
+	struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb);
+	struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+	if (iocb->ki_flags & IOCB_WRITE)
+		file_end_write(iocb->ki_filp);
+
+	orig_iocb->ki_pos = iocb->ki_pos;
+	orig_iocb->ki_complete(orig_iocb, res, res2);
+
+	fdput(aio_req->fd);
+	kmem_cache_free(ovl_aio_request_cachep, aio_req);
+}
+
 static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
@@ -240,14 +263,28 @@  static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		return ret;
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
-			    ovl_iocb_to_rwf(iocb));
+	if (is_sync_kiocb(iocb)) {
+		ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
+				    ovl_iocb_to_rwf(iocb));
+		ovl_file_accessed(file);
+		fdput(real);
+	} else {
+		struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
+							       GFP_NOFS);
+		aio_req->fd = real;
+		aio_req->orig_iocb = iocb;
+		kiocb_clone(&aio_req->iocb, iocb, real.file);
+		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+		ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+		ovl_file_accessed(file);
+		if (ret != -EIOCBQUEUED) {
+			iocb->ki_pos = aio_req->iocb.ki_pos;
+			fdput(real);
+			kmem_cache_free(ovl_aio_request_cachep, aio_req);
+		}
+	}
 	revert_creds(old_cred);
 
-	ovl_file_accessed(file);
-
-	fdput(real);
-
 	return ret;
 }
 
@@ -275,16 +312,32 @@  static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
 	file_start_write(real.file);
-	ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
-			     ovl_iocb_to_rwf(iocb));
-	file_end_write(real.file);
+	if (is_sync_kiocb(iocb)) {
+		ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
+				     ovl_iocb_to_rwf(iocb));
+		file_end_write(real.file);
+		/* Update size */
+		ovl_copyattr(ovl_inode_real(inode), inode);
+		fdput(real);
+	} else {
+		struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep,
+							       GFP_NOFS);
+		aio_req->fd = real;
+		aio_req->orig_iocb = iocb;
+		kiocb_clone(&aio_req->iocb, iocb, real.file);
+		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+		/* Update size */
+		ovl_copyattr(ovl_inode_real(inode), inode);
+		if (ret != -EIOCBQUEUED) {
+			iocb->ki_pos = aio_req->iocb.ki_pos;
+			file_end_write(real.file);
+			fdput(real);
+			kmem_cache_free(ovl_aio_request_cachep, aio_req);
+		}
+	}
 	revert_creds(old_cred);
 
-	/* Update size */
-	ovl_copyattr(ovl_inode_real(inode), inode);
-
-	fdput(real);
-
 out_unlock:
 	inode_unlock(inode);
 
@@ -651,3 +704,19 @@  static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
 	.copy_file_range	= ovl_copy_file_range,
 	.remap_file_range	= ovl_remap_file_range,
 };
+
+int __init ovl_init_aio_request_cache(void)
+{
+	ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
+						   sizeof(struct ovl_aio_req),
+						   0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ovl_aio_request_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void ovl_exit_aio_request_cache(void)
+{
+	kmem_cache_destroy(ovl_aio_request_cachep);
+}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6934bcf..afd1631 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -416,6 +416,8 @@  struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
 
 /* file.c */
 extern const struct file_operations ovl_file_operations;
+int __init ovl_init_aio_request_cache(void);
+void ovl_exit_aio_request_cache(void);
 
 /* copy_up.c */
 int ovl_copy_up(struct dentry *dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index afbcb11..83cef1f 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1739,9 +1739,17 @@  static int __init ovl_init(void)
 	if (ovl_inode_cachep == NULL)
 		return -ENOMEM;
 
+	err = ovl_init_aio_request_cache();
+	if (err) {
+		kmem_cache_destroy(ovl_inode_cachep);
+		return -ENOMEM;
+	}
+
 	err = register_filesystem(&ovl_fs_type);
-	if (err)
+	if (err) {
 		kmem_cache_destroy(ovl_inode_cachep);
+		ovl_exit_aio_request_cache();
+	}
 
 	return err;
 }
@@ -1756,7 +1764,7 @@  static void __exit ovl_exit(void)
 	 */
 	rcu_barrier();
 	kmem_cache_destroy(ovl_inode_cachep);
-
+	ovl_exit_aio_request_cache();
 }
 
 module_init(ovl_init);