diff mbox series

[RFC,3/3] ovl: implement stacked mmap for shared map

Message ID 20200829095101.25350-4-cgxu519@mykernel.net (mailing list archive)
State New, archived
Headers show
Series ovl: stacked mmap for shared map | expand

Commit Message

Chengguang Xu Aug. 29, 2020, 9:51 a.m. UTC
Implement stacked mmap for shared map to keep data
consistency.

Signed-off-by: Chengguang Xu <cgxu519@mykernel.net>
---
 fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 114 insertions(+), 6 deletions(-)

Comments

Amir Goldstein Aug. 30, 2020, 11:33 a.m. UTC | #1
On Sat, Aug 29, 2020 at 12:51 PM Chengguang Xu <cgxu519@mykernel.net> wrote:
>
> Implement stacked mmap for shared map to keep data
> consistency.
>
> Signed-off-by: Chengguang Xu <cgxu519@mykernel.net>
> ---
>  fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 114 insertions(+), 6 deletions(-)
>
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 14ab5344a918..db5ab200d984 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -21,9 +21,17 @@ struct ovl_aio_req {
>         struct fd fd;
>  };
>
> +static vm_fault_t ovl_fault(struct vm_fault *vmf);
> +static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
> +
> +static const struct vm_operations_struct ovl_vm_ops = {
> +       .fault          = ovl_fault,
> +       .page_mkwrite   = ovl_page_mkwrite,
> +};
> +

Interesting direction, not sure if this is workable.
I don't know enough about mm to say.

But what about the rest of the operations?
Did you go over them and decide that overlay doesn't need to implement them?
I doubt it, but if you did, please document that.

>  struct ovl_file_entry {
>         struct file *realfile;
> -       void *vm_ops;
> +       const struct vm_operations_struct *vm_ops;
>  };
>
>  struct file *ovl_get_realfile(struct file *file)
> @@ -40,14 +48,15 @@ void ovl_set_realfile(struct file *file, struct file *realfile)
>         ofe->realfile = realfile;
>  }
>
> -void *ovl_get_real_vmops(struct file *file)
> +const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
>  {
>         struct ovl_file_entry *ofe = file->private_data;
>
>         return ofe->vm_ops;
>  }
>
> -void ovl_set_real_vmops(struct file *file, void *vm_ops)
> +void ovl_set_real_vmops(struct file *file,
> +                       const struct vm_operations_struct *vm_ops)
>  {
>         struct ovl_file_entry *ofe = file->private_data;
>
> @@ -493,11 +502,104 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>         return ret;
>  }
>
> +vm_fault_t ovl_fault(struct vm_fault *vmf)
> +{
> +       struct vm_area_struct *vma = vmf->vma;
> +       struct file *file = vma->vm_file;
> +       struct file *realfile;
> +       struct file *fpin, *tmp;
> +       struct inode *inode = file_inode(file);
> +       struct inode *realinode;
> +       const struct cred *old_cred;
> +       bool retry_allowed;
> +       vm_fault_t ret;
> +       int err = 0;
> +
> +       if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
> +               realfile = ovl_get_realfile(file);
> +
> +               if (!ovl_has_upperdata(inode) ||
> +                   realfile->f_inode != ovl_inode_upper(inode) ||
> +                   !realfile->f_op->mmap)
> +                       return VM_FAULT_SIGBUS;
> +
> +               if (!ovl_get_real_vmops(file)) {
> +                       old_cred = ovl_override_creds(inode->i_sb);
> +                       err = call_mmap(realfile, vma);
> +                       revert_creds(old_cred);
> +
> +                       vma->vm_file = file;
> +                       if (err) {
> +                               vma->vm_ops = &ovl_vm_ops;
> +                               return VM_FAULT_SIGBUS;
> +                       }
> +                       ovl_set_real_vmops(file, vma->vm_ops);
> +                       vma->vm_ops = &ovl_vm_ops;
> +               }
> +
> +               retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
> +               if (retry_allowed)
> +                       vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
> +               vma->vm_file = realfile;
> +               ret = ovl_get_real_vmops(file)->fault(vmf);
> +               vma->vm_file = file;
> +               if (retry_allowed)
> +                       vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;
> +               return ret;
> +
> +       } else {
> +               fpin = maybe_unlock_mmap_for_io(vmf, NULL);
> +               if (!fpin)
> +                       return VM_FAULT_SIGBUS;
> +
> +               ret = VM_FAULT_RETRY;
> +               if (!ovl_has_upperdata(inode)) {
> +                       err = ovl_copy_up_with_data(file->f_path.dentry);
> +                       if (err)
> +                               goto out;
> +               }
> +
> +               realinode = ovl_inode_realdata(inode);
> +               realfile = ovl_open_realfile(file, realinode);
> +               if (IS_ERR(realfile))
> +                       goto out;
> +
> +               tmp = ovl_get_realfile(file);
> +               ovl_set_realfile(file, realfile);
> +               fput(tmp);
> +
> +out:
> +               fput(fpin);
> +               return ret;
> +       }
> +}


Please add some documentation to explain the method used.
Do we need to retry if real_vmops are already set?

Thanks,
Amir.
Chengguang Xu Aug. 31, 2020, 1:47 p.m. UTC | #2
On 8/30/20 7:33 PM, Amir Goldstein wrote:
> On Sat, Aug 29, 2020 at 12:51 PM Chengguang Xu <cgxu519@mykernel.net> wrote:
>>
>> Implement stacked mmap for shared map to keep data
>> consistency.
>>
>> Signed-off-by: Chengguang Xu <cgxu519@mykernel.net>
>> ---
>>   fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 114 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>> index 14ab5344a918..db5ab200d984 100644
>> --- a/fs/overlayfs/file.c
>> +++ b/fs/overlayfs/file.c
>> @@ -21,9 +21,17 @@ struct ovl_aio_req {
>>          struct fd fd;
>>   };
>>
>> +static vm_fault_t ovl_fault(struct vm_fault *vmf);
>> +static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
>> +
>> +static const struct vm_operations_struct ovl_vm_ops = {
>> +       .fault          = ovl_fault,
>> +       .page_mkwrite   = ovl_page_mkwrite,
>> +};
>> +
> 
> Interesting direction, not sure if this is workable.
> I don't know enough about mm to say.
> 
> But what about the rest of the operations?
> Did you go over them and decide that overlay doesn't need to implement them?
> I doubt it, but if you did, please document that.

I did some check for rest of them, IIUC ->fault will be enough for this 
special case (shared read-only mmap with no upper), I will remove 
->page_mkwrite in v2.

# I do not consider support ->huge_fault in current stage due to many fs 
cannot support DAX properly.

BTW, do you know who should I add to CC list for further deep review of
this code? fadevel-list?



> 
>>   struct ovl_file_entry {
>>          struct file *realfile;
>> -       void *vm_ops;
>> +       const struct vm_operations_struct *vm_ops;
>>   };
>>
>>   struct file *ovl_get_realfile(struct file *file)
>> @@ -40,14 +48,15 @@ void ovl_set_realfile(struct file *file, struct file *realfile)
>>          ofe->realfile = realfile;
>>   }
>>
>> -void *ovl_get_real_vmops(struct file *file)
>> +const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
>>   {
>>          struct ovl_file_entry *ofe = file->private_data;
>>
>>          return ofe->vm_ops;
>>   }
>>
>> -void ovl_set_real_vmops(struct file *file, void *vm_ops)
>> +void ovl_set_real_vmops(struct file *file,
>> +                       const struct vm_operations_struct *vm_ops)
>>   {
>>          struct ovl_file_entry *ofe = file->private_data;
>>
>> @@ -493,11 +502,104 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>>          return ret;
>>   }
>>
>> +vm_fault_t ovl_fault(struct vm_fault *vmf)
>> +{
>> +       struct vm_area_struct *vma = vmf->vma;
>> +       struct file *file = vma->vm_file;
>> +       struct file *realfile;
>> +       struct file *fpin, *tmp;
>> +       struct inode *inode = file_inode(file);
>> +       struct inode *realinode;
>> +       const struct cred *old_cred;
>> +       bool retry_allowed;
>> +       vm_fault_t ret;
>> +       int err = 0;
>> +
>> +       if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
>> +               realfile = ovl_get_realfile(file);
>> +
>> +               if (!ovl_has_upperdata(inode) ||
>> +                   realfile->f_inode != ovl_inode_upper(inode) ||
>> +                   !realfile->f_op->mmap)
>> +                       return VM_FAULT_SIGBUS;
>> +
>> +               if (!ovl_get_real_vmops(file)) {
>> +                       old_cred = ovl_override_creds(inode->i_sb);
>> +                       err = call_mmap(realfile, vma);
>> +                       revert_creds(old_cred);
>> +
>> +                       vma->vm_file = file;
>> +                       if (err) {
>> +                               vma->vm_ops = &ovl_vm_ops;
>> +                               return VM_FAULT_SIGBUS;
>> +                       }
>> +                       ovl_set_real_vmops(file, vma->vm_ops);
>> +                       vma->vm_ops = &ovl_vm_ops;
>> +               }
>> +
>> +               retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
>> +               if (retry_allowed)
>> +                       vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
>> +               vma->vm_file = realfile;
>> +               ret = ovl_get_real_vmops(file)->fault(vmf);
>> +               vma->vm_file = file;
>> +               if (retry_allowed)
>> +                       vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;
>> +               return ret;
>> +
>> +       } else {
>> +               fpin = maybe_unlock_mmap_for_io(vmf, NULL);
>> +               if (!fpin)
>> +                       return VM_FAULT_SIGBUS;
>> +
>> +               ret = VM_FAULT_RETRY;
>> +               if (!ovl_has_upperdata(inode)) {
>> +                       err = ovl_copy_up_with_data(file->f_path.dentry);
>> +                       if (err)
>> +                               goto out;
>> +               }
>> +
>> +               realinode = ovl_inode_realdata(inode);
>> +               realfile = ovl_open_realfile(file, realinode);
>> +               if (IS_ERR(realfile))
>> +                       goto out;
>> +
>> +               tmp = ovl_get_realfile(file);
>> +               ovl_set_realfile(file, realfile);
>> +               fput(tmp);
>> +
>> +out:
>> +               fput(fpin);
>> +               return ret;
>> +       }
>> +}
> 
> 
> Please add some documentation to explain the method used.
> Do we need to retry if real_vmops are already set?
> 

Good catch, actually retry is not needed in that case.

Basically, we unlock(mmap_lock)->copy-up->open when
detecting no upper inode then retry fault operation.
However, we need to check fault retry flag carefully
for avoiding endless retry.

I'll add more explanation in v2.

---
cgxu
Amir Goldstein Aug. 31, 2020, 3:51 p.m. UTC | #3
On Mon, Aug 31, 2020 at 4:47 PM cgxu <cgxu519@mykernel.net> wrote:
>
> On 8/30/20 7:33 PM, Amir Goldstein wrote:
> > On Sat, Aug 29, 2020 at 12:51 PM Chengguang Xu <cgxu519@mykernel.net> wrote:
> >>
> >> Implement stacked mmap for shared map to keep data
> >> consistency.
> >>
> >> Signed-off-by: Chengguang Xu <cgxu519@mykernel.net>
> >> ---
> >>   fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
> >>   1 file changed, 114 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> >> index 14ab5344a918..db5ab200d984 100644
> >> --- a/fs/overlayfs/file.c
> >> +++ b/fs/overlayfs/file.c
> >> @@ -21,9 +21,17 @@ struct ovl_aio_req {
> >>          struct fd fd;
> >>   };
> >>
> >> +static vm_fault_t ovl_fault(struct vm_fault *vmf);
> >> +static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
> >> +
> >> +static const struct vm_operations_struct ovl_vm_ops = {
> >> +       .fault          = ovl_fault,
> >> +       .page_mkwrite   = ovl_page_mkwrite,
> >> +};
> >> +
> >
> > Interesting direction, not sure if this is workable.
> > I don't know enough about mm to say.
> >
> > But what about the rest of the operations?
> > Did you go over them and decide that overlay doesn't need to implement them?
> > I doubt it, but if you did, please document that.
>
> I did some check for rest of them, IIUC ->fault will be enough for this
> special case (shared read-only mmap with no upper), I will remove
> ->page_mkwrite in v2.

Ok I suppose you checked that ->map_pages is not relevant?

>
> # I do not consider support ->huge_fault in current stage due to many fs
> cannot support DAX properly.
>
> BTW, do you know who should I add to CC list for further deep review of
> this code? fadevel-list?
>

fsdevel would be good, but I would wait for initial feedback from Miklos
before you post v2...

>
>
> >
> >>   struct ovl_file_entry {
> >>          struct file *realfile;
> >> -       void *vm_ops;
> >> +       const struct vm_operations_struct *vm_ops;
> >>   };
> >>
> >>   struct file *ovl_get_realfile(struct file *file)
> >> @@ -40,14 +48,15 @@ void ovl_set_realfile(struct file *file, struct file *realfile)
> >>          ofe->realfile = realfile;
> >>   }
> >>
> >> -void *ovl_get_real_vmops(struct file *file)
> >> +const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
> >>   {
> >>          struct ovl_file_entry *ofe = file->private_data;
> >>
> >>          return ofe->vm_ops;
> >>   }
> >>
> >> -void ovl_set_real_vmops(struct file *file, void *vm_ops)
> >> +void ovl_set_real_vmops(struct file *file,
> >> +                       const struct vm_operations_struct *vm_ops)
> >>   {
> >>          struct ovl_file_entry *ofe = file->private_data;
> >>
> >> @@ -493,11 +502,104 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> >>          return ret;
> >>   }
> >>
> >> +vm_fault_t ovl_fault(struct vm_fault *vmf)
> >> +{
> >> +       struct vm_area_struct *vma = vmf->vma;
> >> +       struct file *file = vma->vm_file;
> >> +       struct file *realfile;
> >> +       struct file *fpin, *tmp;
> >> +       struct inode *inode = file_inode(file);
> >> +       struct inode *realinode;
> >> +       const struct cred *old_cred;
> >> +       bool retry_allowed;
> >> +       vm_fault_t ret;
> >> +       int err = 0;
> >> +
> >> +       if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
> >> +               realfile = ovl_get_realfile(file);
> >> +
> >> +               if (!ovl_has_upperdata(inode) ||
> >> +                   realfile->f_inode != ovl_inode_upper(inode) ||
> >> +                   !realfile->f_op->mmap)
> >> +                       return VM_FAULT_SIGBUS;
> >> +
> >> +               if (!ovl_get_real_vmops(file)) {
> >> +                       old_cred = ovl_override_creds(inode->i_sb);
> >> +                       err = call_mmap(realfile, vma);
> >> +                       revert_creds(old_cred);
> >> +
> >> +                       vma->vm_file = file;
> >> +                       if (err) {
> >> +                               vma->vm_ops = &ovl_vm_ops;
> >> +                               return VM_FAULT_SIGBUS;
> >> +                       }
> >> +                       ovl_set_real_vmops(file, vma->vm_ops);
> >> +                       vma->vm_ops = &ovl_vm_ops;
> >> +               }
> >> +
> >> +               retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
> >> +               if (retry_allowed)
> >> +                       vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
> >> +               vma->vm_file = realfile;
> >> +               ret = ovl_get_real_vmops(file)->fault(vmf);
> >> +               vma->vm_file = file;
> >> +               if (retry_allowed)
> >> +                       vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;
> >> +               return ret;
> >> +
> >> +       } else {
> >> +               fpin = maybe_unlock_mmap_for_io(vmf, NULL);
> >> +               if (!fpin)
> >> +                       return VM_FAULT_SIGBUS;
> >> +
> >> +               ret = VM_FAULT_RETRY;
> >> +               if (!ovl_has_upperdata(inode)) {
> >> +                       err = ovl_copy_up_with_data(file->f_path.dentry);
> >> +                       if (err)
> >> +                               goto out;
> >> +               }
> >> +
> >> +               realinode = ovl_inode_realdata(inode);
> >> +               realfile = ovl_open_realfile(file, realinode);
> >> +               if (IS_ERR(realfile))
> >> +                       goto out;
> >> +
> >> +               tmp = ovl_get_realfile(file);
> >> +               ovl_set_realfile(file, realfile);
> >> +               fput(tmp);
> >> +
> >> +out:
> >> +               fput(fpin);
> >> +               return ret;
> >> +       }
> >> +}
> >
> >
> > Please add some documentation to explain the method used.
> > Do we need to retry if real_vmops are already set?
> >
>
> Good catch, actually retry is not needed in that case.
>
> Basically, we unlock(mmap_lock)->copy-up->open when
> detecting no upper inode then retry fault operation.
> However, we need to check fault retry flag carefully
> for avoiding endless retry.

That much I got, but the details of setting ->vm_file and vmops
look subtle, so better explain them.

Thanks,
Amir.
Miklos Szeredi Sept. 1, 2020, 7:44 a.m. UTC | #4
On Mon, Aug 31, 2020 at 5:52 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Aug 31, 2020 at 4:47 PM cgxu <cgxu519@mykernel.net> wrote:
> >
> > On 8/30/20 7:33 PM, Amir Goldstein wrote:

> > > Interesting direction, not sure if this is workable.
> > > I don't know enough about mm to say.
> > >
> > > But what about the rest of the operations?
> > > Did you go over them and decide that overlay doesn't need to implement them?
> > > I doubt it, but if you did, please document that.
> >
> > I did some check for rest of them, IIUC ->fault will be enough for this
> > special case (shared read-only mmap with no upper), I will remove
> > ->page_mkwrite in v2.

Hmm, so this is just for that specific corner case?  Not a generic
shared mmap implementation?

>
> Ok I suppose you checked that ->map_pages is not relevant?
>
> >
> > # I do not consider support ->huge_fault in current stage due to many fs
> > cannot support DAX properly.
> >
> > BTW, do you know who should I add to CC list for further deep review of
> > this code? fadevel-list?
> >
>
> fsdevel would be good, but I would wait for initial feedback from Miklos
> before you post v2...

I could dig into the details of page fault handling, but that's not an
area that I'm intimately familiar with.    So yeah, a high level
description of what happens on mmap, page fault, write fault, etc...
would be really appreciated.

Thanks,
Miklos
Chengguang Xu Sept. 1, 2020, 1:20 p.m. UTC | #5
On 8/31/20 11:51 PM, Amir Goldstein wrote:
> On Mon, Aug 31, 2020 at 4:47 PM cgxu <cgxu519@mykernel.net> wrote:
>>
>> On 8/30/20 7:33 PM, Amir Goldstein wrote:
>>> On Sat, Aug 29, 2020 at 12:51 PM Chengguang Xu <cgxu519@mykernel.net> wrote:
>>>>
>>>> Implement stacked mmap for shared map to keep data
>>>> consistency.
>>>>
>>>> Signed-off-by: Chengguang Xu <cgxu519@mykernel.net>
>>>> ---
>>>>    fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
>>>>    1 file changed, 114 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>>>> index 14ab5344a918..db5ab200d984 100644
>>>> --- a/fs/overlayfs/file.c
>>>> +++ b/fs/overlayfs/file.c
>>>> @@ -21,9 +21,17 @@ struct ovl_aio_req {
>>>>           struct fd fd;
>>>>    };
>>>>
>>>> +static vm_fault_t ovl_fault(struct vm_fault *vmf);
>>>> +static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
>>>> +
>>>> +static const struct vm_operations_struct ovl_vm_ops = {
>>>> +       .fault          = ovl_fault,
>>>> +       .page_mkwrite   = ovl_page_mkwrite,
>>>> +};
>>>> +
>>>
>>> Interesting direction, not sure if this is workable.
>>> I don't know enough about mm to say.
>>>
>>> But what about the rest of the operations?
>>> Did you go over them and decide that overlay doesn't need to implement them?
>>> I doubt it, but if you did, please document that.
>>
>> I did some check for rest of them, IIUC ->fault will be enough for this
>> special case (shared read-only mmap with no upper), I will remove
>> ->page_mkwrite in v2.
> 
> Ok I suppose you checked that ->map_pages is not relevant?


->map_pages() does easy maps and fallback to ->fault if the offset is 
not ready, so I think without ->map_pages() it still could work 
properly, we can also implement it for acceleration.


> 
>>
>> # I do not consider support ->huge_fault in current stage due to many fs
>> cannot support DAX properly.
>>
>> BTW, do you know who should I add to CC list for further deep review of
>> this code? fadevel-list?
>>
> 
> fsdevel would be good, but I would wait for initial feedback from Miklos
> before you post v2...
> 
>>
>>
>>>
>>>>    struct ovl_file_entry {
>>>>           struct file *realfile;
>>>> -       void *vm_ops;
>>>> +       const struct vm_operations_struct *vm_ops;
>>>>    };
>>>>
>>>>    struct file *ovl_get_realfile(struct file *file)
>>>> @@ -40,14 +48,15 @@ void ovl_set_realfile(struct file *file, struct file *realfile)
>>>>           ofe->realfile = realfile;
>>>>    }
>>>>
>>>> -void *ovl_get_real_vmops(struct file *file)
>>>> +const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
>>>>    {
>>>>           struct ovl_file_entry *ofe = file->private_data;
>>>>
>>>>           return ofe->vm_ops;
>>>>    }
>>>>
>>>> -void ovl_set_real_vmops(struct file *file, void *vm_ops)
>>>> +void ovl_set_real_vmops(struct file *file,
>>>> +                       const struct vm_operations_struct *vm_ops)
>>>>    {
>>>>           struct ovl_file_entry *ofe = file->private_data;
>>>>
>>>> @@ -493,11 +502,104 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>>>>           return ret;
>>>>    }
>>>>
>>>> +vm_fault_t ovl_fault(struct vm_fault *vmf)
>>>> +{
>>>> +       struct vm_area_struct *vma = vmf->vma;
>>>> +       struct file *file = vma->vm_file;
>>>> +       struct file *realfile;
>>>> +       struct file *fpin, *tmp;
>>>> +       struct inode *inode = file_inode(file);
>>>> +       struct inode *realinode;
>>>> +       const struct cred *old_cred;
>>>> +       bool retry_allowed;
>>>> +       vm_fault_t ret;
>>>> +       int err = 0;
>>>> +
>>>> +       if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
>>>> +               realfile = ovl_get_realfile(file);
>>>> +
>>>> +               if (!ovl_has_upperdata(inode) ||
>>>> +                   realfile->f_inode != ovl_inode_upper(inode) ||
>>>> +                   !realfile->f_op->mmap)
>>>> +                       return VM_FAULT_SIGBUS;
>>>> +
>>>> +               if (!ovl_get_real_vmops(file)) {
>>>> +                       old_cred = ovl_override_creds(inode->i_sb);
>>>> +                       err = call_mmap(realfile, vma);
>>>> +                       revert_creds(old_cred);
>>>> +
>>>> +                       vma->vm_file = file;
>>>> +                       if (err) {
>>>> +                               vma->vm_ops = &ovl_vm_ops;
>>>> +                               return VM_FAULT_SIGBUS;
>>>> +                       }
>>>> +                       ovl_set_real_vmops(file, vma->vm_ops);
>>>> +                       vma->vm_ops = &ovl_vm_ops;
>>>> +               }
>>>> +
>>>> +               retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
>>>> +               if (retry_allowed)
>>>> +                       vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
>>>> +               vma->vm_file = realfile;
>>>> +               ret = ovl_get_real_vmops(file)->fault(vmf);
>>>> +               vma->vm_file = file;
>>>> +               if (retry_allowed)
>>>> +                       vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;
>>>> +               return ret;
>>>> +
>>>> +       } else {
>>>> +               fpin = maybe_unlock_mmap_for_io(vmf, NULL);
>>>> +               if (!fpin)
>>>> +                       return VM_FAULT_SIGBUS;
>>>> +
>>>> +               ret = VM_FAULT_RETRY;
>>>> +               if (!ovl_has_upperdata(inode)) {
>>>> +                       err = ovl_copy_up_with_data(file->f_path.dentry);
>>>> +                       if (err)
>>>> +                               goto out;
>>>> +               }
>>>> +
>>>> +               realinode = ovl_inode_realdata(inode);
>>>> +               realfile = ovl_open_realfile(file, realinode);
>>>> +               if (IS_ERR(realfile))
>>>> +                       goto out;
>>>> +
>>>> +               tmp = ovl_get_realfile(file);
>>>> +               ovl_set_realfile(file, realfile);
>>>> +               fput(tmp);
>>>> +
>>>> +out:
>>>> +               fput(fpin);
>>>> +               return ret;
>>>> +       }
>>>> +}
>>>
>>>
>>> Please add some documentation to explain the method used.
>>> Do we need to retry if real_vmops are already set?
>>>
>>
>> Good catch, actually retry is not needed in that case.
>>
>> Basically, we unlock(mmap_lock)->copy-up->open when
>> detecting no upper inode then retry fault operation.
>> However, we need to check fault retry flag carefully
>> for avoiding endless retry.
> 
> That much I got, but the details of setting ->vm_file and vmops
> look subtle, so better explain them.
> 

I'll add some explanations in V2, but before that let me write some
comments based on code logic below. If there is still something not 
clear you can point out that again.



+	if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
+		realfile = ovl_get_realfile(file);
+
+		if (!ovl_has_upperdata(inode) ||
+		    realfile->f_inode != ovl_inode_upper(inode) ||
+		    !realfile->f_op->mmap)
+			return VM_FAULT_SIGBUS;

Above condition indicates (copy-up)/(open real-file) failed or
(real-file does not support mmap), so we have to return SIGBUS.



+
+		if (!ovl_get_real_vmops(file)) {
+			old_cred = ovl_override_creds(inode->i_sb);
+			err = call_mmap(realfile, vma);
+			revert_creds(old_cred);
+
+			vma->vm_file = file;
+			if (err) {
+				vma->vm_ops = &ovl_vm_ops;
+				return VM_FAULT_SIGBUS;
+			}
+			ovl_set_real_vmops(file, vma->vm_ops);
+			vma->vm_ops = &ovl_vm_ops;


call_mmap() will rewrite vma->vm_file and vma->vm_ops to upper layer's,
so here recover to overlay's in order to jump into this ovl_fault()
in other page-faults.


+		}
+
+		retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
+		if (retry_allowed)
+			vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;

here, we disallow retry in real ->fault because retry will unlock 
mmap_lock, touching vma after unlock is not safe behavior.


+		vma->vm_file = realfile;
+		ret = ovl_get_real_vmops(file)->fault(vmf);


calling real fault handler.


+		vma->vm_file = file;
+		if (retry_allowed)
+			vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;


recover vm_file and vm_ops to overlay's so that we can jump into
ovl_fault in other page-faults.


+		return ret;
+
+	} else {



Thanks,
cgxu
diff mbox series

Patch

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 14ab5344a918..db5ab200d984 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -21,9 +21,17 @@  struct ovl_aio_req {
 	struct fd fd;
 };
 
+static vm_fault_t ovl_fault(struct vm_fault *vmf);
+static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
+
+static const struct vm_operations_struct ovl_vm_ops = {
+	.fault		= ovl_fault,
+	.page_mkwrite	= ovl_page_mkwrite,
+};
+
 struct ovl_file_entry {
 	struct file *realfile;
-	void *vm_ops;
+	const struct vm_operations_struct *vm_ops;
 };
 
 struct file *ovl_get_realfile(struct file *file)
@@ -40,14 +48,15 @@  void ovl_set_realfile(struct file *file, struct file *realfile)
 	ofe->realfile = realfile;
 }
 
-void *ovl_get_real_vmops(struct file *file)
+const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
 {
 	struct ovl_file_entry *ofe = file->private_data;
 
 	return ofe->vm_ops;
 }
 
-void ovl_set_real_vmops(struct file *file, void *vm_ops)
+void ovl_set_real_vmops(struct file *file,
+			const struct vm_operations_struct *vm_ops)
 {
 	struct ovl_file_entry *ofe = file->private_data;
 
@@ -493,11 +502,104 @@  static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	return ret;
 }
 
+vm_fault_t ovl_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct file *file = vma->vm_file;
+	struct file *realfile;
+	struct file *fpin, *tmp;
+	struct inode *inode = file_inode(file);
+	struct inode *realinode;
+	const struct cred *old_cred;
+	bool retry_allowed;
+	vm_fault_t ret;
+	int err = 0;
+
+	if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
+		realfile = ovl_get_realfile(file);
+
+		if (!ovl_has_upperdata(inode) ||
+		    realfile->f_inode != ovl_inode_upper(inode) ||
+		    !realfile->f_op->mmap)
+			return VM_FAULT_SIGBUS;
+
+		if (!ovl_get_real_vmops(file)) {
+			old_cred = ovl_override_creds(inode->i_sb);
+			err = call_mmap(realfile, vma);
+			revert_creds(old_cred);
+
+			vma->vm_file = file;
+			if (err) {
+				vma->vm_ops = &ovl_vm_ops;
+				return VM_FAULT_SIGBUS;
+			}
+			ovl_set_real_vmops(file, vma->vm_ops);
+			vma->vm_ops = &ovl_vm_ops;
+		}
+
+		retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
+		if (retry_allowed)
+			vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+		vma->vm_file = realfile;
+		ret = ovl_get_real_vmops(file)->fault(vmf);
+		vma->vm_file = file;
+		if (retry_allowed)
+			vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;
+		return ret;
+
+	} else {
+		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+		if (!fpin)
+			return VM_FAULT_SIGBUS;
+
+		ret = VM_FAULT_RETRY;
+		if (!ovl_has_upperdata(inode)) {
+			err = ovl_copy_up_with_data(file->f_path.dentry);
+			if (err)
+				goto out;
+		}
+
+		realinode = ovl_inode_realdata(inode);
+		realfile = ovl_open_realfile(file, realinode);
+		if (IS_ERR(realfile))
+			goto out;
+
+		tmp = ovl_get_realfile(file);
+		ovl_set_realfile(file, realfile);
+		fput(tmp);
+
+out:
+		fput(fpin);
+		return ret;
+	}
+}
+
+static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct file *file = vma->vm_file;
+	struct file *realfile;
+	struct inode *inode = file_inode(file);
+	vm_fault_t ret;
+
+	realfile = ovl_get_realfile(file);
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(file);
+
+	vma->vm_file = realfile;
+	ret = ovl_get_real_vmops(file)->page_mkwrite(vmf);
+	vma->vm_file = file;
+
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
+
 static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct file *realfile = ovl_get_realfile(file);
 	const struct cred *old_cred;
-	int ret;
+	int ret = 0;
 
 	if (!realfile->f_op->mmap)
 		return -ENODEV;
@@ -505,6 +607,13 @@  static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
 	if (WARN_ON(file != vma->vm_file))
 		return -EIO;
 
+	if (!ovl_has_upperdata(file_inode(file)) &&
+	    (vma->vm_flags & (VM_SHARED|VM_MAYSHARE))) {
+		vma->vm_ops = &ovl_vm_ops;
+		ovl_file_accessed(file);
+		return 0;
+	}
+
 	vma->vm_file = get_file(realfile);
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
@@ -517,10 +626,9 @@  static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
 	} else {
 		/* Drop reference count from previous vm_file value */
 		fput(file);
+		ovl_file_accessed(file);
 	}
 
-	ovl_file_accessed(file);
-
 	return ret;
 }