diff mbox series

[-next] ovl: respect underlying filesystem's get_unmapped_area()

Message ID 20241205143038.3260233-1-tujinjiang@huawei.com (mailing list archive)
State New
Headers show
Series [-next] ovl: respect underlying filesystem's get_unmapped_area() | expand

Commit Message

Jinjiang Tu Dec. 5, 2024, 2:30 p.m. UTC
During our tests in containers, there is a read-only file (i.e., shared
libraies) in the overlayfs filesystem, and the underlying filesystem is
ext4, which supports large folio. We mmap the file with PROT_READ prot,
and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
fails and returns EINVAL.

The reason is that the mapping address isn't aligned to PMD size. Since
overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
thp_get_unmapped_area() to get a THP aligned address.

To fix it, call get_unmapped_area() with the realfile.

Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
export get_unmapped_area().

Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 fs/overlayfs/file.c | 20 ++++++++++++++++++++
 mm/mmap.c           |  1 +
 2 files changed, 21 insertions(+)

Comments

Lorenzo Stoakes Dec. 5, 2024, 3:04 p.m. UTC | #1
+ Matthew for large folio aspect

On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> During our tests in containers, there is a read-only file (i.e., shared
> libraies) in the overlayfs filesystem, and the underlying filesystem is
> ext4, which supports large folio. We mmap the file with PROT_READ prot,
> and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
> fails and returns EINVAL.
>
> The reason is that the mapping address isn't aligned to PMD size. Since
> overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
> thp_get_unmapped_area() to get a THP aligned address.
>
> To fix it, call get_unmapped_area() with the realfile.

Isn't the correct solution to get overlayfs to support large folios?

>
> Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
> export get_unmapped_area().

Yeah, not in favour of this at all. This is an internal implementation
detail. It seems like you're trying to hack your way into avoiding
providing support for large folios and to hand it off to the underlying
file system.

Again, why don't you just support large folios in overlayfs?

Literally no other file system or driver appears to make use of this
directly in this manner.

And there's absolutely no way this should be exported non-GPL as if it were
unavoidable core functionality that everyone needs. Only you seem to...

>
> Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
> ---
>  fs/overlayfs/file.c | 20 ++++++++++++++++++++
>  mm/mmap.c           |  1 +
>  2 files changed, 21 insertions(+)
>
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 969b458100fe..d0dcf675ebe8 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -653,6 +653,25 @@ static int ovl_flush(struct file *file, fl_owner_t id)
>  	return err;
>  }
>
> +static unsigned long ovl_get_unmapped_area(struct file *file,
> +		unsigned long addr, unsigned long len, unsigned long pgoff,
> +		unsigned long flags)
> +{
> +	struct file *realfile;
> +	const struct cred *old_cred;
> +	unsigned long ret;
> +
> +	realfile = ovl_real_file(file);
> +	if (IS_ERR(realfile))
> +		return PTR_ERR(realfile);
> +
> +	old_cred = ovl_override_creds(file_inode(file)->i_sb);
> +	ret = get_unmapped_area(realfile, addr, len, pgoff, flags);
> +	ovl_revert_creds(old_cred);

Why are you overriding credentials, then reinstating them here? That
seems... iffy? I knew nothing about overlayfs so this may just be a
misunderstanding...

> +
> +	return ret;
> +}
> +
>  const struct file_operations ovl_file_operations = {
>  	.open		= ovl_open,
>  	.release	= ovl_release,
> @@ -661,6 +680,7 @@ const struct file_operations ovl_file_operations = {
>  	.write_iter	= ovl_write_iter,
>  	.fsync		= ovl_fsync,
>  	.mmap		= ovl_mmap,
> +	.get_unmapped_area = ovl_get_unmapped_area,
>  	.fallocate	= ovl_fallocate,
>  	.fadvise	= ovl_fadvise,
>  	.flush		= ovl_flush,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 16f8e8be01f8..60eb1ff7c9a8 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -913,6 +913,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>  	error = security_mmap_addr(addr);
>  	return error ? error : addr;
>  }
> +EXPORT_SYMBOL(__get_unmapped_area);

We'll need a VERY good reason to export this internal implementation
detail, and if that were provided we'd need a VERY good reason for it not
to be GPL.

This just seems to be a cheap way of invoking thp_get_unmapped_area(),
maybe, if it is being used by the underlying file system.

And again... why not just add large folio support? We can't just take a
hack here.

>
>  unsigned long
>  mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
> --
> 2.34.1
>
Amir Goldstein Dec. 5, 2024, 3:12 p.m. UTC | #2
On Thu, Dec 5, 2024 at 4:04 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> + Matthew for large folio aspect
>
> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> > During our tests in containers, there is a read-only file (i.e., shared
> > libraies) in the overlayfs filesystem, and the underlying filesystem is
> > ext4, which supports large folio. We mmap the file with PROT_READ prot,
> > and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
> > fails and returns EINVAL.
> >
> > The reason is that the mapping address isn't aligned to PMD size. Since
> > overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
> > thp_get_unmapped_area() to get a THP aligned address.
> >
> > To fix it, call get_unmapped_area() with the realfile.
>
> Isn't the correct solution to get overlayfs to support large folios?
>
> >
> > Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
> > export get_unmapped_area().
>
> Yeah, not in favour of this at all. This is an internal implementation
> detail. It seems like you're trying to hack your way into avoiding
> providing support for large folios and to hand it off to the underlying
> file system.
>
> Again, why don't you just support large folios in overlayfs?
>

This whole discussion seems moot.
overlayfs does not have address_space operations
It does not have its own page cache.

The file in  vma->vm_file is not an overlayfs file at all - it is the
real (e.g. ext4) file
when returning from ovl_mmap() => backing_file_mmap()
so I have very little clue why the proposed solution even works,
but it certainly does not look correct.

Thanks,
Amir.
Lorenzo Stoakes Dec. 5, 2024, 3:24 p.m. UTC | #3
(fixing typo in cc list: tujinjiang@huawe.com -> tujinjiang@huawei.com)

+ Liam

(JinJiang - you forgot to cc the correct maintainers, please ensure you run
scripts/get_maintainers.pl on files you change)

On Thu, Dec 05, 2024 at 04:12:12PM +0100, Amir Goldstein wrote:
> On Thu, Dec 5, 2024 at 4:04 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > + Matthew for large folio aspect
> >
> > On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> > > During our tests in containers, there is a read-only file (i.e., shared
> > > libraies) in the overlayfs filesystem, and the underlying filesystem is
> > > ext4, which supports large folio. We mmap the file with PROT_READ prot,
> > > and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
> > > fails and returns EINVAL.
> > >
> > > The reason is that the mapping address isn't aligned to PMD size. Since
> > > overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
> > > thp_get_unmapped_area() to get a THP aligned address.
> > >
> > > To fix it, call get_unmapped_area() with the realfile.
> >
> > Isn't the correct solution to get overlayfs to support large folios?
> >
> > >
> > > Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
> > > export get_unmapped_area().
> >
> > Yeah, not in favour of this at all. This is an internal implementation
> > detail. It seems like you're trying to hack your way into avoiding
> > providing support for large folios and to hand it off to the underlying
> > file system.
> >
> > Again, why don't you just support large folios in overlayfs?
> >
>
> This whole discussion seems moot.
> overlayfs does not have address_space operations
> It does not have its own page cache.

And here we see my total lack of knowledge of overlayfs coming into play
here :) Thanks for pointing this out.

In that case, I object even further to the original of course...

>
> The file in  vma->vm_file is not an overlayfs file at all - it is the
> real (e.g. ext4) file
> when returning from ovl_mmap() => backing_file_mmap()
> so I have very little clue why the proposed solution even works,
> but it certainly does not look correct.

I think then Jinjiang in this cause you ought to go back to the drawing
board and reconsider what might be the underlying issue here.

>
> Thanks,
> Amir.

Cheers, Lorenzo
kernel test robot Dec. 5, 2024, 9:21 p.m. UTC | #4
Hi Jinjiang,

kernel test robot noticed the following build errors:

[auto build test ERROR on next-20241204]

url:    https://github.com/intel-lab-lkp/linux/commits/Jinjiang-Tu/ovl-respect-underlying-filesystem-s-get_unmapped_area/20241205-224026
base:   next-20241204
patch link:    https://lore.kernel.org/r/20241205143038.3260233-1-tujinjiang%40huawei.com
patch subject: [PATCH -next] ovl: respect underlying filesystem's get_unmapped_area()
config: arm-randconfig-001-20241206 (https://download.01.org/0day-ci/archive/20241206/202412060524.5Ata5N0H-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241206/202412060524.5Ata5N0H-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412060524.5Ata5N0H-lkp@intel.com/

All errors (new ones prefixed by >>):

   arm-linux-gnueabi-ld: fs/overlayfs/file.o: in function `ovl_get_unmapped_area':
>> file.c:(.text+0x998): undefined reference to `__get_unmapped_area'
Jinjiang Tu Dec. 6, 2024, 3:35 a.m. UTC | #5
在 2024/12/5 23:04, Lorenzo Stoakes 写道:
> + Matthew for large folio aspect
>
> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
>> During our tests in containers, there is a read-only file (i.e., shared
>> libraies) in the overlayfs filesystem, and the underlying filesystem is
>> ext4, which supports large folio. We mmap the file with PROT_READ prot,
>> and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
>> fails and returns EINVAL.
>>
>> The reason is that the mapping address isn't aligned to PMD size. Since
>> overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
>> thp_get_unmapped_area() to get a THP aligned address.
>>
>> To fix it, call get_unmapped_area() with the realfile.
> Isn't the correct solution to get overlayfs to support large folios?
>
>> Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
>> export get_unmapped_area().
> Yeah, not in favour of this at all. This is an internal implementation
> detail. It seems like you're trying to hack your way into avoiding
> providing support for large folios and to hand it off to the underlying
> file system.
>
> Again, why don't you just support large folios in overlayfs?
>
> Literally no other file system or driver appears to make use of this
> directly in this manner.
>
> And there's absolutely no way this should be exported non-GPL as if it were
> unavoidable core functionality that everyone needs. Only you seem to...
>
>> Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
>> ---
>>   fs/overlayfs/file.c | 20 ++++++++++++++++++++
>>   mm/mmap.c           |  1 +
>>   2 files changed, 21 insertions(+)
>>
>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>> index 969b458100fe..d0dcf675ebe8 100644
>> --- a/fs/overlayfs/file.c
>> +++ b/fs/overlayfs/file.c
>> @@ -653,6 +653,25 @@ static int ovl_flush(struct file *file, fl_owner_t id)
>>   	return err;
>>   }
>>
>> +static unsigned long ovl_get_unmapped_area(struct file *file,
>> +		unsigned long addr, unsigned long len, unsigned long pgoff,
>> +		unsigned long flags)
>> +{
>> +	struct file *realfile;
>> +	const struct cred *old_cred;
>> +	unsigned long ret;
>> +
>> +	realfile = ovl_real_file(file);
>> +	if (IS_ERR(realfile))
>> +		return PTR_ERR(realfile);
>> +
>> +	old_cred = ovl_override_creds(file_inode(file)->i_sb);
>> +	ret = get_unmapped_area(realfile, addr, len, pgoff, flags);
>> +	ovl_revert_creds(old_cred);
> Why are you overriding credentials, then reinstating them here? That
> seems... iffy? I knew nothing about overlayfs so this may just be a
> misunderstanding...

I refer to other file operations in overlayfs (i.e., ovl_fallocate, backing_file_mmap).
Since get_unmapped_area() has security related operations (e.g., security_mmap_addr()),
We should call it with the cred of the underlying file.

>
>> +
>> +	return ret;
>> +}
>> +
>>   const struct file_operations ovl_file_operations = {
>>   	.open		= ovl_open,
>>   	.release	= ovl_release,
>> @@ -661,6 +680,7 @@ const struct file_operations ovl_file_operations = {
>>   	.write_iter	= ovl_write_iter,
>>   	.fsync		= ovl_fsync,
>>   	.mmap		= ovl_mmap,
>> +	.get_unmapped_area = ovl_get_unmapped_area,
>>   	.fallocate	= ovl_fallocate,
>>   	.fadvise	= ovl_fadvise,
>>   	.flush		= ovl_flush,
>> diff --git a/mm/mmap.c b/mm/mmap.c
>> index 16f8e8be01f8..60eb1ff7c9a8 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -913,6 +913,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>>   	error = security_mmap_addr(addr);
>>   	return error ? error : addr;
>>   }
>> +EXPORT_SYMBOL(__get_unmapped_area);
> We'll need a VERY good reason to export this internal implementation
> detail, and if that were provided we'd need a VERY good reason for it not
> to be GPL.
>
> This just seems to be a cheap way of invoking (),
> maybe, if it is being used by the underlying file system.

But the underlying file system may not support large folio. In this case,
the mmap address doesn't need to be aligned with THP size.

>
> And again... why not just add large folio support? We can't just take a
> hack here.
>
>>   unsigned long
>>   mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
>> --
>> 2.34.1
>>
Jinjiang Tu Dec. 6, 2024, 3:35 a.m. UTC | #6
在 2024/12/5 23:24, Lorenzo Stoakes 写道:
> (fixing typo in cc list: tujinjiang@huawe.com -> tujinjiang@huawei.com)
>
> + Liam
>
> (JinJiang - you forgot to cc the correct maintainers, please ensure you run
> scripts/get_maintainers.pl on files you change)
>
> On Thu, Dec 05, 2024 at 04:12:12PM +0100, Amir Goldstein wrote:
>> On Thu, Dec 5, 2024 at 4:04 PM Lorenzo Stoakes
>> <lorenzo.stoakes@oracle.com> wrote:
>>> + Matthew for large folio aspect
>>>
>>> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
>>>> During our tests in containers, there is a read-only file (i.e., shared
>>>> libraies) in the overlayfs filesystem, and the underlying filesystem is
>>>> ext4, which supports large folio. We mmap the file with PROT_READ prot,
>>>> and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
>>>> fails and returns EINVAL.
>>>>
>>>> The reason is that the mapping address isn't aligned to PMD size. Since
>>>> overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
>>>> thp_get_unmapped_area() to get a THP aligned address.
>>>>
>>>> To fix it, call get_unmapped_area() with the realfile.
>>> Isn't the correct solution to get overlayfs to support large folios?
>>>
>>>> Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
>>>> export get_unmapped_area().
>>> Yeah, not in favour of this at all. This is an internal implementation
>>> detail. It seems like you're trying to hack your way into avoiding
>>> providing support for large folios and to hand it off to the underlying
>>> file system.
>>>
>>> Again, why don't you just support large folios in overlayfs?
>>>
>> This whole discussion seems moot.
>> overlayfs does not have address_space operations
>> It does not have its own page cache.
> And here we see my total lack of knowledge of overlayfs coming into play
> here :) Thanks for pointing this out.
>
> In that case, I object even further to the original of course...
>
>> The file in  vma->vm_file is not an overlayfs file at all - it is the
>> real (e.g. ext4) file
>> when returning from ovl_mmap() => backing_file_mmap()
>> so I have very little clue why the proposed solution even works,
>> but it certainly does not look correct.
> I think then Jinjiang in this cause you ought to go back to the drawing
> board and reconsider what might be the underlying issue here.

When usespace calls mmap syscall, the call trace is as follows:

do_mmap
   __get_unmapped_area
   mmap_region
     mmap_file
       ovl_mmap

__get_unmapped_area() gets the address to mmap at, the file here is an overlayfs file.
Since ovl_file_operations doesn't defines get_unmapped_area callback, __get_unmapped_area()
fallbacks to mm_get_unmapped_area_vmflags(), and it doesn't return an address aligned to
large folio size.

>
>> Thanks,
>> Amir.
> Cheers, Lorenzo
>
Lorenzo Stoakes Dec. 6, 2024, 9:25 a.m. UTC | #7
To be clear - I'm not accepting the export of __get_unmapped_area() so if
you depend on this for this approach, you can't take this approach.

It's an internal implementation detail. That you choose to make your
filesystem possibly a module doesn't mean that mm is required to export
internal impl details to you. Sorry.

To rescind this would require a very strong argument, you have not provided
it.

On Fri, Dec 06, 2024 at 11:35:08AM +0800, Jinjiang Tu wrote:
>
> 在 2024/12/5 23:04, Lorenzo Stoakes 写道:
> > + Matthew for large folio aspect
> >
> > On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> > > During our tests in containers, there is a read-only file (i.e., shared
> > > libraies) in the overlayfs filesystem, and the underlying filesystem is
> > > ext4, which supports large folio. We mmap the file with PROT_READ prot,
> > > and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
> > > fails and returns EINVAL.
> > >
> > > The reason is that the mapping address isn't aligned to PMD size. Since
> > > overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
> > > thp_get_unmapped_area() to get a THP aligned address.
> > >
> > > To fix it, call get_unmapped_area() with the realfile.
> > Isn't the correct solution to get overlayfs to support large folios?
> >
> > > Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
> > > export get_unmapped_area().
> > Yeah, not in favour of this at all. This is an internal implementation
> > detail. It seems like you're trying to hack your way into avoiding
> > providing support for large folios and to hand it off to the underlying
> > file system.
> >
> > Again, why don't you just support large folios in overlayfs?
> >
> > Literally no other file system or driver appears to make use of this
> > directly in this manner.
> >
> > And there's absolutely no way this should be exported non-GPL as if it were
> > unavoidable core functionality that everyone needs. Only you seem to...
> >
> > > Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
> > > ---
> > >   fs/overlayfs/file.c | 20 ++++++++++++++++++++
> > >   mm/mmap.c           |  1 +
> > >   2 files changed, 21 insertions(+)
> > >
> > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > > index 969b458100fe..d0dcf675ebe8 100644
> > > --- a/fs/overlayfs/file.c
> > > +++ b/fs/overlayfs/file.c
> > > @@ -653,6 +653,25 @@ static int ovl_flush(struct file *file, fl_owner_t id)
> > >   	return err;
> > >   }
> > >
> > > +static unsigned long ovl_get_unmapped_area(struct file *file,
> > > +		unsigned long addr, unsigned long len, unsigned long pgoff,
> > > +		unsigned long flags)
> > > +{
> > > +	struct file *realfile;
> > > +	const struct cred *old_cred;
> > > +	unsigned long ret;
> > > +
> > > +	realfile = ovl_real_file(file);
> > > +	if (IS_ERR(realfile))
> > > +		return PTR_ERR(realfile);
> > > +
> > > +	old_cred = ovl_override_creds(file_inode(file)->i_sb);
> > > +	ret = get_unmapped_area(realfile, addr, len, pgoff, flags);
> > > +	ovl_revert_creds(old_cred);
> > Why are you overriding credentials, then reinstating them here? That
> > seems... iffy? I knew nothing about overlayfs so this may just be a
> > misunderstanding...
>
> I refer to other file operations in overlayfs (i.e., ovl_fallocate, backing_file_mmap).
> Since get_unmapped_area() has security related operations (e.g., security_mmap_addr()),
> We should call it with the cred of the underlying file.
>
> >
> > > +
> > > +	return ret;
> > > +}
> > > +
> > >   const struct file_operations ovl_file_operations = {
> > >   	.open		= ovl_open,
> > >   	.release	= ovl_release,
> > > @@ -661,6 +680,7 @@ const struct file_operations ovl_file_operations = {
> > >   	.write_iter	= ovl_write_iter,
> > >   	.fsync		= ovl_fsync,
> > >   	.mmap		= ovl_mmap,
> > > +	.get_unmapped_area = ovl_get_unmapped_area,
> > >   	.fallocate	= ovl_fallocate,
> > >   	.fadvise	= ovl_fadvise,
> > >   	.flush		= ovl_flush,
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 16f8e8be01f8..60eb1ff7c9a8 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -913,6 +913,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
> > >   	error = security_mmap_addr(addr);
> > >   	return error ? error : addr;
> > >   }
> > > +EXPORT_SYMBOL(__get_unmapped_area);
> > We'll need a VERY good reason to export this internal implementation
> > detail, and if that were provided we'd need a VERY good reason for it not
> > to be GPL.
> >
> > This just seems to be a cheap way of invoking (),
> > maybe, if it is being used by the underlying file system.
>
> But the underlying file system may not support large folio. In this case,
> the mmap address doesn't need to be aligned with THP size.

But it'd not cause any problems to just do that anyway right? I don't think
many people think 'oh no I have a PMD aligned mapping now what will I do'?

Again - the right solution here is to handle large folios in overlayfs as
far as I can tell.

In any case as per the above, we're just not exporting
__get_unmapped_area(), sorry.

>
> >
> > And again... why not just add large folio support? We can't just take a
> > hack here.
> >
> > >   unsigned long
> > >   mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
> > > --
> > > 2.34.1
> > >
Kefeng Wang Dec. 6, 2024, 10:45 a.m. UTC | #8
On 2024/12/6 17:25, Lorenzo Stoakes wrote:
> To be clear - I'm not accepting the export of __get_unmapped_area() so if
> you depend on this for this approach, you can't take this approach.
> 
> It's an internal implementation detail. That you choose to make your
> filesystem possibly a module doesn't mean that mm is required to export
> internal impl details to you. Sorry.
> 
> To rescind this would require a very strong argument, you have not provided
> it.
> 
> On Fri, Dec 06, 2024 at 11:35:08AM +0800, Jinjiang Tu wrote:
>>
>> 在 2024/12/5 23:04, Lorenzo Stoakes 写道:
>>> + Matthew for large folio aspect
>>>
>>> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
>>>> During our tests in containers, there is a read-only file (i.e., shared
>>>> libraies) in the overlayfs filesystem, and the underlying filesystem is
>>>> ext4, which supports large folio. We mmap the file with PROT_READ prot,
>>>> and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
>>>> fails and returns EINVAL.
>>>>
>>>> The reason is that the mapping address isn't aligned to PMD size. Since
>>>> overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
>>>> thp_get_unmapped_area() to get a THP aligned address.
>>>>
>>>> To fix it, call get_unmapped_area() with the realfile.
>>> Isn't the correct solution to get overlayfs to support large folios?
>>>
>>>> Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
>>>> export get_unmapped_area().
>>> Yeah, not in favour of this at all. This is an internal implementation
>>> detail. It seems like you're trying to hack your way into avoiding
>>> providing support for large folios and to hand it off to the underlying
>>> file system.
>>>
>>> Again, why don't you just support large folios in overlayfs?
>>>
>>> Literally no other file system or driver appears to make use of this
>>> directly in this manner.
>>>
>>> And there's absolutely no way this should be exported non-GPL as if it were
>>> unavoidable core functionality that everyone needs. Only you seem to...
>>>
>>>> Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
>>>> ---
>>>>    fs/overlayfs/file.c | 20 ++++++++++++++++++++
>>>>    mm/mmap.c           |  1 +
>>>>    2 files changed, 21 insertions(+)
>>>>
>>>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>>>> index 969b458100fe..d0dcf675ebe8 100644
>>>> --- a/fs/overlayfs/file.c
>>>> +++ b/fs/overlayfs/file.c
>>>> @@ -653,6 +653,25 @@ static int ovl_flush(struct file *file, fl_owner_t id)
>>>>    	return err;
>>>>    }
>>>>
>>>> +static unsigned long ovl_get_unmapped_area(struct file *file,
>>>> +		unsigned long addr, unsigned long len, unsigned long pgoff,
>>>> +		unsigned long flags)
>>>> +{
>>>> +	struct file *realfile;
>>>> +	const struct cred *old_cred;
>>>> +	unsigned long ret;
>>>> +
>>>> +	realfile = ovl_real_file(file);
>>>> +	if (IS_ERR(realfile))
>>>> +		return PTR_ERR(realfile);
>>>> +
>>>> +	old_cred = ovl_override_creds(file_inode(file)->i_sb);
>>>> +	ret = get_unmapped_area(realfile, addr, len, pgoff, flags);
>>>> +	ovl_revert_creds(old_cred);
>>> Why are you overriding credentials, then reinstating them here? That
>>> seems... iffy? I knew nothing about overlayfs so this may just be a
>>> misunderstanding...
>>
>> I refer to other file operations in overlayfs (i.e., ovl_fallocate, backing_file_mmap).
>> Since get_unmapped_area() has security related operations (e.g., security_mmap_addr()),
>> We should call it with the cred of the underlying file.
>>
>>>
>>>> +
>>>> +	return ret;
>>>> +}
>>>> +
>>>>    const struct file_operations ovl_file_operations = {
>>>>    	.open		= ovl_open,
>>>>    	.release	= ovl_release,
>>>> @@ -661,6 +680,7 @@ const struct file_operations ovl_file_operations = {
>>>>    	.write_iter	= ovl_write_iter,
>>>>    	.fsync		= ovl_fsync,
>>>>    	.mmap		= ovl_mmap,
>>>> +	.get_unmapped_area = ovl_get_unmapped_area,
>>>>    	.fallocate	= ovl_fallocate,
>>>>    	.fadvise	= ovl_fadvise,
>>>>    	.flush		= ovl_flush,
>>>> diff --git a/mm/mmap.c b/mm/mmap.c
>>>> index 16f8e8be01f8..60eb1ff7c9a8 100644
>>>> --- a/mm/mmap.c
>>>> +++ b/mm/mmap.c
>>>> @@ -913,6 +913,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>>>>    	error = security_mmap_addr(addr);
>>>>    	return error ? error : addr;
>>>>    }
>>>> +EXPORT_SYMBOL(__get_unmapped_area);
>>> We'll need a VERY good reason to export this internal implementation
>>> detail, and if that were provided we'd need a VERY good reason for it not
>>> to be GPL.
>>>
>>> This just seems to be a cheap way of invoking (),
>>> maybe, if it is being used by the underlying file system.
>>
>> But the underlying file system may not support large folio. In this case,
>> the mmap address doesn't need to be aligned with THP size.
> 
> But it'd not cause any problems to just do that anyway right? I don't think
> many people think 'oh no I have a PMD aligned mapping now what will I do'?
> 
> Again - the right solution here is to handle large folios in overlayfs as
> far as I can tell.

I think this is not to handle large folio for overlayfs, it is about vma
alignment or vma allocation for memory mapped files,


1) many fs support THP mapping, using thp_get_unmapped_area(),

fs/bcachefs/fs.c:       .get_unmapped_area = thp_get_unmapped_area,
fs/btrfs/file.c:        .get_unmapped_area = thp_get_unmapped_area,
fs/erofs/data.c:        .get_unmapped_area = thp_get_unmapped_area,
fs/ext2/file.c: .get_unmapped_area = thp_get_unmapped_area,
fs/ext4/file.c: .get_unmapped_area = thp_get_unmapped_area,
fs/fuse/file.c: .get_unmapped_area = thp_get_unmapped_area,
fs/xfs/xfs_file.c:      .get_unmapped_area = thp_get_unmapped_area,

2) and some fs has own get_unmapped_area callback too,

fs/cramfs/inode.c:	.get_unmapped_area	= cramfs_physmem_get_unmapped_area,
fs/hugetlbfs/inode.c:	.get_unmapped_area	= hugetlb_get_unmapped_area,
fs/ramfs/file-mmu.c:	.get_unmapped_area	= ramfs_mmu_get_unmapped_area,
fs/ramfs/file-nommu.c:	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
fs/romfs/mmap-nommu.c:	.get_unmapped_area	= romfs_get_unmapped_area,
mm/shmem.c:	.get_unmapped_area = shmem_get_unmapped_area,

They has own rules to get a vma area, but with overlayfs(tries to
present a filesystem which is the result over overlaying one filesystem
on top of the other), we now only use the default
mm_get_unmapped_area_vmflags() to get a vma area, since the overlayfs
has no '.get_unmapped_area' callback.

do_mmap
   __get_unmapped_area
     // get_area = NULL
     mm_get_unmapped_area_vmflags
   mmap_region
     mmap_file
       ovl_mmap

It looks wrong, so we need to get the readfile via ovl_real_file()
and use realfile' get_unmapped_area callback, and if the realfile
is not with the callback, fallback to the default
mm_get_unmapped_area(),

> 
> In any case as per the above, we're just not exporting
> __get_unmapped_area(), sorry.
> 

So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
something like below,

+static unsigned long ovl_get_unmapped_area(struct file *file,
+               unsigned long addr, unsigned long len, unsigned long pgoff,
+               unsigned long flags)
+{
+       struct file *realfile;
+       const struct cred *old_cred;
+
+       realfile = ovl_real_file(file);
+       if (IS_ERR(realfile))
+               return PTR_ERR(realfile);
+
+       if (realfile->f_op->get_unmapped_area) {
+               unsigned long ret;
+
+               old_cred = ovl_override_creds(file_inode(file)->i_sb);
+               ret = realfile->f_op->get_unmapped_area(realfile, addr, len,
+                                                       pgoff, flags);
+               ovl_revert_creds(old_cred);
+
+               if (ret)
+                       return ret;
+       }
+
+       return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, 
flags);
+}

Correct me If I'm wrong.

Thanks.


>>
>>>
>>> And again... why not just add large folio support? We can't just take a
>>> hack here.
>>>
>>>>    unsigned long
>>>>    mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
>>>> --
>>>> 2.34.1
>>>>
>
Lorenzo Stoakes Dec. 6, 2024, 11:53 a.m. UTC | #9
On Fri, Dec 06, 2024 at 06:45:11PM +0800, Kefeng Wang wrote:
> So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
> something like below,
>
> +static unsigned long ovl_get_unmapped_area(struct file *file,
> +               unsigned long addr, unsigned long len, unsigned long pgoff,
> +               unsigned long flags)
> +{
> +       struct file *realfile;
> +       const struct cred *old_cred;
> +
> +       realfile = ovl_real_file(file);
> +       if (IS_ERR(realfile))
> +               return PTR_ERR(realfile);
> +
> +       if (realfile->f_op->get_unmapped_area) {
> +               unsigned long ret;
> +
> +               old_cred = ovl_override_creds(file_inode(file)->i_sb);
> +               ret = realfile->f_op->get_unmapped_area(realfile, addr, len,
> +                                                       pgoff, flags);
> +               ovl_revert_creds(old_cred);

Credentials stuff necessary now you're not having security_mmap_addr()
called etc.?

> +
> +               if (ret)
> +                       return ret;

Surely you'd unconditionally return in this case? I don't think there's any case
where you'd want to fall through.

> +       }
> +
> +       return mm_get_unmapped_area(current->mm, file, addr, len, pgoff,
> flags);
> +}
>
> Correct me If I'm wrong.
>
> Thanks.
>

I mean this doesn't export anything we don't want exported so this is fine
from that perspective :)

And I guess in principle this is OK in that __get_unmapped_area() would be
invoked on the overlay file, will do the required arch_mmap_check(), then
will invoke your overlay handler.

I did think of suggesting invoking the f_op directly, but it feels icky
vs. just supporting large folios.

But actually... hm I realise I overlooked the fact that underlying _files_
will always provide a large folio-aware handler.

I'm guessing you can't use overlayfs somehow with a MAP_ANON | MAP_SHARED
mapping or similar, thinking of:

	if (file) {
		...
	} else if (flags & MAP_SHARED) {
		/*
		 * mmap_region() will call shmem_zero_setup() to create a file,
		 * so use shmem's get_unmapped_area in case it can be huge.
		 */
		get_area = shmem_get_unmapped_area;
	}

But surely actually any case that works with overlayfs will have a file and
so... yeah.

Hm, I actually think this should work.

Can you make sure to do some pretty thorough testing on this just to make
sure you're not hitting on any weirdness?
Amir Goldstein Dec. 6, 2024, 12:58 p.m. UTC | #10
On Fri, Dec 6, 2024 at 11:45 AM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
>
>
>
> On 2024/12/6 17:25, Lorenzo Stoakes wrote:
> > To be clear - I'm not accepting the export of __get_unmapped_area() so if
> > you depend on this for this approach, you can't take this approach.
> >
> > It's an internal implementation detail. That you choose to make your
> > filesystem possibly a module doesn't mean that mm is required to export
> > internal impl details to you. Sorry.
> >
> > To rescind this would require a very strong argument, you have not provided
> > it.
> >
> > On Fri, Dec 06, 2024 at 11:35:08AM +0800, Jinjiang Tu wrote:
> >>
> >> 在 2024/12/5 23:04, Lorenzo Stoakes 写道:
> >>> + Matthew for large folio aspect
> >>>
> >>> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> >>>> During our tests in containers, there is a read-only file (i.e., shared
> >>>> libraies) in the overlayfs filesystem, and the underlying filesystem is
> >>>> ext4, which supports large folio. We mmap the file with PROT_READ prot,
> >>>> and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
> >>>> fails and returns EINVAL.
> >>>>
> >>>> The reason is that the mapping address isn't aligned to PMD size. Since
> >>>> overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
> >>>> thp_get_unmapped_area() to get a THP aligned address.
> >>>>
> >>>> To fix it, call get_unmapped_area() with the realfile.
> >>> Isn't the correct solution to get overlayfs to support large folios?
> >>>
> >>>> Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
> >>>> export get_unmapped_area().
> >>> Yeah, not in favour of this at all. This is an internal implementation
> >>> detail. It seems like you're trying to hack your way into avoiding
> >>> providing support for large folios and to hand it off to the underlying
> >>> file system.
> >>>
> >>> Again, why don't you just support large folios in overlayfs?
> >>>
> >>> Literally no other file system or driver appears to make use of this
> >>> directly in this manner.
> >>>
> >>> And there's absolutely no way this should be exported non-GPL as if it were
> >>> unavoidable core functionality that everyone needs. Only you seem to...
> >>>
> >>>> Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
> >>>> ---
> >>>>    fs/overlayfs/file.c | 20 ++++++++++++++++++++
> >>>>    mm/mmap.c           |  1 +
> >>>>    2 files changed, 21 insertions(+)
> >>>>
> >>>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> >>>> index 969b458100fe..d0dcf675ebe8 100644
> >>>> --- a/fs/overlayfs/file.c
> >>>> +++ b/fs/overlayfs/file.c
> >>>> @@ -653,6 +653,25 @@ static int ovl_flush(struct file *file, fl_owner_t id)
> >>>>            return err;
> >>>>    }
> >>>>
> >>>> +static unsigned long ovl_get_unmapped_area(struct file *file,
> >>>> +          unsigned long addr, unsigned long len, unsigned long pgoff,
> >>>> +          unsigned long flags)
> >>>> +{
> >>>> +  struct file *realfile;
> >>>> +  const struct cred *old_cred;
> >>>> +  unsigned long ret;
> >>>> +
> >>>> +  realfile = ovl_real_file(file);
> >>>> +  if (IS_ERR(realfile))
> >>>> +          return PTR_ERR(realfile);
> >>>> +
> >>>> +  old_cred = ovl_override_creds(file_inode(file)->i_sb);
> >>>> +  ret = get_unmapped_area(realfile, addr, len, pgoff, flags);
> >>>> +  ovl_revert_creds(old_cred);
> >>> Why are you overriding credentials, then reinstating them here? That
> >>> seems... iffy? I knew nothing about overlayfs so this may just be a
> >>> misunderstanding...
> >>
> >> I refer to other file operations in overlayfs (i.e., ovl_fallocate, backing_file_mmap).
> >> Since get_unmapped_area() has security related operations (e.g., security_mmap_addr()),
> >> We should call it with the cred of the underlying file.
> >>
> >>>
> >>>> +
> >>>> +  return ret;
> >>>> +}
> >>>> +
> >>>>    const struct file_operations ovl_file_operations = {
> >>>>            .open           = ovl_open,
> >>>>            .release        = ovl_release,
> >>>> @@ -661,6 +680,7 @@ const struct file_operations ovl_file_operations = {
> >>>>            .write_iter     = ovl_write_iter,
> >>>>            .fsync          = ovl_fsync,
> >>>>            .mmap           = ovl_mmap,
> >>>> +  .get_unmapped_area = ovl_get_unmapped_area,
> >>>>            .fallocate      = ovl_fallocate,
> >>>>            .fadvise        = ovl_fadvise,
> >>>>            .flush          = ovl_flush,
> >>>> diff --git a/mm/mmap.c b/mm/mmap.c
> >>>> index 16f8e8be01f8..60eb1ff7c9a8 100644
> >>>> --- a/mm/mmap.c
> >>>> +++ b/mm/mmap.c
> >>>> @@ -913,6 +913,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
> >>>>            error = security_mmap_addr(addr);
> >>>>            return error ? error : addr;
> >>>>    }
> >>>> +EXPORT_SYMBOL(__get_unmapped_area);
> >>> We'll need a VERY good reason to export this internal implementation
> >>> detail, and if that were provided we'd need a VERY good reason for it not
> >>> to be GPL.
> >>>
> >>> This just seems to be a cheap way of invoking (),
> >>> maybe, if it is being used by the underlying file system.
> >>
> >> But the underlying file system may not support large folio. In this case,
> >> the mmap address doesn't need to be aligned with THP size.
> >
> > But it'd not cause any problems to just do that anyway right? I don't think
> > many people think 'oh no I have a PMD aligned mapping now what will I do'?
> >
> > Again - the right solution here is to handle large folios in overlayfs as
> > far as I can tell.
>
> I think this is not to handle large folio for overlayfs, it is about vma
> alignment or vma allocation for memory mapped files,
>
>
> 1) many fs support THP mapping, using thp_get_unmapped_area(),
>
> fs/bcachefs/fs.c:       .get_unmapped_area = thp_get_unmapped_area,
> fs/btrfs/file.c:        .get_unmapped_area = thp_get_unmapped_area,
> fs/erofs/data.c:        .get_unmapped_area = thp_get_unmapped_area,
> fs/ext2/file.c: .get_unmapped_area = thp_get_unmapped_area,
> fs/ext4/file.c: .get_unmapped_area = thp_get_unmapped_area,
> fs/fuse/file.c: .get_unmapped_area = thp_get_unmapped_area,
> fs/xfs/xfs_file.c:      .get_unmapped_area = thp_get_unmapped_area,
>
> 2) and some fs has own get_unmapped_area callback too,
>
> fs/cramfs/inode.c:      .get_unmapped_area      = cramfs_physmem_get_unmapped_area,
> fs/hugetlbfs/inode.c:   .get_unmapped_area      = hugetlb_get_unmapped_area,
> fs/ramfs/file-mmu.c:    .get_unmapped_area      = ramfs_mmu_get_unmapped_area,
> fs/ramfs/file-nommu.c:  .get_unmapped_area      = ramfs_nommu_get_unmapped_area,
> fs/romfs/mmap-nommu.c:  .get_unmapped_area      = romfs_get_unmapped_area,
> mm/shmem.c:     .get_unmapped_area = shmem_get_unmapped_area,
>
> They has own rules to get a vma area, but with overlayfs(tries to
> present a filesystem which is the result over overlaying one filesystem
> on top of the other), we now only use the default
> mm_get_unmapped_area_vmflags() to get a vma area, since the overlayfs
> has no '.get_unmapped_area' callback.
>
> do_mmap
>    __get_unmapped_area
>      // get_area = NULL
>      mm_get_unmapped_area_vmflags
>    mmap_region
>      mmap_file
>        ovl_mmap
>
> It looks wrong, so we need to get the readfile via ovl_real_file()
> and use realfile' get_unmapped_area callback, and if the realfile
> is not with the callback, fallback to the default
> mm_get_unmapped_area(),
>
> >
> > In any case as per the above, we're just not exporting
> > __get_unmapped_area(), sorry.
> >
>
> So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
> something like below,
>
> +static unsigned long ovl_get_unmapped_area(struct file *file,
> +               unsigned long addr, unsigned long len, unsigned long pgoff,
> +               unsigned long flags)
> +{
> +       struct file *realfile;
> +       const struct cred *old_cred;
> +
> +       realfile = ovl_real_file(file);
> +       if (IS_ERR(realfile))
> +               return PTR_ERR(realfile);
> +
> +       if (realfile->f_op->get_unmapped_area) {
> +               unsigned long ret;
> +
> +               old_cred = ovl_override_creds(file_inode(file)->i_sb);
> +               ret = realfile->f_op->get_unmapped_area(realfile, addr, len,
> +                                                       pgoff, flags);
> +               ovl_revert_creds(old_cred);
> +
> +               if (ret)
> +                       return ret;
> +       }
> +
> +       return mm_get_unmapped_area(current->mm, file, addr, len, pgoff,
> flags);
> +}
>
> Correct me If I'm wrong.
>

You just need to be aware of the fact that between ovl_get_unmapped_area()
and ovl_mmap(), ovl_real_file(file) could change from the lower file, to the
upper file due to another operation that initiated copy-up.

Thanks,
Amir.
Matthew Wilcox (Oracle) Dec. 6, 2024, 1:52 p.m. UTC | #11
On Fri, Dec 06, 2024 at 11:35:20AM +0800, Jinjiang Tu wrote:
> When usespace calls mmap syscall, the call trace is as follows:
> 
> do_mmap
>   __get_unmapped_area
>   mmap_region
>     mmap_file
>       ovl_mmap
> 
> __get_unmapped_area() gets the address to mmap at, the file here is an overlayfs file.
> Since ovl_file_operations doesn't defines get_unmapped_area callback, __get_unmapped_area()
> fallbacks to mm_get_unmapped_area_vmflags(), and it doesn't return an address aligned to
> large folio size.

It doesn't need to.  large folios can be mapped at any alignment.
The get_unmapped_area overrides are just for efficiency (ie be
able to use PMD mappings when we happen to get a PMD-sized folio).
Matthew Wilcox (Oracle) Dec. 6, 2024, 1:58 p.m. UTC | #12
On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> During our tests in containers, there is a read-only file (i.e., shared

Show your test.
Jinjiang Tu Dec. 9, 2024, 6:43 a.m. UTC | #13
在 2024/12/6 21:58, Matthew Wilcox 写道:
> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
>> During our tests in containers, there is a read-only file (i.e., shared
> Show your test.

I mmap an overlayfs file with PROT_READ, and call madvise(MADV_COLLAPSE), the code
is as follows:

	fd = open(path, O_RDONLY);
	addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
	ret = madvise(addr, size, MADV_COLLAPSE);

The addr isn't THP-aligned and ret is -1, errno is EINVAL.

>
Matthew Wilcox (Oracle) Dec. 9, 2024, 1:33 p.m. UTC | #14
On Mon, Dec 09, 2024 at 02:43:08PM +0800, Jinjiang Tu wrote:
> 
> 在 2024/12/6 21:58, Matthew Wilcox 写道:
> > On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> > > During our tests in containers, there is a read-only file (i.e., shared
> > Show your test.
> 
> I mmap an overlayfs file with PROT_READ, and call madvise(MADV_COLLAPSE), the code
> is as follows:
> 
> 	fd = open(path, O_RDONLY);
> 	addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
> 	ret = madvise(addr, size, MADV_COLLAPSE);
> 
> The addr isn't THP-aligned and ret is -1, errno is EINVAL.

Then your test is buggy.

         * Check alignment for file vma and size for both file and anon vma by
         * filtering out the unsuitable orders.

You didn't align your mmap, so it's expected to fail.
Kefeng Wang Dec. 10, 2024, 7:09 a.m. UTC | #15
On 2024/12/6 19:53, Lorenzo Stoakes wrote:
> On Fri, Dec 06, 2024 at 06:45:11PM +0800, Kefeng Wang wrote:
>> So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
>> something like below,
>>
>> +static unsigned long ovl_get_unmapped_area(struct file *file,
>> +               unsigned long addr, unsigned long len, unsigned long pgoff,
>> +               unsigned long flags)
>> +{
>> +       struct file *realfile;
>> +       const struct cred *old_cred;
>> +
>> +       realfile = ovl_real_file(file);
>> +       if (IS_ERR(realfile))
>> +               return PTR_ERR(realfile);
>> +
>> +       if (realfile->f_op->get_unmapped_area) {
>> +               unsigned long ret;
>> +
>> +               old_cred = ovl_override_creds(file_inode(file)->i_sb);
>> +               ret = realfile->f_op->get_unmapped_area(realfile, addr, len,
>> +                                                       pgoff, flags);
>> +               ovl_revert_creds(old_cred);
> 
> Credentials stuff necessary now you're not having security_mmap_addr()
> called etc.?

Not familiar with ovl, but we convert from file to realfile, adding cred
to prevent potential problems. maybe Amir could help to confirm it  :)

> 
>> +
>> +               if (ret)
>> +                       return ret;
> 
> Surely you'd unconditionally return in this case? I don't think there's any case
> where you'd want to fall through.

Yes, should directly return.

> 
>> +       }
>> +
>> +       return mm_get_unmapped_area(current->mm, file, addr, len, pgoff,
>> flags);
>> +}
>>
>> Correct me If I'm wrong.
>>
>> Thanks.
>>
> 
> I mean this doesn't export anything we don't want exported so this is fine
> from that perspective :)
> 
> And I guess in principle this is OK in that __get_unmapped_area() would be
> invoked on the overlay file, will do the required arch_mmap_check(), then
> will invoke your overlay handler.
> 
> I did think of suggesting invoking the f_op directly, but it feels icky
> vs. just supporting large folios.
> 
> But actually... hm I realise I overlooked the fact that underlying _files_
> will always provide a large folio-aware handler.
> 
> I'm guessing you can't use overlayfs somehow with a MAP_ANON | MAP_SHARED
> mapping or similar, thinking of:
> 
> 	if (file) {
> 		...
> 	} else if (flags & MAP_SHARED) {
> 		/*
> 		 * mmap_region() will call shmem_zero_setup() to create a file,
> 		 * so use shmem's get_unmapped_area in case it can be huge.
> 		 */
> 		get_area = shmem_get_unmapped_area;
> 	}
> 
> But surely actually any case that works with overlayfs will have a file and
> so... yeah.
> 
> Hm, I actually think this should work.
> 
> Can you make sure to do some pretty thorough testing on this just to make
> sure you're not hitting on any weirdness?
> 

Great, I thin Jinjiang could continue to this work.
Jinjiang Tu Dec. 10, 2024, 7:13 a.m. UTC | #16
在 2024/12/9 21:33, Matthew Wilcox 写道:
> On Mon, Dec 09, 2024 at 02:43:08PM +0800, Jinjiang Tu wrote:
>> 在 2024/12/6 21:58, Matthew Wilcox 写道:
>>> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
>>>> During our tests in containers, there is a read-only file (i.e., shared
>>> Show your test.
>> I mmap an overlayfs file with PROT_READ, and call madvise(MADV_COLLAPSE), the code
>> is as follows:
>>
>> 	fd = open(path, O_RDONLY);
>> 	addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
>> 	ret = madvise(addr, size, MADV_COLLAPSE);
>>
>> The addr isn't THP-aligned and ret is -1, errno is EINVAL.
> Then your test is buggy.
>
>           * Check alignment for file vma and size for both file and anon vma by
>           * filtering out the unsuitable orders.
>
> You didn't align your mmap, so it's expected to fail.

When mmap an ext4 file, since ext4_file_operations defines ".get_unmapped_area = thp_get_unmapped_area", it calls thp_get_unmapped_area() in __get_unmapped_area to
mmap at a THP-aligned address.

For overlayfs file, it's underlying filesystem may be ext4, which supports large folio. For this situation, should we mmap at a THP-aligned address too, to map
THP?

Thanks.
Kefeng Wang Dec. 10, 2024, 7:19 a.m. UTC | #17
On 2024/12/6 20:58, Amir Goldstein wrote:
> On Fri, Dec 6, 2024 at 11:45 AM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
>>
...
>>
>> So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
>> something like below,
>>
>> +static unsigned long ovl_get_unmapped_area(struct file *file,
>> +               unsigned long addr, unsigned long len, unsigned long pgoff,
>> +               unsigned long flags)
>> +{
>> +       struct file *realfile;
>> +       const struct cred *old_cred;
>> +
>> +       realfile = ovl_real_file(file);
>> +       if (IS_ERR(realfile))
>> +               return PTR_ERR(realfile);
>> +
>> +       if (realfile->f_op->get_unmapped_area) {
>> +               unsigned long ret;
>> +
>> +               old_cred = ovl_override_creds(file_inode(file)->i_sb);
>> +               ret = realfile->f_op->get_unmapped_area(realfile, addr, len,
>> +                                                       pgoff, flags);
>> +               ovl_revert_creds(old_cred);
>> +
>> +               if (ret)
>> +                       return ret;
>> +       }
>> +
>> +       return mm_get_unmapped_area(current->mm, file, addr, len, pgoff,
>> flags);
>> +}
>>
>> Correct me If I'm wrong.
>>
> 
> You just need to be aware of the fact that between ovl_get_unmapped_area()
> and ovl_mmap(), ovl_real_file(file) could change from the lower file, to the
> upper file due to another operation that initiated copy-up.

Not sure about this part(I have very little knowledge of ovl), do you
mean that we could not use ovl_real_file()?  The ovl_mmap() using
realfile = file->private_data, we may use similar way in
ovl_get_unmapped_area(). but I may have misunderstood.

Thanks.
Jann Horn Dec. 10, 2024, 5:36 p.m. UTC | #18
On Thu, Dec 5, 2024 at 4:24 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
> (fixing typo in cc list: tujinjiang@huawe.com -> tujinjiang@huawei.com)
>
> + Liam
>
> (JinJiang - you forgot to cc the correct maintainers, please ensure you run
> scripts/get_maintainers.pl on files you change)
>
> On Thu, Dec 05, 2024 at 04:12:12PM +0100, Amir Goldstein wrote:
> > On Thu, Dec 5, 2024 at 4:04 PM Lorenzo Stoakes
> > <lorenzo.stoakes@oracle.com> wrote:
> > >
> > > + Matthew for large folio aspect
> > >
> > > On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> > > > During our tests in containers, there is a read-only file (i.e., shared
> > > > libraies) in the overlayfs filesystem, and the underlying filesystem is
> > > > ext4, which supports large folio. We mmap the file with PROT_READ prot,
> > > > and then call madvise(MADV_COLLAPSE) for it. However, the madvise call
> > > > fails and returns EINVAL.
> > > >
> > > > The reason is that the mapping address isn't aligned to PMD size. Since
> > > > overlayfs doesn't support large folio, __get_unmapped_area() doesn't call
> > > > thp_get_unmapped_area() to get a THP aligned address.
> > > >
> > > > To fix it, call get_unmapped_area() with the realfile.
> > >
> > > Isn't the correct solution to get overlayfs to support large folios?
> > >
> > > >
> > > > Besides, since overlayfs may be built with CONFIG_OVERLAY_FS=m, we should
> > > > export get_unmapped_area().
> > >
> > > Yeah, not in favour of this at all. This is an internal implementation
> > > detail. It seems like you're trying to hack your way into avoiding
> > > providing support for large folios and to hand it off to the underlying
> > > file system.
> > >
> > > Again, why don't you just support large folios in overlayfs?
> > >
> >
> > This whole discussion seems moot.
> > overlayfs does not have address_space operations
> > It does not have its own page cache.
>
> And here we see my total lack of knowledge of overlayfs coming into play
> here :) Thanks for pointing this out.
>
> In that case, I object even further to the original of course...
>
> >
> > The file in  vma->vm_file is not an overlayfs file at all - it is the
> > real (e.g. ext4) file
> > when returning from ovl_mmap() => backing_file_mmap()
> > so I have very little clue why the proposed solution even works,
> > but it certainly does not look correct.
>
> I think then Jinjiang in this cause you ought to go back to the drawing
> board and reconsider what might be the underlying issue here.

To summarize: overlayfs switches out the VMA's backing file in the
->mmap handler. ->get_unmapped_area has to be called on the original
file, before the VMA is set up (obviously), but the VMA's ->vm_file
can only be overridden once the overlayfs ->mmap handler is called. So
the ->get_unmapped_area you see early in the mmap path is provided by
overlayfs, while the VMA you have in the end is actually basically
just a VMA of the backing file that doesn't have much to do with the
original file.

So I guess some possible solutions would be that overlayfs forwards
the .get_unmapped_area to the backing file manually, or that the
->vm_file swapping mechanism is changed to use some new separate
file_operations handler for "I want to use another backing file" that
is called before the get_unmapped_area stuff? (But to be clear, I'm
not saying whether these are good ideas or not. Maybe Lorenzo has more
of an opinion on that than I do.)

By the way, I think FUSE is kinda similar, FUSE also has a
"passthrough" mode that uses backing_file_mmap(); FUSE also doesn't
have any special code in their .get_unmapped_area handler for this.
But FUSE's .get_unmapped_area is set to thp_get_unmapped_area, which I
guess the passthrough mode it is sorta wrong the other way around and
unnecessarily over-aligns even if the backing file can't do THP?
Matthew Wilcox (Oracle) Dec. 10, 2024, 5:58 p.m. UTC | #19
On Tue, Dec 10, 2024 at 03:13:21PM +0800, Jinjiang Tu wrote:
> 在 2024/12/9 21:33, Matthew Wilcox 写道:
> > On Mon, Dec 09, 2024 at 02:43:08PM +0800, Jinjiang Tu wrote:
> > > 在 2024/12/6 21:58, Matthew Wilcox 写道:
> > > > On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
> > > > > During our tests in containers, there is a read-only file (i.e., shared
> > > > Show your test.
> > > I mmap an overlayfs file with PROT_READ, and call madvise(MADV_COLLAPSE), the code
> > > is as follows:
> > > 
> > > 	fd = open(path, O_RDONLY);
> > > 	addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
> > > 	ret = madvise(addr, size, MADV_COLLAPSE);
> > > 
> > > The addr isn't THP-aligned and ret is -1, errno is EINVAL.
> > Then your test is buggy.
> > 
> >           * Check alignment for file vma and size for both file and anon vma by
> >           * filtering out the unsuitable orders.
> > 
> > You didn't align your mmap, so it's expected to fail.
> 
> When mmap an ext4 file, since ext4_file_operations defines ".get_unmapped_area = thp_get_unmapped_area", it calls thp_get_unmapped_area() in __get_unmapped_area to
> mmap at a THP-aligned address.
> 
> For overlayfs file, it's underlying filesystem may be ext4, which supports large folio. For this situation, should we mmap at a THP-aligned address too, to map
> THP?

Actually, ext4 doesn't support large folios.  thp_get_unmapped_area()
was added for DAX in 2016 (dbe6ec815641).
Zhang Yi Dec. 11, 2024, 2:21 a.m. UTC | #20
On 2024/12/11 1:58, Matthew Wilcox wrote:
> On Tue, Dec 10, 2024 at 03:13:21PM +0800, Jinjiang Tu wrote:
>> 在 2024/12/9 21:33, Matthew Wilcox 写道:
>>> On Mon, Dec 09, 2024 at 02:43:08PM +0800, Jinjiang Tu wrote:
>>>> 在 2024/12/6 21:58, Matthew Wilcox 写道:
>>>>> On Thu, Dec 05, 2024 at 10:30:38PM +0800, Jinjiang Tu wrote:
>>>>>> During our tests in containers, there is a read-only file (i.e., shared
>>>>> Show your test.
>>>> I mmap an overlayfs file with PROT_READ, and call madvise(MADV_COLLAPSE), the code
>>>> is as follows:
>>>>
>>>> 	fd = open(path, O_RDONLY);
>>>> 	addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
>>>> 	ret = madvise(addr, size, MADV_COLLAPSE);
>>>>
>>>> The addr isn't THP-aligned and ret is -1, errno is EINVAL.
>>> Then your test is buggy.
>>>
>>>           * Check alignment for file vma and size for both file and anon vma by
>>>           * filtering out the unsuitable orders.
>>>
>>> You didn't align your mmap, so it's expected to fail.
>>
>> When mmap an ext4 file, since ext4_file_operations defines ".get_unmapped_area = thp_get_unmapped_area", it calls thp_get_unmapped_area() in __get_unmapped_area to
>> mmap at a THP-aligned address.
>>
>> For overlayfs file, it's underlying filesystem may be ext4, which supports large folio. For this situation, should we mmap at a THP-aligned address too, to map
>> THP?
> 
> Actually, ext4 doesn't support large folios.
> 

I have sent out two series to enable ext4 support for large folios.
Feel free to give it a try.

https://lore.kernel.org/linux-ext4/20241022111059.2566137-1-yi.zhang@huaweicloud.com/
https://lore.kernel.org/linux-ext4/20241125114419.903270-1-yi.zhang@huaweicloud.com/

Thanks,
Yi.
Amir Goldstein Dec. 11, 2024, 9:43 a.m. UTC | #21
On Tue, Dec 10, 2024 at 8:19 AM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
>
>
>
> On 2024/12/6 20:58, Amir Goldstein wrote:
> > On Fri, Dec 6, 2024 at 11:45 AM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
> >>
> ...
> >>
> >> So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
> >> something like below,
> >>
> >> +static unsigned long ovl_get_unmapped_area(struct file *file,
> >> +               unsigned long addr, unsigned long len, unsigned long pgoff,
> >> +               unsigned long flags)
> >> +{
> >> +       struct file *realfile;
> >> +       const struct cred *old_cred;
> >> +
> >> +       realfile = ovl_real_file(file);
> >> +       if (IS_ERR(realfile))
> >> +               return PTR_ERR(realfile);
> >> +
> >> +       if (realfile->f_op->get_unmapped_area) {
> >> +               unsigned long ret;
> >> +
> >> +               old_cred = ovl_override_creds(file_inode(file)->i_sb);
> >> +               ret = realfile->f_op->get_unmapped_area(realfile, addr, len,
> >> +                                                       pgoff, flags);
> >> +               ovl_revert_creds(old_cred);
> >> +
> >> +               if (ret)
> >> +                       return ret;
> >> +       }
> >> +
> >> +       return mm_get_unmapped_area(current->mm, file, addr, len, pgoff,
> >> flags);
> >> +}
> >>
> >> Correct me If I'm wrong.
> >>
> >
> > You just need to be aware of the fact that between ovl_get_unmapped_area()
> > and ovl_mmap(), ovl_real_file(file) could change from the lower file, to the
> > upper file due to another operation that initiated copy-up.
>
> Not sure about this part(I have very little knowledge of ovl), do you
> mean that we could not use ovl_real_file()?  The ovl_mmap() using
> realfile = file->private_data, we may use similar way in
> ovl_get_unmapped_area(). but I may have misunderstood.
>

First of all, you may add to your patch:
Acked-by: Amir Goldstein <amir73il@gmail.com>

I think this patch is fine as is.
w.r.t. question about ovl_override_creds(), I think it is good practice to
user mounter credentials when calling into real fs methods, regardless
of the fact that in most cases known today the ->get_unmapped_area()
methods do not check credentials.

My comment was referring to the fact that ovl_real_file(file), when called
two subsequent times in a row (once from ovl_get_unmapped_area() and
then again from ovl_mmap()) may not return the same realfile.

This is because during the lifetime of an overlayfs file/inode, its realinode/
realfile can change once, in the event known as "copy-up", so you may
start by calling ovl_get_unmapped_area() on a lower ext4 realfile and then end
up actually mapping an upper tmpfs realfile, because someone has opened
the overlayfs file for write in the meanwhile.

I guess in this corner case, the alignment may be wrong, or just too strict for
the actual mapping, but it is not critical, so just FYI.
There are worse issues with mmap of overlayfs file documented in:
https://docs.kernel.org/filesystems/overlayfs.html#non-standard-behavior
"If a file residing on a lower layer is opened for read-only and then
memory mapped
 with MAP_SHARED, then subsequent changes to the file are not reflected in the
 memory mapping."

Thanks,
Amir.
Kefeng Wang Dec. 11, 2024, 3:01 p.m. UTC | #22
On 2024/12/11 17:43, Amir Goldstein wrote:
> On Tue, Dec 10, 2024 at 8:19 AM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
>>
>>
>>
>> On 2024/12/6 20:58, Amir Goldstein wrote:
>>> On Fri, Dec 6, 2024 at 11:45 AM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
>>>>
>> ...
>>>>
>>>> So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
>>>> something like below,
>>>>
>>>> +static unsigned long ovl_get_unmapped_area(struct file *file,
>>>> +               unsigned long addr, unsigned long len, unsigned long pgoff,
>>>> +               unsigned long flags)
>>>> +{
>>>> +       struct file *realfile;
>>>> +       const struct cred *old_cred;
>>>> +
>>>> +       realfile = ovl_real_file(file);
>>>> +       if (IS_ERR(realfile))
>>>> +               return PTR_ERR(realfile);
>>>> +
>>>> +       if (realfile->f_op->get_unmapped_area) {
>>>> +               unsigned long ret;
>>>> +
>>>> +               old_cred = ovl_override_creds(file_inode(file)->i_sb);
>>>> +               ret = realfile->f_op->get_unmapped_area(realfile, addr, len,
>>>> +                                                       pgoff, flags);
>>>> +               ovl_revert_creds(old_cred);
>>>> +
>>>> +               if (ret)
>>>> +                       return ret;
>>>> +       }
>>>> +
>>>> +       return mm_get_unmapped_area(current->mm, file, addr, len, pgoff,
>>>> flags);
>>>> +}
>>>>
>>>> Correct me If I'm wrong.
>>>>
>>>
>>> You just need to be aware of the fact that between ovl_get_unmapped_area()
>>> and ovl_mmap(), ovl_real_file(file) could change from the lower file, to the
>>> upper file due to another operation that initiated copy-up.
>>
>> Not sure about this part(I have very little knowledge of ovl), do you
>> mean that we could not use ovl_real_file()?  The ovl_mmap() using
>> realfile = file->private_data, we may use similar way in
>> ovl_get_unmapped_area(). but I may have misunderstood.
>>
> 
> First of all, you may add to your patch:
> Acked-by: Amir Goldstein <amir73il@gmail.com>

Thanks,

> 
> I think this patch is fine as is.
> w.r.t. question about ovl_override_creds(), I think it is good practice to
> user mounter credentials when calling into real fs methods, regardless
> of the fact that in most cases known today the ->get_unmapped_area()
> methods do not check credentials.
> 
> My comment was referring to the fact that ovl_real_file(file), when called
> two subsequent times in a row (once from ovl_get_unmapped_area() and
> then again from ovl_mmap()) may not return the same realfile.
> 
> This is because during the lifetime of an overlayfs file/inode, its realinode/
> realfile can change once, in the event known as "copy-up", so you may
> start by calling ovl_get_unmapped_area() on a lower ext4 realfile and then end
> up actually mapping an upper tmpfs realfile, because someone has opened
> the overlayfs file for write in the meanwhile.

Got it, thanks for your detail explanation.

> 
> I guess in this corner case, the alignment may be wrong, or just too strict for
> the actual mapping, but it is not critical, so just FYI.

Yes, not critical, at least not too much worse.

Ovl is always lack of vma THP alignment or some other VMA allocation
requirements.

> There are worse issues with mmap of overlayfs file documented in:
> https://docs.kernel.org/filesystems/overlayfs.html#non-standard-behavior
> "If a file residing on a lower layer is opened for read-only and then
> memory mapped
>   with MAP_SHARED, then subsequent changes to the file are not reflected in the
>   memory mapping."

I think we could ignore above issue in this fixup and if there is a need 
to check vm_flags in ->get_unmapped_area(), we could deal with it later.

And Jinjiang, please send a v2 according to all the discussion.

Thanks.

> 
> Thanks,
> Amir.
Jinjiang Tu Dec. 13, 2024, 1:51 a.m. UTC | #23
在 2024/12/11 23:01, Kefeng Wang 写道:
>
>
> On 2024/12/11 17:43, Amir Goldstein wrote:
>> On Tue, Dec 10, 2024 at 8:19 AM Kefeng Wang 
>> <wangkefeng.wang@huawei.com> wrote:
>>>
>>>
>>>
>>> On 2024/12/6 20:58, Amir Goldstein wrote:
>>>> On Fri, Dec 6, 2024 at 11:45 AM Kefeng Wang 
>>>> <wangkefeng.wang@huawei.com> wrote:
>>>>>
>>> ...
>>>>>
>>>>> So maybe use mm_get_unmapped_area() instead of __get_unmapped_area(),
>>>>> something like below,
>>>>>
>>>>> +static unsigned long ovl_get_unmapped_area(struct file *file,
>>>>> +               unsigned long addr, unsigned long len, unsigned 
>>>>> long pgoff,
>>>>> +               unsigned long flags)
>>>>> +{
>>>>> +       struct file *realfile;
>>>>> +       const struct cred *old_cred;
>>>>> +
>>>>> +       realfile = ovl_real_file(file);
>>>>> +       if (IS_ERR(realfile))
>>>>> +               return PTR_ERR(realfile);
>>>>> +
>>>>> +       if (realfile->f_op->get_unmapped_area) {
>>>>> +               unsigned long ret;
>>>>> +
>>>>> +               old_cred = 
>>>>> ovl_override_creds(file_inode(file)->i_sb);
>>>>> +               ret = realfile->f_op->get_unmapped_area(realfile, 
>>>>> addr, len,
>>>>> + pgoff, flags);
>>>>> +               ovl_revert_creds(old_cred);
>>>>> +
>>>>> +               if (ret)
>>>>> +                       return ret;
>>>>> +       }
>>>>> +
>>>>> +       return mm_get_unmapped_area(current->mm, file, addr, len, 
>>>>> pgoff,
>>>>> flags);
>>>>> +}
>>>>>
>>>>> Correct me If I'm wrong.
>>>>>
>>>>
>>>> You just need to be aware of the fact that between 
>>>> ovl_get_unmapped_area()
>>>> and ovl_mmap(), ovl_real_file(file) could change from the lower 
>>>> file, to the
>>>> upper file due to another operation that initiated copy-up.
>>>
>>> Not sure about this part(I have very little knowledge of ovl), do you
>>> mean that we could not use ovl_real_file()?  The ovl_mmap() using
>>> realfile = file->private_data, we may use similar way in
>>> ovl_get_unmapped_area(). but I may have misunderstood.
>>>
>>
>> First of all, you may add to your patch:
>> Acked-by: Amir Goldstein <amir73il@gmail.com>
>
> Thanks,
>
>>
>> I think this patch is fine as is.
>> w.r.t. question about ovl_override_creds(), I think it is good 
>> practice to
>> user mounter credentials when calling into real fs methods, regardless
>> of the fact that in most cases known today the ->get_unmapped_area()
>> methods do not check credentials.
>>
>> My comment was referring to the fact that ovl_real_file(file), when 
>> called
>> two subsequent times in a row (once from ovl_get_unmapped_area() and
>> then again from ovl_mmap()) may not return the same realfile.
>>
>> This is because during the lifetime of an overlayfs file/inode, its 
>> realinode/
>> realfile can change once, in the event known as "copy-up", so you may
>> start by calling ovl_get_unmapped_area() on a lower ext4 realfile and 
>> then end
>> up actually mapping an upper tmpfs realfile, because someone has opened
>> the overlayfs file for write in the meanwhile.
>
> Got it, thanks for your detail explanation.
>
>>
>> I guess in this corner case, the alignment may be wrong, or just too 
>> strict for
>> the actual mapping, but it is not critical, so just FYI.
>
> Yes, not critical, at least not too much worse.
>
> Ovl is always lack of vma THP alignment or some other VMA allocation
> requirements.
>
>> There are worse issues with mmap of overlayfs file documented in:
>> https://docs.kernel.org/filesystems/overlayfs.html#non-standard-behavior
>> "If a file residing on a lower layer is opened for read-only and then
>> memory mapped
>>   with MAP_SHARED, then subsequent changes to the file are not 
>> reflected in the
>>   memory mapping."
>
> I think we could ignore above issue in this fixup and if there is a 
> need to check vm_flags in ->get_unmapped_area(), we could deal with it 
> later.
>
> And Jinjiang, please send a v2 according to all the discussion.
OK, I will send it later.
>
> Thanks.
>
>>
>> Thanks,
>> Amir.
>
Matthew Wilcox (Oracle) Dec. 13, 2024, 4:25 a.m. UTC | #24
On Wed, Dec 11, 2024 at 10:43:46AM +0100, Amir Goldstein wrote:
> I think this patch is fine as is.

This patch is complete crap.  The test-case is broken.  NAK to all of
this.
Kefeng Wang Dec. 13, 2024, 7:49 a.m. UTC | #25
On 2024/12/13 12:25, Matthew Wilcox wrote:
> On Wed, Dec 11, 2024 at 10:43:46AM +0100, Amir Goldstein wrote:
>> I think this patch is fine as is.
> 
> This patch is complete crap.  The test-case is broken.  NAK to all of
> this.
> 
Hi Matthew, regardless of the test case, the original issue is the
ovl don't respect underlying fs' get_unmapped_area(), the lower fs may
have own rules for vma alignment(own get_unmapped_area callback),
thp_get_unmapped_area() is one case, what's your option/suggestion about
it, thanks.
Matthew Wilcox (Oracle) Dec. 13, 2024, 2 p.m. UTC | #26
On Fri, Dec 13, 2024 at 03:49:53PM +0800, Kefeng Wang wrote:
> 
> 
> On 2024/12/13 12:25, Matthew Wilcox wrote:
> > On Wed, Dec 11, 2024 at 10:43:46AM +0100, Amir Goldstein wrote:
> > > I think this patch is fine as is.
> > 
> > This patch is complete crap.  The test-case is broken.  NAK to all of
> > this.
> > 
> Hi Matthew, regardless of the test case, the original issue is the
> ovl don't respect underlying fs' get_unmapped_area(), the lower fs may
> have own rules for vma alignment(own get_unmapped_area callback),
> thp_get_unmapped_area() is one case, what's your option/suggestion about

No, filesystems don't "have their own rules" for get_unmapped_area.
get_unmapped_area is for device drivers.
Kefeng Wang Dec. 14, 2024, 7:58 a.m. UTC | #27
On 2024/12/13 22:00, Matthew Wilcox wrote:
> On Fri, Dec 13, 2024 at 03:49:53PM +0800, Kefeng Wang wrote:
>>
>>
>> On 2024/12/13 12:25, Matthew Wilcox wrote:
>>> On Wed, Dec 11, 2024 at 10:43:46AM +0100, Amir Goldstein wrote:
>>>> I think this patch is fine as is.
>>>
>>> This patch is complete crap.  The test-case is broken.  NAK to all of
>>> this.
>>>
>> Hi Matthew, regardless of the test case, the original issue is the
>> ovl don't respect underlying fs' get_unmapped_area(), the lower fs may
>> have own rules for vma alignment(own get_unmapped_area callback),
>> thp_get_unmapped_area() is one case, what's your option/suggestion about
> 
> No, filesystems don't "have their own rules" for get_unmapped_area.
> get_unmapped_area is for device drivers.


Commit 74d2fad1334d ("thp, dax: add thp_get_unmapped_area for pmd
mappings") to enable PMD mappings as a FSDAX filesystem, and with
commit 1854bc6e2420 ("mm/readahead: Align file mappings for non-DAX")
to enable THPs for mmapped files too, also other filesystem, eg,
tmpfs provide a shmem_get_unmapped_area to decide the mapping address,
see commit c01d5b300774 ("shmem: get_unmapped_area align huge page"),
that is what I think the filesystem have own rules to get the mapping
address, correct me if I misunderstood, thanks.
diff mbox series

Patch

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 969b458100fe..d0dcf675ebe8 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -653,6 +653,25 @@  static int ovl_flush(struct file *file, fl_owner_t id)
 	return err;
 }
 
+static unsigned long ovl_get_unmapped_area(struct file *file,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	struct file *realfile;
+	const struct cred *old_cred;
+	unsigned long ret;
+
+	realfile = ovl_real_file(file);
+	if (IS_ERR(realfile))
+		return PTR_ERR(realfile);
+
+	old_cred = ovl_override_creds(file_inode(file)->i_sb);
+	ret = get_unmapped_area(realfile, addr, len, pgoff, flags);
+	ovl_revert_creds(old_cred);
+
+	return ret;
+}
+
 const struct file_operations ovl_file_operations = {
 	.open		= ovl_open,
 	.release	= ovl_release,
@@ -661,6 +680,7 @@  const struct file_operations ovl_file_operations = {
 	.write_iter	= ovl_write_iter,
 	.fsync		= ovl_fsync,
 	.mmap		= ovl_mmap,
+	.get_unmapped_area = ovl_get_unmapped_area,
 	.fallocate	= ovl_fallocate,
 	.fadvise	= ovl_fadvise,
 	.flush		= ovl_flush,
diff --git a/mm/mmap.c b/mm/mmap.c
index 16f8e8be01f8..60eb1ff7c9a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -913,6 +913,7 @@  __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	error = security_mmap_addr(addr);
 	return error ? error : addr;
 }
+EXPORT_SYMBOL(__get_unmapped_area);
 
 unsigned long
 mm_get_unmapped_area(struct mm_struct *mm, struct file *file,