diff mbox series

hugetlbfs: add O_TMPFILE support

Message ID 22c29acf9c51dae17802e1b05c9e5e4051448c5c.1571129593.git.p.sarna@tlen.pl (mailing list archive)
State New, archived
Headers show
Series hugetlbfs: add O_TMPFILE support | expand

Commit Message

Piotr Sarna Oct. 15, 2019, 9:01 a.m. UTC
With hugetlbfs, a common pattern for mapping anonymous huge pages
is to create a temporary file first. Currently libraries like
libhugetlbfs and seastar create these with a standard mkstemp+unlink
trick, but it would be more robust to be able to simply pass
the O_TMPFILE flag to open(). O_TMPFILE is already supported by several
file systems like ext4 and xfs. The implementation simply uses the existing
d_tmpfile utility function to instantiate the dcache entry for the file.

Tested manually by successfully creating a temporary file by opening
it with (O_TMPFILE|O_RDWR) on mounted hugetlbfs and successfully
mapping 2M huge pages with it. Without the patch, trying to open
a file with O_TMPFILE results in -ENOSUP.

Signed-off-by: Piotr Sarna <p.sarna@tlen.pl>
---
 fs/hugetlbfs/inode.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

Comments

Michal Hocko Oct. 15, 2019, 10:50 a.m. UTC | #1
On Tue 15-10-19 11:01:12, Piotr Sarna wrote:
> With hugetlbfs, a common pattern for mapping anonymous huge pages
> is to create a temporary file first.

Really? I though that this is normally done by shmget(SHM_HUGETLB) or
mmap(MAP_HUGETLB). Or maybe I misunderstood your definition on anonymous
huge pages.

> Currently libraries like
> libhugetlbfs and seastar create these with a standard mkstemp+unlink
> trick, but it would be more robust to be able to simply pass
> the O_TMPFILE flag to open(). O_TMPFILE is already supported by several
> file systems like ext4 and xfs. The implementation simply uses the existing
> d_tmpfile utility function to instantiate the dcache entry for the file.
> 
> Tested manually by successfully creating a temporary file by opening
> it with (O_TMPFILE|O_RDWR) on mounted hugetlbfs and successfully
> mapping 2M huge pages with it. Without the patch, trying to open
> a file with O_TMPFILE results in -ENOSUP.
> 
> Signed-off-by: Piotr Sarna <p.sarna@tlen.pl>
> ---
>  fs/hugetlbfs/inode.c | 25 ++++++++++++++++++++++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 1dcc57189382..277b7d231db8 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
>  /*
>   * File creation. Allocate an inode, and we're done..
>   */
> -static int hugetlbfs_mknod(struct inode *dir,
> -			struct dentry *dentry, umode_t mode, dev_t dev)
> +static int do_hugetlbfs_mknod(struct inode *dir,
> +			struct dentry *dentry,
> +			umode_t mode,
> +			dev_t dev,
> +			bool tmpfile)
>  {
>  	struct inode *inode;
>  	int error = -ENOSPC;
> @@ -824,13 +827,22 @@ static int hugetlbfs_mknod(struct inode *dir,
>  	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
>  	if (inode) {
>  		dir->i_ctime = dir->i_mtime = current_time(dir);
> -		d_instantiate(dentry, inode);
> +		if (tmpfile)
> +			d_tmpfile(dentry, inode);
> +		else
> +			d_instantiate(dentry, inode);
>  		dget(dentry);	/* Extra count - pin the dentry in core */
>  		error = 0;
>  	}
>  	return error;
>  }
>  
> +static int hugetlbfs_mknod(struct inode *dir,
> +			struct dentry *dentry, umode_t mode, dev_t dev)
> +{
> +	return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
> +}
> +
>  static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
>  {
>  	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
> @@ -844,6 +856,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo
>  	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
>  }
>  
> +static int hugetlbfs_tmpfile(struct inode *dir,
> +			struct dentry *dentry, umode_t mode)
> +{
> +	return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
> +}
> +
>  static int hugetlbfs_symlink(struct inode *dir,
>  			struct dentry *dentry, const char *symname)
>  {
> @@ -1102,6 +1120,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = {
>  	.mknod		= hugetlbfs_mknod,
>  	.rename		= simple_rename,
>  	.setattr	= hugetlbfs_setattr,
> +	.tmpfile	= hugetlbfs_tmpfile,
>  };
>  
>  static const struct inode_operations hugetlbfs_inode_operations = {
> -- 
> 2.21.0
>
Mike Kravetz Oct. 15, 2019, 11:37 p.m. UTC | #2
On 10/15/19 3:50 AM, Michal Hocko wrote:
> On Tue 15-10-19 11:01:12, Piotr Sarna wrote:
>> With hugetlbfs, a common pattern for mapping anonymous huge pages
>> is to create a temporary file first.
> 
> Really? I though that this is normally done by shmget(SHM_HUGETLB) or
> mmap(MAP_HUGETLB). Or maybe I misunderstood your definition on anonymous
> huge pages.
> 
>> Currently libraries like
>> libhugetlbfs and seastar create these with a standard mkstemp+unlink
>> trick,

I would guess that much of libhugetlbfs was writen before MAP_HUGETLB
was implemented.  So, that is why it does not make (more) use of that
option.

The implementation looks to be straight forward.  However, I really do
not want to add more functionality to hugetlbfs unless there is specific
use case that needs it.
Mike Kravetz Oct. 21, 2019, 5:17 p.m. UTC | #3
On 10/15/19 4:37 PM, Mike Kravetz wrote:
> On 10/15/19 3:50 AM, Michal Hocko wrote:
>> On Tue 15-10-19 11:01:12, Piotr Sarna wrote:
>>> With hugetlbfs, a common pattern for mapping anonymous huge pages
>>> is to create a temporary file first.
>>
>> Really? I though that this is normally done by shmget(SHM_HUGETLB) or
>> mmap(MAP_HUGETLB). Or maybe I misunderstood your definition on anonymous
>> huge pages.
>>
>>> Currently libraries like
>>> libhugetlbfs and seastar create these with a standard mkstemp+unlink
>>> trick,
> 
> I would guess that much of libhugetlbfs was writen before MAP_HUGETLB
> was implemented.  So, that is why it does not make (more) use of that
> option.
> 
> The implementation looks to be straight forward.  However, I really do
> not want to add more functionality to hugetlbfs unless there is specific
> use case that needs it.

It was not my intention to shut down discussion on this patch.  I was just
asking if there was a (new) use case for such a change.  I am checking with
our DB team as I seem to remember them using the create/unlink approach for
hugetlbfs in one of their upcoming models.  

Is there a new use case you were thinking about?
Piotr Sarna Oct. 22, 2019, 7:09 a.m. UTC | #4
On 10/21/19 7:17 PM, Mike Kravetz wrote:
> On 10/15/19 4:37 PM, Mike Kravetz wrote:
>> On 10/15/19 3:50 AM, Michal Hocko wrote:
>>> On Tue 15-10-19 11:01:12, Piotr Sarna wrote:
>>>> With hugetlbfs, a common pattern for mapping anonymous huge pages
>>>> is to create a temporary file first.
>>>
>>> Really? I though that this is normally done by shmget(SHM_HUGETLB) or
>>> mmap(MAP_HUGETLB). Or maybe I misunderstood your definition on anonymous
>>> huge pages.
>>>
>>>> Currently libraries like
>>>> libhugetlbfs and seastar create these with a standard mkstemp+unlink
>>>> trick,
>>
>> I would guess that much of libhugetlbfs was writen before MAP_HUGETLB
>> was implemented.  So, that is why it does not make (more) use of that
>> option.
>>
>> The implementation looks to be straight forward.  However, I really do
>> not want to add more functionality to hugetlbfs unless there is specific
>> use case that needs it.
> 
> It was not my intention to shut down discussion on this patch.  I was just
> asking if there was a (new) use case for such a change.  I am checking with
> our DB team as I seem to remember them using the create/unlink approach for
> hugetlbfs in one of their upcoming models.
> 
> Is there a new use case you were thinking about?
> 

Oh, I indeed thought it was a shutdown. The use case I was thinking 
about was in Seastar, where the create+unlink trick is used for creating 
temporary files (in a generic way, not only for hugetlbfs). I simply 
intended to migrate it to a newer approach - O_TMPFILE. However,
for the specific case of hugetlbfs it indeed makes more sense to skip it 
and use mmap's MAP_HUGETLB, so perhaps it's not worth it to patch a 
perfectly good and stable file system just to provide a semi-useful flag 
support. My implementation of tmpfile for hugetlbfs is straightforward 
indeed, but the MAP_HUGETLB argument made me realize that it may not be 
worth the trouble - especially that MAP_HUGETLB is here since 2.6 and 
O_TMPFILE was introduced around v3.11, so the mmap way looks more portable.

tldr: I'd be very happy to get my patch accepted, but the use case I had 
in mind can be easily solved with MAP_HUGETLB, so I don't insist.
Mike Kravetz Oct. 23, 2019, 2:55 a.m. UTC | #5
On 10/22/19 12:09 AM, Piotr Sarna wrote:
> On 10/21/19 7:17 PM, Mike Kravetz wrote:
>> On 10/15/19 4:37 PM, Mike Kravetz wrote:
>>> On 10/15/19 3:50 AM, Michal Hocko wrote:
>>>> On Tue 15-10-19 11:01:12, Piotr Sarna wrote:
>>>>> With hugetlbfs, a common pattern for mapping anonymous huge pages
>>>>> is to create a temporary file first.
>>>>
>>>> Really? I though that this is normally done by shmget(SHM_HUGETLB) or
>>>> mmap(MAP_HUGETLB). Or maybe I misunderstood your definition on anonymous
>>>> huge pages.
>>>>
>>>>> Currently libraries like
>>>>> libhugetlbfs and seastar create these with a standard mkstemp+unlink
>>>>> trick,
>>>
>>> I would guess that much of libhugetlbfs was writen before MAP_HUGETLB
>>> was implemented.  So, that is why it does not make (more) use of that
>>> option.
>>>
>>> The implementation looks to be straight forward.  However, I really do
>>> not want to add more functionality to hugetlbfs unless there is specific
>>> use case that needs it.
>>
>> It was not my intention to shut down discussion on this patch.  I was just
>> asking if there was a (new) use case for such a change.  I am checking with
>> our DB team as I seem to remember them using the create/unlink approach for
>> hugetlbfs in one of their upcoming models.
>>
>> Is there a new use case you were thinking about?
>>
> 
> Oh, I indeed thought it was a shutdown. The use case I was thinking about was in Seastar, where the create+unlink trick is used for creating temporary files (in a generic way, not only for hugetlbfs). I simply intended to migrate it to a newer approach - O_TMPFILE. However,
> for the specific case of hugetlbfs it indeed makes more sense to skip it and use mmap's MAP_HUGETLB, so perhaps it's not worth it to patch a perfectly good and stable file system just to provide a semi-useful flag support. My implementation of tmpfile for hugetlbfs is straightforward indeed, but the MAP_HUGETLB argument made me realize that it may not be worth the trouble - especially that MAP_HUGETLB is here since 2.6 and O_TMPFILE was introduced around v3.11, so the mmap way looks more portable.
> 
> tldr: I'd be very happy to get my patch accepted, but the use case I had in mind can be easily solved with MAP_HUGETLB, so I don't insist.

If you really are after something like 'anonymous memory' for Seastar,
then MAP_HUGETLB would be the better approach.

I'm still checking with Oracle DB team as they may have a use for O_TMPFILE
in an upcoming release.  In their use case, they want an open fd to work with.
If it looks like they will proceed in this direction, we can work to get
your patch moved forward.

Thanks,
Piotr Sarna Oct. 23, 2019, 7:14 a.m. UTC | #6
On 10/23/19 4:55 AM, Mike Kravetz wrote:
> On 10/22/19 12:09 AM, Piotr Sarna wrote:
>> On 10/21/19 7:17 PM, Mike Kravetz wrote:
>>> On 10/15/19 4:37 PM, Mike Kravetz wrote:
>>>> On 10/15/19 3:50 AM, Michal Hocko wrote:
>>>>> On Tue 15-10-19 11:01:12, Piotr Sarna wrote:
>>>>>> With hugetlbfs, a common pattern for mapping anonymous huge pages
>>>>>> is to create a temporary file first.
>>>>>
>>>>> Really? I though that this is normally done by shmget(SHM_HUGETLB) or
>>>>> mmap(MAP_HUGETLB). Or maybe I misunderstood your definition on anonymous
>>>>> huge pages.
>>>>>
>>>>>> Currently libraries like
>>>>>> libhugetlbfs and seastar create these with a standard mkstemp+unlink
>>>>>> trick,
>>>>
>>>> I would guess that much of libhugetlbfs was writen before MAP_HUGETLB
>>>> was implemented.  So, that is why it does not make (more) use of that
>>>> option.
>>>>
>>>> The implementation looks to be straight forward.  However, I really do
>>>> not want to add more functionality to hugetlbfs unless there is specific
>>>> use case that needs it.
>>>
>>> It was not my intention to shut down discussion on this patch.  I was just
>>> asking if there was a (new) use case for such a change.  I am checking with
>>> our DB team as I seem to remember them using the create/unlink approach for
>>> hugetlbfs in one of their upcoming models.
>>>
>>> Is there a new use case you were thinking about?
>>>
>>
>> Oh, I indeed thought it was a shutdown. The use case I was thinking about was in Seastar, where the create+unlink trick is used for creating temporary files (in a generic way, not only for hugetlbfs). I simply intended to migrate it to a newer approach - O_TMPFILE. However,
>> for the specific case of hugetlbfs it indeed makes more sense to skip it and use mmap's MAP_HUGETLB, so perhaps it's not worth it to patch a perfectly good and stable file system just to provide a semi-useful flag support. My implementation of tmpfile for hugetlbfs is straightforward indeed, but the MAP_HUGETLB argument made me realize that it may not be worth the trouble - especially that MAP_HUGETLB is here since 2.6 and O_TMPFILE was introduced around v3.11, so the mmap way looks more portable.
>>
>> tldr: I'd be very happy to get my patch accepted, but the use case I had in mind can be easily solved with MAP_HUGETLB, so I don't insist.
> 
> If you really are after something like 'anonymous memory' for Seastar,
> then MAP_HUGETLB would be the better approach.

Just to clarify - my original goal was to migrate Seastar's temporary 
file implementation (which is fs-agnostic, based on descriptors) from 
the current create+unlink to O_TMPFILE, for robustness. One of the 
internal usages of this generic mechanism was to create a tmpfile on 
hugetlbfs and that's why I sent this patch. However, this particular 
internal usage can be easily switched to more portable MAP_HUGETLB, 
which will also mean that the generic tmpfile implementation will not be 
used internally for hugetlbfs anymore.

There *may* still be value in being able to support hugetlbfs once 
Seastar's tmpfile implementation migrates to O_TMPFILE, since the 
library offers creating temporary files in its public API, but there's 
no immediate use case I can apply it to.

> 
> I'm still checking with Oracle DB team as they may have a use for O_TMPFILE
> in an upcoming release.  In their use case, they want an open fd to work with.
> If it looks like they will proceed in this direction, we can work to get
> your patch moved forward.
> 
> Thanks,

Great, if it turns out that my patch helps anyone with their O_TMPFILE 
usage, I'd be very glad to see it merged.
Mike Kravetz Oct. 28, 2019, 6:56 p.m. UTC | #7
Cc: Andrew

On 10/15/19 2:01 AM, Piotr Sarna wrote:
> With hugetlbfs, a common pattern for mapping anonymous huge pages
> is to create a temporary file first. Currently libraries like
> libhugetlbfs and seastar create these with a standard mkstemp+unlink
> trick, but it would be more robust to be able to simply pass
> the O_TMPFILE flag to open(). O_TMPFILE is already supported by several
> file systems like ext4 and xfs. The implementation simply uses the existing
> d_tmpfile utility function to instantiate the dcache entry for the file.

Let's drop the mention of anonymous mapping to avoid any confusion.  If
we include use cases, the above paragraph could be rewritten as:

O_TMPFILE is an option used to create an unnamed temporary regular
file.  Currently, libhugetlbfs and Seastar use a combination of
mkstemp and unlink to accomplish similar functionality on hugetlbfs.
Add O_TMPFILE support to hugetlbfs so that it can potentially be used
by existing users (Seastar) and new users (Oracle DB).  Support is
added by simply using the d_tmpfile utility function to instantiate
the dcache entry for a temporary file.

> Tested manually by successfully creating a temporary file by opening
> it with (O_TMPFILE|O_RDWR) on mounted hugetlbfs and successfully
> mapping 2M huge pages with it. Without the patch, trying to open
> a file with O_TMPFILE results in -ENOSUP.
> 
> Signed-off-by: Piotr Sarna <p.sarna@tlen.pl>

The code looks good,
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
diff mbox series

Patch

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1dcc57189382..277b7d231db8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -815,8 +815,11 @@  static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 /*
  * File creation. Allocate an inode, and we're done..
  */
-static int hugetlbfs_mknod(struct inode *dir,
-			struct dentry *dentry, umode_t mode, dev_t dev)
+static int do_hugetlbfs_mknod(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			dev_t dev,
+			bool tmpfile)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
@@ -824,13 +827,22 @@  static int hugetlbfs_mknod(struct inode *dir,
 	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
 	if (inode) {
 		dir->i_ctime = dir->i_mtime = current_time(dir);
-		d_instantiate(dentry, inode);
+		if (tmpfile)
+			d_tmpfile(dentry, inode);
+		else
+			d_instantiate(dentry, inode);
 		dget(dentry);	/* Extra count - pin the dentry in core */
 		error = 0;
 	}
 	return error;
 }
 
+static int hugetlbfs_mknod(struct inode *dir,
+			struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+}
+
 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
@@ -844,6 +856,12 @@  static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo
 	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
 }
 
+static int hugetlbfs_tmpfile(struct inode *dir,
+			struct dentry *dentry, umode_t mode)
+{
+	return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+}
+
 static int hugetlbfs_symlink(struct inode *dir,
 			struct dentry *dentry, const char *symname)
 {
@@ -1102,6 +1120,7 @@  static const struct inode_operations hugetlbfs_dir_inode_operations = {
 	.mknod		= hugetlbfs_mknod,
 	.rename		= simple_rename,
 	.setattr	= hugetlbfs_setattr,
+	.tmpfile	= hugetlbfs_tmpfile,
 };
 
 static const struct inode_operations hugetlbfs_inode_operations = {