diff mbox

[2/7] Add support for per-file stream ID

Message ID 1427296070-8472-3-git-send-email-axboe@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe March 25, 2015, 3:07 p.m. UTC
Writing on flash devices can be much more efficient, if we can
inform the device what kind of data can be grouped together. If
the device is able to group data together with similar lifetimes,
then it can be more efficient in garbage collection. This, in turn,
leads to lower write amplification, which is a win on both device
wear and performance.

Add a new fadvise hint, POSIX_FADV_STREAMID, which sets the file
and inode streamid. The file streamid is used if we have the file
available at the time of the write (O_DIRECT), we use the inode
streamid if not (buffered writeback). The fadvise hint uses the
'offset' field to specify a stream ID.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/inode.c                   |  1 +
 fs/open.c                    |  1 +
 include/linux/fs.h           | 23 +++++++++++++++++++++++
 include/uapi/linux/fadvise.h |  2 ++
 mm/fadvise.c                 | 17 +++++++++++++++++
 5 files changed, 44 insertions(+)

Comments

Dmitry Monakhov April 9, 2015, 9:30 a.m. UTC | #1
Jens Axboe <axboe@fb.com> writes:

One small question.
You states that all IDs are equals but can we reserve some IDs
for internal kernel purposes. For example very short lived data (files
opened with O_TEMP) and so on.

Also small nitpicking see below.
> Writing on flash devices can be much more efficient, if we can
> inform the device what kind of data can be grouped together. If
> the device is able to group data together with similar lifetimes,
> then it can be more efficient in garbage collection. This, in turn,
> leads to lower write amplification, which is a win on both device
> wear and performance.
>
> Add a new fadvise hint, POSIX_FADV_STREAMID, which sets the file
> and inode streamid. The file streamid is used if we have the file
> available at the time of the write (O_DIRECT), we use the inode
> streamid if not (buffered writeback). The fadvise hint uses the
> 'offset' field to specify a stream ID.
>
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
>  fs/inode.c                   |  1 +
>  fs/open.c                    |  1 +
>  include/linux/fs.h           | 23 +++++++++++++++++++++++
>  include/uapi/linux/fadvise.h |  2 ++
>  mm/fadvise.c                 | 17 +++++++++++++++++
>  5 files changed, 44 insertions(+)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index f00b16f45507..41885322ba64 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -149,6 +149,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
>  	inode->i_blocks = 0;
>  	inode->i_bytes = 0;
>  	inode->i_generation = 0;
> +	inode->i_streamid = 0;
>  	inode->i_pipe = NULL;
>  	inode->i_bdev = NULL;
>  	inode->i_cdev = NULL;
> diff --git a/fs/open.c b/fs/open.c
> index 33f9cbf2610b..4a9b2be1a674 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -743,6 +743,7 @@ static int do_dentry_open(struct file *f,
>  	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
>  
>  	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
> +	f->f_streamid = 0;
>  
>  	return 0;
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b4d71b5e1ff2..43dde70c1d0d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -631,6 +631,7 @@ struct inode {
>  	};
>  
>  	__u32			i_generation;
> +	unsigned int		i_streamid;
>  
>  #ifdef CONFIG_FSNOTIFY
>  	__u32			i_fsnotify_mask; /* all events this inode cares about */
> @@ -640,6 +641,14 @@ struct inode {
>  	void			*i_private; /* fs or device private pointer */
>  };
>  
> +static inline unsigned int inode_streamid(struct inode *inode)
> +{
> +	if (inode)
> +		return inode->i_streamid;
> +
> +	return 0;
> +}
> +
>  static inline int inode_unhashed(struct inode *inode)
>  {
>  	return hlist_unhashed(&inode->i_hash);
> @@ -820,6 +829,8 @@ struct file {
>  	const struct cred	*f_cred;
>  	struct file_ra_state	f_ra;
>  
> +	unsigned int		f_streamid;
> +
>  	u64			f_version;
>  #ifdef CONFIG_SECURITY
>  	void			*f_security;
> @@ -842,6 +853,18 @@ struct file_handle {
>  	unsigned char f_handle[0];
>  };
>  
> +/*
> + * If the file doesn't have a stream ID set, return the inode stream ID
> + * in case that has been set.
> + */
> +static inline unsigned int file_streamid(struct file *f)
> +{
> +	if (f->f_streamid)
> +		return f->f_streamid;
> +
> +	return inode_streamid(f->f_inode);
> +}
> +
>  static inline struct file *get_file(struct file *f)
>  {
>  	atomic_long_inc(&f->f_count);
> diff --git a/include/uapi/linux/fadvise.h b/include/uapi/linux/fadvise.h
> index e8e747139b9a..3dc8a1ff1422 100644
> --- a/include/uapi/linux/fadvise.h
> +++ b/include/uapi/linux/fadvise.h
> @@ -18,4 +18,6 @@
>  #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
>  #endif
>  
> +#define POSIX_FADV_STREAMID	8 /* associate stream ID with file */
> +
>  #endif	/* FADVISE_H_INCLUDED */
> diff --git a/mm/fadvise.c b/mm/fadvise.c
> index 4a3907cf79f8..b111a8899fb7 100644
> --- a/mm/fadvise.c
> +++ b/mm/fadvise.c
> @@ -60,6 +60,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
>  		case POSIX_FADV_WILLNEED:
>  		case POSIX_FADV_NOREUSE:
>  		case POSIX_FADV_DONTNEED:
> +		case POSIX_FADV_STREAMID:
>  			/* no bad return value, but ignore advice */
>  			break;
>  		default:
> @@ -144,6 +145,22 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
>  			}
>  		}
>  		break;
> +	case POSIX_FADV_STREAMID:
> +		/*
> +		 * streamid is stored in offset... we don't limit or check
> +		 * if the device supports streams, or if it does, if the
> +		 * stream nr is within the limits. 1 is the lowest valid
> +		 * stream id, 0 is "don't care/know".
> +		 */
> +		if (offset != (unsigned int) offset)
> +			ret = EINVAL;
Shuld be negative       ret = -EINVAL;
> +		else {
> +			f.file->f_streamid = offset;
> +			spin_lock(&inode->i_lock);
> +			inode->i_streamid = offset;
> +			spin_unlock(&inode->i_lock);
> +		}
> +		break;
>  	default:
>  		ret = -EINVAL;
>  	}
> -- 
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe April 9, 2015, 4:28 p.m. UTC | #2
On 04/09/2015 03:30 AM, Dmitry Monakhov wrote:
> Jens Axboe <axboe@fb.com> writes:
>
> One small question.
> You states that all IDs are equals but can we reserve some IDs
> for internal kernel purposes. For example very short lived data (files
> opened with O_TEMP) and so on.

Yes, we probably should end up reserving some IDs for specific internal 
things. O_TEMP to a specific stream would make sense. Journal writes 
too, for instance.

I just preferred not wiring any of that up, as it then gets closer to 
being a policy decision. I'm mainly interested in getting this exposed 
to userspace, and seeing how the hw side develops since that will 
influence how we actually use this (number of streams, any actions 
required to manage streams, etc).

>> +	case POSIX_FADV_STREAMID:
>> +		/*
>> +		 * streamid is stored in offset... we don't limit or check
>> +		 * if the device supports streams, or if it does, if the
>> +		 * stream nr is within the limits. 1 is the lowest valid
>> +		 * stream id, 0 is "don't care/know".
>> +		 */
>> +		if (offset != (unsigned int) offset)
>> +			ret = EINVAL;
> Shuld be negative       ret = -EINVAL;

Indeed it should, thanks!
Andreas Dilger April 9, 2015, 11:22 p.m. UTC | #3
On Mar 25, 2015, at 9:07 AM, Jens Axboe <axboe@fb.com> wrote:
> 
> Writing on flash devices can be much more efficient, if we can
> inform the device what kind of data can be grouped together. If
> the device is able to group data together with similar lifetimes,
> then it can be more efficient in garbage collection. This, in turn,
> leads to lower write amplification, which is a win on both device
> wear and performance.
> 
> Add a new fadvise hint, POSIX_FADV_STREAMID, which sets the file
> and inode streamid. The file streamid is used if we have the file
> available at the time of the write (O_DIRECT), we use the inode
> streamid if not (buffered writeback). The fadvise hint uses the
> 'offset' field to specify a stream ID.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
> fs/inode.c                   |  1 +
> fs/open.c                    |  1 +
> include/linux/fs.h           | 23 +++++++++++++++++++++++
> include/uapi/linux/fadvise.h |  2 ++
> mm/fadvise.c                 | 17 +++++++++++++++++
> 5 files changed, 44 insertions(+)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index f00b16f45507..41885322ba64 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -149,6 +149,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
> 	inode->i_blocks = 0;
> 	inode->i_bytes = 0;
> 	inode->i_generation = 0;
> +	inode->i_streamid = 0;
> 	inode->i_pipe = NULL;
> 	inode->i_bdev = NULL;
> 	inode->i_cdev = NULL;
> diff --git a/fs/open.c b/fs/open.c
> index 33f9cbf2610b..4a9b2be1a674 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -743,6 +743,7 @@ static int do_dentry_open(struct file *f,
> 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
> 
> 	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
> +	f->f_streamid = 0;
> 
> 	return 0;
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b4d71b5e1ff2..43dde70c1d0d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -631,6 +631,7 @@ struct inode {
> 	};
> 
> 	__u32			i_generation;
> +	unsigned int		i_streamid;

Since there are only 8 bits of streamid being passed from userspace,
is it possible to declare this as a char and pack it into a hole so
that it doesn't increase the inode size for a functionality that most
people won't be using?  Maybe after i_bytes?  That could be increased
to unsigned short if needed without increasing the size of the inode.

Cheers, Andreas

> #ifdef CONFIG_FSNOTIFY
> 	__u32			i_fsnotify_mask; /* all events this inode cares about */
> @@ -640,6 +641,14 @@ struct inode {
> 	void			*i_private; /* fs or device private pointer */
> };
> 
> +static inline unsigned int inode_streamid(struct inode *inode)
> +{
> +	if (inode)
> +		return inode->i_streamid;
> +
> +	return 0;
> +}
> +
> static inline int inode_unhashed(struct inode *inode)
> {
> 	return hlist_unhashed(&inode->i_hash);
> @@ -820,6 +829,8 @@ struct file {
> 	const struct cred	*f_cred;
> 	struct file_ra_state	f_ra;
> 
> +	unsigned int		f_streamid;
> 	u64			f_version;
> #ifdef CONFIG_SECURITY
> 	void			*f_security;
> @@ -842,6 +853,18 @@ struct file_handle {
> 	unsigned char f_handle[0];
> };
> 
> +/*
> + * If the file doesn't have a stream ID set, return the inode stream ID
> + * in case that has been set.
> + */
> +static inline unsigned int file_streamid(struct file *f)
> +{
> +	if (f->f_streamid)
> +		return f->f_streamid;
> +
> +	return inode_streamid(f->f_inode);
> +}
> +
> static inline struct file *get_file(struct file *f)
> {
> 	atomic_long_inc(&f->f_count);
> diff --git a/include/uapi/linux/fadvise.h b/include/uapi/linux/fadvise.h
> index e8e747139b9a..3dc8a1ff1422 100644
> --- a/include/uapi/linux/fadvise.h
> +++ b/include/uapi/linux/fadvise.h
> @@ -18,4 +18,6 @@
> #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
> #endif
> 
> +#define POSIX_FADV_STREAMID	8 /* associate stream ID with file */
> +
> #endif	/* FADVISE_H_INCLUDED */
> diff --git a/mm/fadvise.c b/mm/fadvise.c
> index 4a3907cf79f8..b111a8899fb7 100644
> --- a/mm/fadvise.c
> +++ b/mm/fadvise.c
> @@ -60,6 +60,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
> 		case POSIX_FADV_WILLNEED:
> 		case POSIX_FADV_NOREUSE:
> 		case POSIX_FADV_DONTNEED:
> +		case POSIX_FADV_STREAMID:
> 			/* no bad return value, but ignore advice */
> 			break;
> 		default:
> @@ -144,6 +145,22 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
> 			}
> 		}
> 		break;
> +	case POSIX_FADV_STREAMID:
> +		/*
> +		 * streamid is stored in offset... we don't limit or check
> +		 * if the device supports streams, or if it does, if the
> +		 * stream nr is within the limits. 1 is the lowest valid
> +		 * stream id, 0 is "don't care/know".
> +		 */
> +		if (offset != (unsigned int) offset)
> +			ret = EINVAL;
> +		else {
> +			f.file->f_streamid = offset;
> +			spin_lock(&inode->i_lock);
> +			inode->i_streamid = offset;
> +			spin_unlock(&inode->i_lock);
> +		}
> +		break;
> 	default:
> 		ret = -EINVAL;
> 	}
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe April 18, 2015, 7:51 p.m. UTC | #4
On 04/09/2015 05:22 PM, Andreas Dilger wrote:
> On Mar 25, 2015, at 9:07 AM, Jens Axboe <axboe@fb.com> wrote:
>>
>> Writing on flash devices can be much more efficient, if we can
>> inform the device what kind of data can be grouped together. If
>> the device is able to group data together with similar lifetimes,
>> then it can be more efficient in garbage collection. This, in turn,
>> leads to lower write amplification, which is a win on both device
>> wear and performance.
>>
>> Add a new fadvise hint, POSIX_FADV_STREAMID, which sets the file
>> and inode streamid. The file streamid is used if we have the file
>> available at the time of the write (O_DIRECT), we use the inode
>> streamid if not (buffered writeback). The fadvise hint uses the
>> 'offset' field to specify a stream ID.
>>
>> Signed-off-by: Jens Axboe <axboe@fb.com>
>> ---
>> fs/inode.c                   |  1 +
>> fs/open.c                    |  1 +
>> include/linux/fs.h           | 23 +++++++++++++++++++++++
>> include/uapi/linux/fadvise.h |  2 ++
>> mm/fadvise.c                 | 17 +++++++++++++++++
>> 5 files changed, 44 insertions(+)
>>
>> diff --git a/fs/inode.c b/fs/inode.c
>> index f00b16f45507..41885322ba64 100644
>> --- a/fs/inode.c
>> +++ b/fs/inode.c
>> @@ -149,6 +149,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
>> 	inode->i_blocks = 0;
>> 	inode->i_bytes = 0;
>> 	inode->i_generation = 0;
>> +	inode->i_streamid = 0;
>> 	inode->i_pipe = NULL;
>> 	inode->i_bdev = NULL;
>> 	inode->i_cdev = NULL;
>> diff --git a/fs/open.c b/fs/open.c
>> index 33f9cbf2610b..4a9b2be1a674 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -743,6 +743,7 @@ static int do_dentry_open(struct file *f,
>> 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
>>
>> 	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
>> +	f->f_streamid = 0;
>>
>> 	return 0;
>>
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index b4d71b5e1ff2..43dde70c1d0d 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -631,6 +631,7 @@ struct inode {
>> 	};
>>
>> 	__u32			i_generation;
>> +	unsigned int		i_streamid;
>
> Since there are only 8 bits of streamid being passed from userspace,
> is it possible to declare this as a char and pack it into a hole so
> that it doesn't increase the inode size for a functionality that most
> people won't be using?  Maybe after i_bytes?  That could be increased
> to unsigned short if needed without increasing the size of the inode.

In the next version, I've retained the int, but ensured that they pack 
better in both struct file and struct inode. Basically fill some 
existing holes.
diff mbox

Patch

diff --git a/fs/inode.c b/fs/inode.c
index f00b16f45507..41885322ba64 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -149,6 +149,7 @@  int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_blocks = 0;
 	inode->i_bytes = 0;
 	inode->i_generation = 0;
+	inode->i_streamid = 0;
 	inode->i_pipe = NULL;
 	inode->i_bdev = NULL;
 	inode->i_cdev = NULL;
diff --git a/fs/open.c b/fs/open.c
index 33f9cbf2610b..4a9b2be1a674 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -743,6 +743,7 @@  static int do_dentry_open(struct file *f,
 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 
 	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
+	f->f_streamid = 0;
 
 	return 0;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b4d71b5e1ff2..43dde70c1d0d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -631,6 +631,7 @@  struct inode {
 	};
 
 	__u32			i_generation;
+	unsigned int		i_streamid;
 
 #ifdef CONFIG_FSNOTIFY
 	__u32			i_fsnotify_mask; /* all events this inode cares about */
@@ -640,6 +641,14 @@  struct inode {
 	void			*i_private; /* fs or device private pointer */
 };
 
+static inline unsigned int inode_streamid(struct inode *inode)
+{
+	if (inode)
+		return inode->i_streamid;
+
+	return 0;
+}
+
 static inline int inode_unhashed(struct inode *inode)
 {
 	return hlist_unhashed(&inode->i_hash);
@@ -820,6 +829,8 @@  struct file {
 	const struct cred	*f_cred;
 	struct file_ra_state	f_ra;
 
+	unsigned int		f_streamid;
+
 	u64			f_version;
 #ifdef CONFIG_SECURITY
 	void			*f_security;
@@ -842,6 +853,18 @@  struct file_handle {
 	unsigned char f_handle[0];
 };
 
+/*
+ * If the file doesn't have a stream ID set, return the inode stream ID
+ * in case that has been set.
+ */
+static inline unsigned int file_streamid(struct file *f)
+{
+	if (f->f_streamid)
+		return f->f_streamid;
+
+	return inode_streamid(f->f_inode);
+}
+
 static inline struct file *get_file(struct file *f)
 {
 	atomic_long_inc(&f->f_count);
diff --git a/include/uapi/linux/fadvise.h b/include/uapi/linux/fadvise.h
index e8e747139b9a..3dc8a1ff1422 100644
--- a/include/uapi/linux/fadvise.h
+++ b/include/uapi/linux/fadvise.h
@@ -18,4 +18,6 @@ 
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
 #endif
 
+#define POSIX_FADV_STREAMID	8 /* associate stream ID with file */
+
 #endif	/* FADVISE_H_INCLUDED */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4a3907cf79f8..b111a8899fb7 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -60,6 +60,7 @@  SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		case POSIX_FADV_WILLNEED:
 		case POSIX_FADV_NOREUSE:
 		case POSIX_FADV_DONTNEED:
+		case POSIX_FADV_STREAMID:
 			/* no bad return value, but ignore advice */
 			break;
 		default:
@@ -144,6 +145,22 @@  SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 			}
 		}
 		break;
+	case POSIX_FADV_STREAMID:
+		/*
+		 * streamid is stored in offset... we don't limit or check
+		 * if the device supports streams, or if it does, if the
+		 * stream nr is within the limits. 1 is the lowest valid
+		 * stream id, 0 is "don't care/know".
+		 */
+		if (offset != (unsigned int) offset)
+			ret = EINVAL;
+		else {
+			f.file->f_streamid = offset;
+			spin_lock(&inode->i_lock);
+			inode->i_streamid = offset;
+			spin_unlock(&inode->i_lock);
+		}
+		break;
 	default:
 		ret = -EINVAL;
 	}