diff mbox

[04/11] fs: add support for allowing applications to pass in write life time hints

Message ID 1497498312-17704-5-git-send-email-axboe@kernel.dk (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe June 15, 2017, 3:45 a.m. UTC
Add four flags for the pwritev2(2) system call, allowing an application
to give the kernel a hint about what on-media life times can be
expected from a given write.

The intent is for these values to be relative to each other, no
absolute meaning should be attached to these flag names.

Define IOCB flags to carry this over this information from the pwritev2
RWF_WRITE_LIFE_* flags.

Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/read_write.c         |  9 ++++++++-
 include/linux/fs.h      | 12 ++++++++++++
 include/uapi/linux/fs.h | 10 ++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

Comments

Darrick J. Wong June 15, 2017, 4:15 a.m. UTC | #1
On Wed, Jun 14, 2017 at 09:45:05PM -0600, Jens Axboe wrote:
> Add four flags for the pwritev2(2) system call, allowing an application
> to give the kernel a hint about what on-media life times can be
> expected from a given write.
> 
> The intent is for these values to be relative to each other, no
> absolute meaning should be attached to these flag names.
> 
> Define IOCB flags to carry this over this information from the pwritev2
> RWF_WRITE_LIFE_* flags.
> 
> Reviewed-by: Andreas Dilger <adilger@dilger.ca>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/read_write.c         |  9 ++++++++-
>  include/linux/fs.h      | 12 ++++++++++++
>  include/uapi/linux/fs.h | 10 ++++++++++
>  3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d4484df9..9cb2314efca3 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -678,7 +678,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
>  	struct kiocb kiocb;
>  	ssize_t ret;
>  
> -	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
> +	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_WRITE_LIFE_MASK))
>  		return -EOPNOTSUPP;
>  
>  	init_sync_kiocb(&kiocb, filp);
> @@ -688,6 +688,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
>  		kiocb.ki_flags |= IOCB_DSYNC;
>  	if (flags & RWF_SYNC)
>  		kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
> +	if (flags & RWF_WRITE_LIFE_MASK) {
> +		struct inode *inode = file_inode(filp);
> +
> +		inode->i_write_hint = (flags & RWF_WRITE_LIFE_MASK) >>
> +					RWF_WRITE_LIFE_SHIFT;

Hmm, so once set, hints stick around until someone else sets a different
one.  I suppose it's unlikely that you'd have two programs writing to
the same inode with different write hints, right?

Just wondering if anyone will be surprised that they thought they were
writing to an _EXTREME hint file but someone else switched it to _SHORT
on them.

Also, how does userspace query the write hint value once set?

> +		kiocb.ki_flags |= inode->i_write_hint << IOCB_WRITE_LIFE_SHIFT;
> +	}
>  	kiocb.ki_pos = *ppos;
>  
>  	if (type == READ)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f4f9df8ed059..63798b67fcfe 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -269,6 +269,12 @@ struct writeback_control;
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  
> +/*
> + * Steal 4-bits for stream information, this allows 16 valid streams
> + */
> +#define IOCB_WRITE_LIFE_SHIFT	7
> +#define IOCB_WRITE_LIFE_MASK	(BIT(7) | BIT(8) | BIT(9) | BIT(10))
> +
>  struct kiocb {
>  	struct file		*ki_filp;
>  	loff_t			ki_pos;
> @@ -292,6 +298,12 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
>  	};
>  }
>  
> +static inline int iocb_write_hint(const struct kiocb *iocb)
> +{
> +	return (iocb->ki_flags & IOCB_WRITE_LIFE_MASK) >>
> +			IOCB_WRITE_LIFE_SHIFT;
> +}
> +
>  /*
>   * "descriptor" for what we're up to with a read.
>   * This allows us to use the same read code yet
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 24e61a54feaa..58b7ee06b380 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -361,4 +361,14 @@ struct fscrypt_key {
>  #define RWF_DSYNC			0x00000002 /* per-IO O_DSYNC */
>  #define RWF_SYNC			0x00000004 /* per-IO O_SYNC */
>  
> +/*
> + * Data life time write flags, steal 4 bits for that
> + */
> +#define RWF_WRITE_LIFE_SHIFT		4
> +#define RWF_WRITE_LIFE_MASK		0x000000f0 /* 4 bits of stream ID */
> +#define RWF_WRITE_LIFE_SHORT		(1 << RWF_WRITE_LIFE_SHIFT)
> +#define RWF_WRITE_LIFE_MEDIUM		(2 << RWF_WRITE_LIFE_SHIFT)
> +#define RWF_WRITE_LIFE_LONG		(3 << RWF_WRITE_LIFE_SHIFT)
> +#define RWF_WRITE_LIFE_EXTREME		(4 << RWF_WRITE_LIFE_SHIFT)

Should O_TMPFILE files ought to be created with i_write_hint =
RWF_WRITE_LIFE_SHORT by default?

--D

> +
>  #endif /* _UAPI_LINUX_FS_H */
> -- 
> 2.7.4
>
Jens Axboe June 15, 2017, 4:33 a.m. UTC | #2
On 06/14/2017 10:15 PM, Darrick J. Wong wrote:
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 47c1d4484df9..9cb2314efca3 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -678,7 +678,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
>>  	struct kiocb kiocb;
>>  	ssize_t ret;
>>  
>> -	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
>> +	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_WRITE_LIFE_MASK))
>>  		return -EOPNOTSUPP;
>>  
>>  	init_sync_kiocb(&kiocb, filp);
>> @@ -688,6 +688,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
>>  		kiocb.ki_flags |= IOCB_DSYNC;
>>  	if (flags & RWF_SYNC)
>>  		kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
>> +	if (flags & RWF_WRITE_LIFE_MASK) {
>> +		struct inode *inode = file_inode(filp);
>> +
>> +		inode->i_write_hint = (flags & RWF_WRITE_LIFE_MASK) >>
>> +					RWF_WRITE_LIFE_SHIFT;
> 
> Hmm, so once set, hints stick around until someone else sets a different
> one.  I suppose it's unlikely that you'd have two programs writing to
> the same inode with different write hints, right?

You'd hope so... There's really no good way to support that with
buffered writes. For the NVMe use case, you'd be no worse off than you
were without hints, however.

But I do think one change should be made above - we only reset the hint
if someone passes a new hint in. But we probably also want to do so for
the case where no hint is passed in, but one is currently set.

> Also, how does userspace query the write hint value once set?

It doesn't. Ideally this hint would be "for this write only", but that's
not really possible with deferred write back.

>> +/*
>> + * Data life time write flags, steal 4 bits for that
>> + */
>> +#define RWF_WRITE_LIFE_SHIFT		4
>> +#define RWF_WRITE_LIFE_MASK		0x000000f0 /* 4 bits of stream ID */
>> +#define RWF_WRITE_LIFE_SHORT		(1 << RWF_WRITE_LIFE_SHIFT)
>> +#define RWF_WRITE_LIFE_MEDIUM		(2 << RWF_WRITE_LIFE_SHIFT)
>> +#define RWF_WRITE_LIFE_LONG		(3 << RWF_WRITE_LIFE_SHIFT)
>> +#define RWF_WRITE_LIFE_EXTREME		(4 << RWF_WRITE_LIFE_SHIFT)
> 
> Should O_TMPFILE files ought to be created with i_write_hint =
> RWF_WRITE_LIFE_SHORT by default?

The answer here is "it depends". If we're already using hints on that
device, then yes, O_TMPFILE is a clear candidate for
RWF_WRITE_LIFE_SHORT. However, if we are not, then we should not set it
as it may have implications on how the device manages writes. For that
case we'll potentially only be driving a single stream, short writes,
and that may not be enough to saturate device bandwidth.

I would rather leave that for future experimentation. There are similar
things we can do with short lived writes, like apply them to the log
writes in the file system. But all of that should be built on top of
what we end up agreeing on, not included from the get-go. I'd rather get
the basic usage and model going first before we further complicate
matters.
Christoph Hellwig June 15, 2017, 8:19 a.m. UTC | #3
I think Darrick has a very valid concern here - using RWF_* flags
to affect inode or fd-wide state is extremely counter productive.

Combined with the fact that the streams need a special setup in NVMe
I'm tempted to say that the interface really should be fadvise or
similar, which would keep the setup out of the I/O path and make clear
it's a sticky interface.  For direct I/O RWF_* would make some sense,
but we'd still have to deal with the setup issue.
Al Viro June 15, 2017, 11:24 a.m. UTC | #4
On Wed, Jun 14, 2017 at 09:15:03PM -0700, Darrick J. Wong wrote:
> > + */
> > +#define RWF_WRITE_LIFE_SHIFT		4
> > +#define RWF_WRITE_LIFE_MASK		0x000000f0 /* 4 bits of stream ID */
> > +#define RWF_WRITE_LIFE_SHORT		(1 << RWF_WRITE_LIFE_SHIFT)
> > +#define RWF_WRITE_LIFE_MEDIUM		(2 << RWF_WRITE_LIFE_SHIFT)
> > +#define RWF_WRITE_LIFE_LONG		(3 << RWF_WRITE_LIFE_SHIFT)
> > +#define RWF_WRITE_LIFE_EXTREME		(4 << RWF_WRITE_LIFE_SHIFT)
> 
> Should O_TMPFILE files ought to be created with i_write_hint =
> RWF_WRITE_LIFE_SHORT by default?

Depends...  One of the uses for that is to have them linked into permanent
place once completely written...
Jens Axboe June 15, 2017, 2:21 p.m. UTC | #5
On 06/15/2017 02:19 AM, Christoph Hellwig wrote:
> I think Darrick has a very valid concern here - using RWF_* flags
> to affect inode or fd-wide state is extremely counter productive.
> 
> Combined with the fact that the streams need a special setup in NVMe
> I'm tempted to say that the interface really should be fadvise or
> similar, which would keep the setup out of the I/O path and make clear
> it's a sticky interface.  For direct I/O RWF_* would make some sense,
> but we'd still have to deal with the setup issue.

OK, which is exactly how I had implemented the interface 2 years ago.
I can resurrect that part and dump the RWF_* flags. I agree the RWF_*
flags are confusing for buffered IO, since they are persistent. For
O_DIRECT, they make more sense. So the question is if we want to
retain the RWF_WRITE_LIFE_* hints at all, or simply go back to the
fadvise with something ala:

POSIX_FADV_WRITE_HINT_SET	Set write life time hint
POSIX_FADV_WRITE_HINT_GET	Get write life time hint

I'd still very much like to stick to the same four hints, and not
require any special setup. I think the lazy setup for nvme is fine. Some
devices can do it at probe time, others will want to do it lazily to not
waste resources. Do we really want to have the HINT_SET bubble down to
the device and allocate streams?

I want to keep this simple to use. The RWF_WRITE_LIFE_* are easy to use
and adopt. But I do agree that being able to query the setting is
useful, and then we may as well introduce a get/set fadvise interface
for doing so.

Do people like the S_WRITE_LIFE_* part, or should we keep the
i_write_hint and just shrink it to 8 bytes?
Jens Axboe June 15, 2017, 3:23 p.m. UTC | #6
On 06/15/2017 08:21 AM, Jens Axboe wrote:
> On 06/15/2017 02:19 AM, Christoph Hellwig wrote:
>> I think Darrick has a very valid concern here - using RWF_* flags
>> to affect inode or fd-wide state is extremely counter productive.
>>
>> Combined with the fact that the streams need a special setup in NVMe
>> I'm tempted to say that the interface really should be fadvise or
>> similar, which would keep the setup out of the I/O path and make clear
>> it's a sticky interface.  For direct I/O RWF_* would make some sense,
>> but we'd still have to deal with the setup issue.
> 
> OK, which is exactly how I had implemented the interface 2 years ago.
> I can resurrect that part and dump the RWF_* flags. I agree the RWF_*
> flags are confusing for buffered IO, since they are persistent. For
> O_DIRECT, they make more sense. So the question is if we want to
> retain the RWF_WRITE_LIFE_* hints at all, or simply go back to the
> fadvise with something ala:
> 
> POSIX_FADV_WRITE_HINT_SET	Set write life time hint
> POSIX_FADV_WRITE_HINT_GET	Get write life time hint

And then I remembered why fadvise _sucks_. It returns the error value
directly. So 0 is success > 0 is some error. That does not work well
for adding a set/get interface. Additionally, with fadvise, we have
to overload either 'offset' or 'length' for the write hint for the
set operation. Not super pretty.

Any objections to making the auxiliary interface fcntl(2) based?
That would be a cleaner fit, imho.
Christoph Hellwig June 16, 2017, 7:30 a.m. UTC | #7
On Thu, Jun 15, 2017 at 09:23:02AM -0600, Jens Axboe wrote:
> And then I remembered why fadvise _sucks_. It returns the error value
> directly. So 0 is success > 0 is some error. That does not work well
> for adding a set/get interface. Additionally, with fadvise, we have
> to overload either 'offset' or 'length' for the write hint for the
> set operation. Not super pretty.
> 
> Any objections to making the auxiliary interface fcntl(2) based?
> That would be a cleaner fit, imho.

fcntl sounds fine to me.
Christoph Hellwig June 16, 2017, 7:33 a.m. UTC | #8
On Thu, Jun 15, 2017 at 08:21:18AM -0600, Jens Axboe wrote:
> I'd still very much like to stick to the same four hints, and not

Agreed.  In fact I'd go a little further:  we should have a

	u16 hints;

that goes all the way down from fcntl to the driver, right now
we'll allocate the first 3 bits for the write lifetime hints (2.5,
so we have one value spare, as they don't need to flags but can be
enum values), leaving more space for other kinds of hints.

> require any special setup. I think the lazy setup for nvme is fine. Some
> devices can do it at probe time, others will want to do it lazily to not
> waste resources. Do we really want to have the HINT_SET bubble down to
> the device and allocate streams?

If I look at the mess for allocating the streams I think we need it
to bubble down.  That way the device can allocate the stream at time
of the fcntl and we can keep the low level driver very simple.
Jens Axboe June 16, 2017, 2:35 p.m. UTC | #9
On 06/16/2017 01:33 AM, Christoph Hellwig wrote:
> On Thu, Jun 15, 2017 at 08:21:18AM -0600, Jens Axboe wrote:
>> I'd still very much like to stick to the same four hints, and not
> 
> Agreed.  In fact I'd go a little further:  we should have a
> 
> 	u16 hints;
> 
> that goes all the way down from fcntl to the driver, right now
> we'll allocate the first 3 bits for the write lifetime hints (2.5,
> so we have one value spare, as they don't need to flags but can be
> enum values), leaving more space for other kinds of hints.

Did you see v5? It adds enum write_hint and passes it all the way down,
until we transform them into rq/bio flags.

>> require any special setup. I think the lazy setup for nvme is fine. Some
>> devices can do it at probe time, others will want to do it lazily to not
>> waste resources. Do we really want to have the HINT_SET bubble down to
>> the device and allocate streams?
> 
> If I look at the mess for allocating the streams I think we need it
> to bubble down.  That way the device can allocate the stream at time
> of the fcntl and we can keep the low level driver very simple.

Mess? The NVMe code seems pretty straight forward to me. Is it purely
the lazy alloc part you're referring to? I'm fine with bubbling down the
hint and setting everything up inline from the fcntl() call, but that
means we need to make things like btrfs and md/dm aware of it and
resolve all mappings. That sort-of sucks, especially since they don't
otherwise need to know about it.

If another driver needs similar setup like NVMe, I'd much rather just
fix it up there. From the API point of view, there would be no changes.
Jens Axboe June 16, 2017, 2:35 p.m. UTC | #10
On 06/16/2017 01:30 AM, Christoph Hellwig wrote:
> On Thu, Jun 15, 2017 at 09:23:02AM -0600, Jens Axboe wrote:
>> And then I remembered why fadvise _sucks_. It returns the error value
>> directly. So 0 is success > 0 is some error. That does not work well
>> for adding a set/get interface. Additionally, with fadvise, we have
>> to overload either 'offset' or 'length' for the write hint for the
>> set operation. Not super pretty.
>>
>> Any objections to making the auxiliary interface fcntl(2) based?
>> That would be a cleaner fit, imho.
> 
> fcntl sounds fine to me.

v5 implemented that, much better fit than fadvise.
Jens Axboe June 16, 2017, 2:53 p.m. UTC | #11
On 06/16/2017 08:35 AM, Jens Axboe wrote:
>> If I look at the mess for allocating the streams I think we need it
>> to bubble down.  That way the device can allocate the stream at time
>> of the fcntl and we can keep the low level driver very simple.
> 
> Mess? The NVMe code seems pretty straight forward to me. Is it purely
> the lazy alloc part you're referring to? I'm fine with bubbling down the
> hint and setting everything up inline from the fcntl() call, but that
> means we need to make things like btrfs and md/dm aware of it and
> resolve all mappings. That sort-of sucks, especially since they don't
> otherwise need to know about it.

This is what the old code looked like, back when it was implemented as
managed streams, for btrfs:

http://git.kernel.dk/cgit/linux-block/commit/?id=93b4d9370250ea9273e1e71775df85964ff52922

We don't need the close part anymore, but the device iteration for
constituent devices would be the same. On top of that, we'd have to
touch all of drivers/md/ as well, or it won't work for anyone that's
using raid or dm.

Why not just let it resolve lazily? That way we don't have to touch
drivers that need not know about this, since the information is already
being passed all the way down without having to modify drivers.

If you object to the nvme driver open coding this, I can put that in
block instead and catch it earlier. Then nvme would not have to worry
about it. But honestly, I'd much rather make it generic once we have
another user that needs this, than do it up front.
Christoph Hellwig June 16, 2017, 3:52 p.m. UTC | #12
On Fri, Jun 16, 2017 at 08:35:07AM -0600, Jens Axboe wrote:
> > Agreed.  In fact I'd go a little further:  we should have a
> > 
> > 	u16 hints;
> > 
> > that goes all the way down from fcntl to the driver, right now
> > we'll allocate the first 3 bits for the write lifetime hints (2.5,
> > so we have one value spare, as they don't need to flags but can be
> > enum values), leaving more space for other kinds of hints.
> 
> Did you see v5? It adds enum write_hint and passes it all the way down,
> until we transform them into rq/bio flags.

Yes.  But with all the way down I mean all the way down to the driver :)

> Mess? The NVMe code seems pretty straight forward to me. Is it purely
> the lazy alloc part you're referring to?

Yes.

> I'm fine with bubbling down the
> hint and setting everything up inline from the fcntl() call, but that
> means we need to make things like btrfs and md/dm aware of it and
> resolve all mappings. That sort-of sucks, especially since they don't
> otherwise need to know about it.

True, that sucks.  But taking the hint and doing things behind the
scenes sounds nasty.

> If another driver needs similar setup like NVMe, I'd much rather just
> fix it up there. From the API point of view, there would be no changes.

Ok..
Jens Axboe June 16, 2017, 3:59 p.m. UTC | #13
On 06/16/2017 09:52 AM, Christoph Hellwig wrote:
> On Fri, Jun 16, 2017 at 08:35:07AM -0600, Jens Axboe wrote:
>>> Agreed.  In fact I'd go a little further:  we should have a
>>>
>>> 	u16 hints;
>>>
>>> that goes all the way down from fcntl to the driver, right now
>>> we'll allocate the first 3 bits for the write lifetime hints (2.5,
>>> so we have one value spare, as they don't need to flags but can be
>>> enum values), leaving more space for other kinds of hints.
>>
>> Did you see v5? It adds enum write_hint and passes it all the way down,
>> until we transform them into rq/bio flags.
> 
> Yes.  But with all the way down I mean all the way down to the driver :)

Only missing part is the request flags. And why make that any different
than the flags we already have now? It'd be trivial to pack the value
into the request flags as well, but I'm struggling to see the point of
that, honestly.

Please expand on why you think changing the request flags to also
carry that value would be useful, as opposed to just mapping it when
we setup the request. If you have a valid concern I don't mind making
the change, but I just don't see one right now.

>> Mess? The NVMe code seems pretty straight forward to me. Is it purely
>> the lazy alloc part you're referring to?
> 
> Yes.
> 
>> I'm fine with bubbling down the
>> hint and setting everything up inline from the fcntl() call, but that
>> means we need to make things like btrfs and md/dm aware of it and
>> resolve all mappings. That sort-of sucks, especially since they don't
>> otherwise need to know about it.
> 
> True, that sucks.  But taking the hint and doing things behind the
> scenes sounds nasty.

How so? It's pretty straight forward. The only downside I see is that
we'll have a delay between seeing the first valid write hint and to
the time when nvme has it enabled. But that's so short, and in the
grander scheme of things, doesn't matter one bit.

The alternative is a bunch of infrastructure to do this inline. I
think that choice is pretty clear here.

>> If another driver needs similar setup like NVMe, I'd much rather just
>> fix it up there. From the API point of view, there would be no changes.
> 
> Ok..
>
Christoph Hellwig June 16, 2017, 6:02 p.m. UTC | #14
On Fri, Jun 16, 2017 at 09:59:38AM -0600, Jens Axboe wrote:
> >> Did you see v5? It adds enum write_hint and passes it all the way down,
> >> until we transform them into rq/bio flags.
> > 
> > Yes.  But with all the way down I mean all the way down to the driver :)
> 
> Only missing part is the request flags. And why make that any different
> than the flags we already have now? It'd be trivial to pack the value
> into the request flags as well, but I'm struggling to see the point of
> that, honestly.
> 
> Please expand on why you think changing the request flags to also
> carry that value would be useful, as opposed to just mapping it when
> we setup the request. If you have a valid concern I don't mind making
> the change, but I just don't see one right now.

Mostly so that we can pass the value down in one form through the whole
stack.

> How so? It's pretty straight forward. The only downside I see is that
> we'll have a delay between seeing the first valid write hint and to
> the time when nvme has it enabled. But that's so short, and in the
> grander scheme of things, doesn't matter one bit.

I'll take your word for that.  To be honest I hope future standards
won't make the mistake to come up with something as ugly as streams
and just take something like our hints on a per-I/O basis..
Jens Axboe June 16, 2017, 7:35 p.m. UTC | #15
On 06/16/2017 12:02 PM, Christoph Hellwig wrote:
> On Fri, Jun 16, 2017 at 09:59:38AM -0600, Jens Axboe wrote:
>>>> Did you see v5? It adds enum write_hint and passes it all the way down,
>>>> until we transform them into rq/bio flags.
>>>
>>> Yes.  But with all the way down I mean all the way down to the driver :)
>>
>> Only missing part is the request flags. And why make that any different
>> than the flags we already have now? It'd be trivial to pack the value
>> into the request flags as well, but I'm struggling to see the point of
>> that, honestly.
>>
>> Please expand on why you think changing the request flags to also
>> carry that value would be useful, as opposed to just mapping it when
>> we setup the request. If you have a valid concern I don't mind making
>> the change, but I just don't see one right now.
> 
> Mostly so that we can pass the value down in one form through the whole
> stack.

OK good, so we have that now. I guess the one change we could further make
is have the write hints be a subset of the possible hints. I already made
changes to reflect that in the latest posting, where we have a rw_hint
enum. I haven't made any division of it into read vs write hints, but that
could always be done and it won't impact the API since we can just keep
the write hints in the lower 8-16 bits, or whatever we choose.

>> How so? It's pretty straight forward. The only downside I see is that
>> we'll have a delay between seeing the first valid write hint and to
>> the time when nvme has it enabled. But that's so short, and in the
>> grander scheme of things, doesn't matter one bit.
> 
> I'll take your word for that.  To be honest I hope future standards
> won't make the mistake to come up with something as ugly as streams
> and just take something like our hints on a per-I/O basis..

Well, at least streams is better than how it started out...
diff mbox

Patch

diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d4484df9..9cb2314efca3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -678,7 +678,7 @@  static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 	struct kiocb kiocb;
 	ssize_t ret;
 
-	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
+	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_WRITE_LIFE_MASK))
 		return -EOPNOTSUPP;
 
 	init_sync_kiocb(&kiocb, filp);
@@ -688,6 +688,13 @@  static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 		kiocb.ki_flags |= IOCB_DSYNC;
 	if (flags & RWF_SYNC)
 		kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+	if (flags & RWF_WRITE_LIFE_MASK) {
+		struct inode *inode = file_inode(filp);
+
+		inode->i_write_hint = (flags & RWF_WRITE_LIFE_MASK) >>
+					RWF_WRITE_LIFE_SHIFT;
+		kiocb.ki_flags |= inode->i_write_hint << IOCB_WRITE_LIFE_SHIFT;
+	}
 	kiocb.ki_pos = *ppos;
 
 	if (type == READ)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f4f9df8ed059..63798b67fcfe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -269,6 +269,12 @@  struct writeback_control;
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 
+/*
+ * Steal 4-bits for stream information, this allows 16 valid streams
+ */
+#define IOCB_WRITE_LIFE_SHIFT	7
+#define IOCB_WRITE_LIFE_MASK	(BIT(7) | BIT(8) | BIT(9) | BIT(10))
+
 struct kiocb {
 	struct file		*ki_filp;
 	loff_t			ki_pos;
@@ -292,6 +298,12 @@  static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 	};
 }
 
+static inline int iocb_write_hint(const struct kiocb *iocb)
+{
+	return (iocb->ki_flags & IOCB_WRITE_LIFE_MASK) >>
+			IOCB_WRITE_LIFE_SHIFT;
+}
+
 /*
  * "descriptor" for what we're up to with a read.
  * This allows us to use the same read code yet
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 24e61a54feaa..58b7ee06b380 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -361,4 +361,14 @@  struct fscrypt_key {
 #define RWF_DSYNC			0x00000002 /* per-IO O_DSYNC */
 #define RWF_SYNC			0x00000004 /* per-IO O_SYNC */
 
+/*
+ * Data life time write flags, steal 4 bits for that
+ */
+#define RWF_WRITE_LIFE_SHIFT		4
+#define RWF_WRITE_LIFE_MASK		0x000000f0 /* 4 bits of stream ID */
+#define RWF_WRITE_LIFE_SHORT		(1 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_MEDIUM		(2 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_LONG		(3 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_EXTREME		(4 << RWF_WRITE_LIFE_SHIFT)
+
 #endif /* _UAPI_LINUX_FS_H */