diff mbox

[3/6] direct-io: add support for write stream IDs

Message ID 1427210823-5283-4-git-send-email-axboe@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe March 24, 2015, 3:27 p.m. UTC
Get the streamid from the file, if any, and set it on the bio.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/direct-io.c | 4 ++++
 1 file changed, 4 insertions(+)

Comments

Dave Chinner March 25, 2015, 2:43 a.m. UTC | #1
On Tue, Mar 24, 2015 at 09:27:00AM -0600, Jens Axboe wrote:
> Get the streamid from the file, if any, and set it on the bio.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
>  fs/direct-io.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index e181b6b2e297..5d2750346451 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -77,6 +77,7 @@ struct dio_submit {
>  	int reap_counter;		/* rate limit reaping */
>  	sector_t final_block_in_request;/* doesn't change */
>  	int boundary;			/* prev block is at a boundary */
> +	int streamid;			/* Write stream ID */
>  	get_block_t *get_block;		/* block mapping function */
>  	dio_submit_t *submit_io;	/* IO submition function */
>  
> @@ -372,6 +373,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
>  
>  	sdio->bio = bio;
>  	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
> +
> +	bio_set_streamid(bio, sdio->streamid);
>  }
>  
>  /*
> @@ -1205,6 +1208,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
>  	sdio.blkbits = blkbits;
>  	sdio.blkfactor = i_blkbits - blkbits;
>  	sdio.block_in_file = offset >> blkbits;
> +	sdio.streamid = iocb->ki_filp->f_streamid;

If iocb->ki_filp->f_streamid is not set, then it should fall back to
whatever is set on the inode->i_streamid.

Cheers,

Dave.
Jens Axboe March 25, 2015, 2:26 p.m. UTC | #2
On 03/24/2015 08:43 PM, Dave Chinner wrote:
> On Tue, Mar 24, 2015 at 09:27:00AM -0600, Jens Axboe wrote:
>> Get the streamid from the file, if any, and set it on the bio.
>>
>> Signed-off-by: Jens Axboe <axboe@fb.com>
>> ---
>>   fs/direct-io.c | 4 ++++
>>   1 file changed, 4 insertions(+)
>>
>> diff --git a/fs/direct-io.c b/fs/direct-io.c
>> index e181b6b2e297..5d2750346451 100644
>> --- a/fs/direct-io.c
>> +++ b/fs/direct-io.c
>> @@ -77,6 +77,7 @@ struct dio_submit {
>>   	int reap_counter;		/* rate limit reaping */
>>   	sector_t final_block_in_request;/* doesn't change */
>>   	int boundary;			/* prev block is at a boundary */
>> +	int streamid;			/* Write stream ID */
>>   	get_block_t *get_block;		/* block mapping function */
>>   	dio_submit_t *submit_io;	/* IO submition function */
>>
>> @@ -372,6 +373,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
>>
>>   	sdio->bio = bio;
>>   	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
>> +
>> +	bio_set_streamid(bio, sdio->streamid);
>>   }
>>
>>   /*
>> @@ -1205,6 +1208,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
>>   	sdio.blkbits = blkbits;
>>   	sdio.blkfactor = i_blkbits - blkbits;
>>   	sdio.block_in_file = offset >> blkbits;
>> +	sdio.streamid = iocb->ki_filp->f_streamid;
>
> If iocb->ki_filp->f_streamid is not set, then it should fall back to
> whatever is set on the inode->i_streamid.

Good point, agree. Will make that change.
Ming Lin April 10, 2015, 11:50 p.m. UTC | #3
On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 03/24/2015 08:43 PM, Dave Chinner wrote:
>>
>> On Tue, Mar 24, 2015 at 09:27:00AM -0600, Jens Axboe wrote:
>>>
>>> Get the streamid from the file, if any, and set it on the bio.
>>>
>>> Signed-off-by: Jens Axboe <axboe@fb.com>
>>> ---
>>>   fs/direct-io.c | 4 ++++
>>>   1 file changed, 4 insertions(+)
>>>
>>> diff --git a/fs/direct-io.c b/fs/direct-io.c
>>> index e181b6b2e297..5d2750346451 100644
>>> --- a/fs/direct-io.c
>>> +++ b/fs/direct-io.c
>>> @@ -77,6 +77,7 @@ struct dio_submit {
>>>         int reap_counter;               /* rate limit reaping */
>>>         sector_t final_block_in_request;/* doesn't change */
>>>         int boundary;                   /* prev block is at a boundary */
>>> +       int streamid;                   /* Write stream ID */
>>>         get_block_t *get_block;         /* block mapping function */
>>>         dio_submit_t *submit_io;        /* IO submition function */
>>>
>>> @@ -372,6 +373,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit
>>> *sdio,
>>>
>>>         sdio->bio = bio;
>>>         sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
>>> +
>>> +       bio_set_streamid(bio, sdio->streamid);
>>>   }
>>>
>>>   /*
>>> @@ -1205,6 +1208,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb,
>>> struct inode *inode,
>>>         sdio.blkbits = blkbits;
>>>         sdio.blkfactor = i_blkbits - blkbits;
>>>         sdio.block_in_file = offset >> blkbits;
>>> +       sdio.streamid = iocb->ki_filp->f_streamid;
>>
>>
>> If iocb->ki_filp->f_streamid is not set, then it should fall back to
>> whatever is set on the inode->i_streamid.

Why should it do the fallback?

>
>
> Good point, agree. Will make that change.

Hi Jens,

That change causes a problem for direct IO, for example:

process 1:
fd = open("/dev/nvme0n1", O_DIRECT...);
//set stream_id 1
fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
pwrite(fd, ....);

process 2:
fd = open("/dev/nvme0n1", O_DIRECT...);
//should be legacy stream_id 0
pwrite(fd, ....);

But now process 2 also sees stream_id 1, which is wrong.

Thanks,
Ming

>
>
> --
> Jens Axboe
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lin April 11, 2015, 12:06 a.m. UTC | #4
On Fri, Apr 10, 2015 at 4:50 PM, Ming Lin <mlin@kernel.org> wrote:
> On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
>> On 03/24/2015 08:43 PM, Dave Chinner wrote:
>>>
>>> On Tue, Mar 24, 2015 at 09:27:00AM -0600, Jens Axboe wrote:
>>>>
>>>> Get the streamid from the file, if any, and set it on the bio.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@fb.com>
>>>> ---
>>>>   fs/direct-io.c | 4 ++++
>>>>   1 file changed, 4 insertions(+)
>>>>
>>>> diff --git a/fs/direct-io.c b/fs/direct-io.c
>>>> index e181b6b2e297..5d2750346451 100644
>>>> --- a/fs/direct-io.c
>>>> +++ b/fs/direct-io.c
>>>> @@ -77,6 +77,7 @@ struct dio_submit {
>>>>         int reap_counter;               /* rate limit reaping */
>>>>         sector_t final_block_in_request;/* doesn't change */
>>>>         int boundary;                   /* prev block is at a boundary */
>>>> +       int streamid;                   /* Write stream ID */
>>>>         get_block_t *get_block;         /* block mapping function */
>>>>         dio_submit_t *submit_io;        /* IO submition function */
>>>>
>>>> @@ -372,6 +373,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit
>>>> *sdio,
>>>>
>>>>         sdio->bio = bio;
>>>>         sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
>>>> +
>>>> +       bio_set_streamid(bio, sdio->streamid);
>>>>   }
>>>>
>>>>   /*
>>>> @@ -1205,6 +1208,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb,
>>>> struct inode *inode,
>>>>         sdio.blkbits = blkbits;
>>>>         sdio.blkfactor = i_blkbits - blkbits;
>>>>         sdio.block_in_file = offset >> blkbits;
>>>> +       sdio.streamid = iocb->ki_filp->f_streamid;
>>>
>>>
>>> If iocb->ki_filp->f_streamid is not set, then it should fall back to
>>> whatever is set on the inode->i_streamid.
>
> Why should do the fall back?
>
>>
>>
>> Good point, agree. Will make that change.
>
> Hi Jens,
>
> That change causes problem for direct IO, for example

Actually, buffered writes also have a problem.

process 1:
fd = open("/mnt/foo.txt", ...);
//set stream_id 1
fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
write(fd, ....);

process 1 exits, but the stream_id is still saved in the inode

later process 2 starts,
and at writeback it will see stream_id 1 from the inode although it
didn't set a stream_id at all

process 2:
fd = open("/mnt/foo.txt", ...);
write(fd, ....);

Thanks,
Ming
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner April 11, 2015, 11:59 a.m. UTC | #5
On Fri, Apr 10, 2015 at 04:50:05PM -0700, Ming Lin wrote:
> On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
> > On 03/24/2015 08:43 PM, Dave Chinner wrote:
> >>
> >> On Tue, Mar 24, 2015 at 09:27:00AM -0600, Jens Axboe wrote:
> >>>
> >>> Get the streamid from the file, if any, and set it on the bio.
> >>>
> >>> Signed-off-by: Jens Axboe <axboe@fb.com>
> >>> ---
> >>>   fs/direct-io.c | 4 ++++
> >>>   1 file changed, 4 insertions(+)
> >>>
> >>> diff --git a/fs/direct-io.c b/fs/direct-io.c
> >>> index e181b6b2e297..5d2750346451 100644
> >>> --- a/fs/direct-io.c
> >>> +++ b/fs/direct-io.c
> >>> @@ -77,6 +77,7 @@ struct dio_submit {
> >>>         int reap_counter;               /* rate limit reaping */
> >>>         sector_t final_block_in_request;/* doesn't change */
> >>>         int boundary;                   /* prev block is at a boundary */
> >>> +       int streamid;                   /* Write stream ID */
> >>>         get_block_t *get_block;         /* block mapping function */
> >>>         dio_submit_t *submit_io;        /* IO submition function */
> >>>
> >>> @@ -372,6 +373,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit
> >>> *sdio,
> >>>
> >>>         sdio->bio = bio;
> >>>         sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
> >>> +
> >>> +       bio_set_streamid(bio, sdio->streamid);
> >>>   }
> >>>
> >>>   /*
> >>> @@ -1205,6 +1208,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb,
> >>> struct inode *inode,
> >>>         sdio.blkbits = blkbits;
> >>>         sdio.blkfactor = i_blkbits - blkbits;
> >>>         sdio.block_in_file = offset >> blkbits;
> >>> +       sdio.streamid = iocb->ki_filp->f_streamid;
> >>
> >>
> >> If iocb->ki_filp->f_streamid is not set, then it should fall back to
> >> whatever is set on the inode->i_streamid.
> 
> Why should do the fall back?

Because then you have a method of using streams with applications
that aren't aware of streams.

Or perhaps you have a file you know has different access patterns to
the rest of the files in a directory, and you don't want to have to
set the stream on every process that opens and uses that file. e.g.
database writeahead log files (sequential write, never read) vs
database index/table files (random read/write).....

> > Good point, agree. Will make that change.
> 
> That change causes problem for direct IO, for example
> 
> process 1:
> fd = open("/dev/nvme0n1", O_DIRECT...);
> //set stream_id 1
> fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
> pwrite(fd, ....);
> 
> process 2:
> fd = open("/dev/nvme0n1", O_DIRECT...);
> //should be legacy stream_id 0
> pwrite(fd, ....);
> 
> But now process 2 also see stream_id 1, which is wrong.

It's not wrong, your behaviour model is just different. You have
defined a process/fd based stream model and not considered
that admins and applications might want to use a file
based stream model instead, so applications don't need to even be
aware that write streams are in use...

Cheers,

Dave.
Ming Lin April 17, 2015, 6:20 a.m. UTC | #6
On Sat, Apr 11, 2015 at 4:59 AM, Dave Chinner <david@fromorbit.com> wrote:
> On Fri, Apr 10, 2015 at 04:50:05PM -0700, Ming Lin wrote:
>> On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
>> > On 03/24/2015 08:43 PM, Dave Chinner wrote:
>> >>
>> >> On Tue, Mar 24, 2015 at 09:27:00AM -0600, Jens Axboe wrote:
>> >>>
>> >>> Get the streamid from the file, if any, and set it on the bio.
>> >>>
>> >>> Signed-off-by: Jens Axboe <axboe@fb.com>
>> >>> ---
>> >>>   fs/direct-io.c | 4 ++++
>> >>>   1 file changed, 4 insertions(+)
>> >>>
>> >>> diff --git a/fs/direct-io.c b/fs/direct-io.c
>> >>> index e181b6b2e297..5d2750346451 100644
>> >>> --- a/fs/direct-io.c
>> >>> +++ b/fs/direct-io.c
>> >>> @@ -77,6 +77,7 @@ struct dio_submit {
>> >>>         int reap_counter;               /* rate limit reaping */
>> >>>         sector_t final_block_in_request;/* doesn't change */
>> >>>         int boundary;                   /* prev block is at a boundary */
>> >>> +       int streamid;                   /* Write stream ID */
>> >>>         get_block_t *get_block;         /* block mapping function */
>> >>>         dio_submit_t *submit_io;        /* IO submition function */
>> >>>
>> >>> @@ -372,6 +373,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit
>> >>> *sdio,
>> >>>
>> >>>         sdio->bio = bio;
>> >>>         sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
>> >>> +
>> >>> +       bio_set_streamid(bio, sdio->streamid);
>> >>>   }
>> >>>
>> >>>   /*
>> >>> @@ -1205,6 +1208,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb,
>> >>> struct inode *inode,
>> >>>         sdio.blkbits = blkbits;
>> >>>         sdio.blkfactor = i_blkbits - blkbits;
>> >>>         sdio.block_in_file = offset >> blkbits;
>> >>> +       sdio.streamid = iocb->ki_filp->f_streamid;
>> >>
>> >>
>> >> If iocb->ki_filp->f_streamid is not set, then it should fall back to
>> >> whatever is set on the inode->i_streamid.
>>
>> Why should do the fall back?
>
> Because then you have a method of using streams with applications
> that aren't aware of streams.
>
> Or perhaps you have a file you know has different access patterns to
> the rest of the files in a directory, and you don't want to have to
> set the stream on every process that opens and uses that file. e.g.
> database writeahead log files (sequential write, never read) vs
> database index/table files (random read/write).....
>
>> > Good point, agree. Will make that change.
>>
>> That change causes problem for direct IO, for example
>>
>> process 1:
>> fd = open("/dev/nvme0n1", O_DIRECT...);
>> //set stream_id 1
>> fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
>> pwrite(fd, ....);
>>
>> process 2:
>> fd = open("/dev/nvme0n1", O_DIRECT...);
>> //should be legacy stream_id 0
>> pwrite(fd, ....);
>>
>> But now process 2 also see stream_id 1, which is wrong.
>
> It's not wrong, your behaviour model is just different You have
> defined a process/fd based stream model and not considered
> considered that admins and applications might want to use a file
> based stream model instead, so applications don't need to even be
> aware that write streams are in use...

The stream must be opened; otherwise the device will return an error if an
application writes to a stream that is not open.

The device has a limited number of streams, for example, 16 streams.
There are 2 APIs to open/close a stream.

process 1:
fd = open("/dev/nvme0n1", O_DIRECT...);
stream_id = open_stream("/dev/nvme0n1", ....);
fadvise(fd, stream_id, 0, POSIX_FADV_STREAMID);
pwrite(fd, ....);
close_stream("/dev/nvme0n1", stream_id);

process 2:
fd = open("/dev/nvme0n1", O_DIRECT...);
//should be legacy stream_id 0
pwrite(fd, ....);

Now process 2 gets an error because the "stream_id" was already closed.

Thanks,
Ming
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe April 17, 2015, 3:17 p.m. UTC | #7
On 04/10/2015 05:50 PM, Ming Lin wrote:
> On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
>> On 03/24/2015 08:43 PM, Dave Chinner wrote:
>>>
>>> On Tue, Mar 24, 2015 at 09:27:00AM -0600, Jens Axboe wrote:
>>>>
>>>> Get the streamid from the file, if any, and set it on the bio.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@fb.com>
>>>> ---
>>>>    fs/direct-io.c | 4 ++++
>>>>    1 file changed, 4 insertions(+)
>>>>
>>>> diff --git a/fs/direct-io.c b/fs/direct-io.c
>>>> index e181b6b2e297..5d2750346451 100644
>>>> --- a/fs/direct-io.c
>>>> +++ b/fs/direct-io.c
>>>> @@ -77,6 +77,7 @@ struct dio_submit {
>>>>          int reap_counter;               /* rate limit reaping */
>>>>          sector_t final_block_in_request;/* doesn't change */
>>>>          int boundary;                   /* prev block is at a boundary */
>>>> +       int streamid;                   /* Write stream ID */
>>>>          get_block_t *get_block;         /* block mapping function */
>>>>          dio_submit_t *submit_io;        /* IO submition function */
>>>>
>>>> @@ -372,6 +373,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit
>>>> *sdio,
>>>>
>>>>          sdio->bio = bio;
>>>>          sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
>>>> +
>>>> +       bio_set_streamid(bio, sdio->streamid);
>>>>    }
>>>>
>>>>    /*
>>>> @@ -1205,6 +1208,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb,
>>>> struct inode *inode,
>>>>          sdio.blkbits = blkbits;
>>>>          sdio.blkfactor = i_blkbits - blkbits;
>>>>          sdio.block_in_file = offset >> blkbits;
>>>> +       sdio.streamid = iocb->ki_filp->f_streamid;
>>>
>>>
>>> If iocb->ki_filp->f_streamid is not set, then it should fall back to
>>> whatever is set on the inode->i_streamid.
>
> Why should do the fall back?

Because the assumption is that, in general, the specific file is a good 
indication of the data lifetime, if someone has already set that. It's a 
better guess than writing without any stream attached.

> That change causes problem for direct IO, for example
>
> process 1:
> fd = open("/dev/nvme0n1", O_DIRECT...);
> //set stream_id 1
> fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
> pwrite(fd, ....);
>
> process 2:
> fd = open("/dev/nvme0n1", O_DIRECT...);
> //should be legacy stream_id 0
> pwrite(fd, ....);
>
> But now process 2 also see stream_id 1, which is wrong.

I guess for that case, it is a problem. Basically the fallback breaks 
down for full block devices, or huge files that are used as a general 
backing store (like a vm image, for instance). Hmm, not sure what the 
right solution would be here, or if there really is one. It's probably 
best NOT to do the fallback after all.
Dave Chinner April 17, 2015, 11:06 p.m. UTC | #8
On Thu, Apr 16, 2015 at 11:20:45PM -0700, Ming Lin wrote:
> On Sat, Apr 11, 2015 at 4:59 AM, Dave Chinner <david@fromorbit.com> wrote:
> > On Fri, Apr 10, 2015 at 04:50:05PM -0700, Ming Lin wrote:
> >> On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
> >> >> If iocb->ki_filp->f_streamid is not set, then it should fall back to
> >> >> whatever is set on the inode->i_streamid.
> >>
> >> Why should do the fall back?
> >
> > Because then you have a method of using streams with applications
> > that aren't aware of streams.
> >
> > Or perhaps you have a file you know has different access patterns to
> > the rest of the files in a directory, and you don't want to have to
> > set the stream on every process that opens and uses that file. e.g.
> > database writeahead log files (sequential write, never read) vs
> > database index/table files (random read/write).....
> >
> >> > Good point, agree. Will make that change.
> >>
> >> That change causes problem for direct IO, for example
> >>
> >> process 1:
> >> fd = open("/dev/nvme0n1", O_DIRECT...);
> >> //set stream_id 1
> >> fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
> >> pwrite(fd, ....);
> >>
> >> process 2:
> >> fd = open("/dev/nvme0n1", O_DIRECT...);
> >> //should be legacy stream_id 0
> >> pwrite(fd, ....);
> >>
> >> But now process 2 also see stream_id 1, which is wrong.
> >
> > It's not wrong, your behaviour model is just different You have
> > defined a process/fd based stream model and not considered
> > considered that admins and applications might want to use a file
> > based stream model instead, so applications don't need to even be
> > aware that write streams are in use...
> 
> The stream must be opened, otherwise device will return error if application
> write to a not-opened stream.

That's an extremely device specific *implementation* of a write
stream. The *concept* of a write stream being passed from userspace to
the block layer doesn't have such constraints, and I get really
concerned when implementations of a generic concept are so tightly
focussed around one type of hardware implementation of the
concept...

> Device has limited number of streams, for example, 16 streams.
> There are 2 APIs to open/close the stream.

What's to stop me writing something for DM-thinp that understands
write streams in bios and uses it to separate out the write streams
into different regions of the thinp device to improve locality of
its data placement and hence reduce fragmentation?

Yes, for nvme devices, the "streamid" might come from hardware,
but there's nothing stopping other storage devices using it
differently or having fewer constraints. e.g. unknown ID -> same as
stream 0....

Cheers,

Dave.
Jens Axboe April 17, 2015, 11:11 p.m. UTC | #9
On 04/17/2015 05:06 PM, Dave Chinner wrote:
> On Thu, Apr 16, 2015 at 11:20:45PM -0700, Ming Lin wrote:
>> On Sat, Apr 11, 2015 at 4:59 AM, Dave Chinner <david@fromorbit.com> wrote:
>>> On Fri, Apr 10, 2015 at 04:50:05PM -0700, Ming Lin wrote:
>>>> On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>> If iocb->ki_filp->f_streamid is not set, then it should fall back to
>>>>>> whatever is set on the inode->i_streamid.
>>>>
>>>> Why should do the fall back?
>>>
>>> Because then you have a method of using streams with applications
>>> that aren't aware of streams.
>>>
>>> Or perhaps you have a file you know has different access patterns to
>>> the rest of the files in a directory, and you don't want to have to
>>> set the stream on every process that opens and uses that file. e.g.
>>> database writeahead log files (sequential write, never read) vs
>>> database index/table files (random read/write).....
>>>
>>>>> Good point, agree. Will make that change.
>>>>
>>>> That change causes problem for direct IO, for example
>>>>
>>>> process 1:
>>>> fd = open("/dev/nvme0n1", O_DIRECT...);
>>>> //set stream_id 1
>>>> fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
>>>> pwrite(fd, ....);
>>>>
>>>> process 2:
>>>> fd = open("/dev/nvme0n1", O_DIRECT...);
>>>> //should be legacy stream_id 0
>>>> pwrite(fd, ....);
>>>>
>>>> But now process 2 also see stream_id 1, which is wrong.
>>>
>>> It's not wrong, your behaviour model is just different You have
>>> defined a process/fd based stream model and not considered
>>> considered that admins and applications might want to use a file
>>> based stream model instead, so applications don't need to even be
>>> aware that write streams are in use...
>>
>> The stream must be opened, otherwise device will return error if application
>> write to a not-opened stream.
>
> That's an extremely device specific *implementation* of a write
> stream. The *concept* of a write stream being passed from userspace to
> the block layer doesn't have such constraints, and I get realy
> concerned when implementations of a generic concept are so tightly
> focussed around one type of hardware implementation of the
> concept...

Indeed, which is why the implementation posted cares ONLY about the 
stream ID itself, and passing that through.

But the point about fallback is valid, however, for some use cases that 
will not be what you want. But we have to make some sort of decision, 
and falling back to the inode set value (if one is set) is probably the 
right thing to do in most use cases.

>> Device has limited number of streams, for example, 16 streams.
>> There are 2 APIs to open/close the stream.
>
> What's to stop me writing something for DM-thinp that understands
> write streams in bios and uses it to separate out the write streams
> into different regions of the thinp device to improve locality of
> it's data placement and hence reduce fragmentation?

Absolutely nothing, in fact that's one of the use cases that I had in 
mind. Or for caching software.
Dave Chinner April 17, 2015, 11:51 p.m. UTC | #10
On Fri, Apr 17, 2015 at 05:11:40PM -0600, Jens Axboe wrote:
> On 04/17/2015 05:06 PM, Dave Chinner wrote:
> >On Thu, Apr 16, 2015 at 11:20:45PM -0700, Ming Lin wrote:
> >>On Sat, Apr 11, 2015 at 4:59 AM, Dave Chinner <david@fromorbit.com> wrote:
> >>>On Fri, Apr 10, 2015 at 04:50:05PM -0700, Ming Lin wrote:
> >>>>On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
> >>>>>>If iocb->ki_filp->f_streamid is not set, then it should fall back to
> >>>>>>whatever is set on the inode->i_streamid.
> >>>>
> >>>>Why should do the fall back?
> >>>
> >>>Because then you have a method of using streams with applications
> >>>that aren't aware of streams.
> >>>
> >>>Or perhaps you have a file you know has different access patterns to
> >>>the rest of the files in a directory, and you don't want to have to
> >>>set the stream on every process that opens and uses that file. e.g.
> >>>database writeahead log files (sequential write, never read) vs
> >>>database index/table files (random read/write).....
> >>>
> >>>>>Good point, agree. Will make that change.
> >>>>
> >>>>That change causes problem for direct IO, for example
> >>>>
> >>>>process 1:
> >>>>fd = open("/dev/nvme0n1", O_DIRECT...);
> >>>>//set stream_id 1
> >>>>fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
> >>>>pwrite(fd, ....);
> >>>>
> >>>>process 2:
> >>>>fd = open("/dev/nvme0n1", O_DIRECT...);
> >>>>//should be legacy stream_id 0
> >>>>pwrite(fd, ....);
> >>>>
> >>>>But now process 2 also see stream_id 1, which is wrong.
> >>>
> >>>It's not wrong, your behaviour model is just different You have
> >>>defined a process/fd based stream model and not considered
> >>>considered that admins and applications might want to use a file
> >>>based stream model instead, so applications don't need to even be
> >>>aware that write streams are in use...
> >>
> >>The stream must be opened, otherwise device will return error if application
> >>write to a not-opened stream.
> >
> >That's an extremely device specific *implementation* of a write
> >stream. The *concept* of a write stream being passed from userspace to
> >the block layer doesn't have such constraints, and I get realy
> >concerned when implementations of a generic concept are so tightly
> >focussed around one type of hardware implementation of the
> >concept...
> 
> Indeed, which is why the implementation posted cares ONLY about the
> stream ID itself, and passing that through.
> 
> But the point about fallback is valid, however, for some use cases
> that will not be what you want. But we have to make some sort of
> decision, and falling back to the inode set value (if one is set) is
> probably the right thing to do in most use cases.

Right, the question is then whether fadvise should set the value on
the inode at all, because then the effect of setting it on a fd also
changes the fallback. Perhaps we need a distinction between
"setting the stream for this fd" which lasts as long as the fd is
active, and "setting the default inode stream" which is potentially
a persistent operation if the filesystem stores it on disk...

> >>Device has limited number of streams, for example, 16 streams.
> >>There are 2 APIs to open/close the stream.
> >
> >What's to stop me writing something for DM-thinp that understands
> >write streams in bios and uses it to separate out the write streams
> >into different regions of the thinp device to improve locality of
> >it's data placement and hence reduce fragmentation?
> 
> Absolutely nothing, in fact that's one of the use cases that I had
> in mind. Or for for caching software.

*nod*. We are on the same page, then :)

Cheers,

Dave.
> 
> -- 
> Jens Axboe
> 
>
Jens Axboe April 18, 2015, 2 a.m. UTC | #11
On 04/17/2015 05:51 PM, Dave Chinner wrote:
> On Fri, Apr 17, 2015 at 05:11:40PM -0600, Jens Axboe wrote:
>> On 04/17/2015 05:06 PM, Dave Chinner wrote:
>>> On Thu, Apr 16, 2015 at 11:20:45PM -0700, Ming Lin wrote:
>>>> On Sat, Apr 11, 2015 at 4:59 AM, Dave Chinner <david@fromorbit.com> wrote:
>>>>> On Fri, Apr 10, 2015 at 04:50:05PM -0700, Ming Lin wrote:
>>>>>> On Wed, Mar 25, 2015 at 7:26 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>> If iocb->ki_filp->f_streamid is not set, then it should fall back to
>>>>>>>> whatever is set on the inode->i_streamid.
>>>>>>
>>>>>> Why should do the fall back?
>>>>>
>>>>> Because then you have a method of using streams with applications
>>>>> that aren't aware of streams.
>>>>>
>>>>> Or perhaps you have a file you know has different access patterns to
>>>>> the rest of the files in a directory, and you don't want to have to
>>>>> set the stream on every process that opens and uses that file. e.g.
>>>>> database writeahead log files (sequential write, never read) vs
>>>>> database index/table files (random read/write).....
>>>>>
>>>>>>> Good point, agree. Will make that change.
>>>>>>
>>>>>> That change causes problem for direct IO, for example
>>>>>>
>>>>>> process 1:
>>>>>> fd = open("/dev/nvme0n1", O_DIRECT...);
>>>>>> //set stream_id 1
>>>>>> fadvise(fd, 1, 0, POSIX_FADV_STREAMID);
>>>>>> pwrite(fd, ....);
>>>>>>
>>>>>> process 2:
>>>>>> fd = open("/dev/nvme0n1", O_DIRECT...);
>>>>>> //should be legacy stream_id 0
>>>>>> pwrite(fd, ....);
>>>>>>
>>>>>> But now process 2 also see stream_id 1, which is wrong.
>>>>>
>>>>> It's not wrong, your behaviour model is just different You have
>>>>> defined a process/fd based stream model and not considered
>>>>> considered that admins and applications might want to use a file
>>>>> based stream model instead, so applications don't need to even be
>>>>> aware that write streams are in use...
>>>>
>>>> The stream must be opened, otherwise device will return error if application
>>>> write to a not-opened stream.
>>>
>>> That's an extremely device specific *implementation* of a write
>>> stream. The *concept* of a write stream being passed from userspace to
>>> the block layer doesn't have such constraints, and I get realy
>>> concerned when implementations of a generic concept are so tightly
>>> focussed around one type of hardware implementation of the
>>> concept...
>>
>> Indeed, which is why the implementation posted cares ONLY about the
>> stream ID itself, and passing that through.
>>
>> But the point about fallback is valid, however, for some use cases
>> that will not be what you want. But we have to make some sort of
>> decision, and falling back to the inode set value (if one is set) is
>> probably the right thing to do in most use cases.
>
> Right, the question is then whether fadvise should set the value on
> the inode at all, because then the effect of setting it on a fd also
> changes the fallback. Perhaps we need to a distinction between
> "setting the stream for this fd" which lasts as long as the fd is
> active, and "setting the default inode stream" which is potentially
> a persistent operation if the filesystem stores it on disk...

Yes, that might be a good compromise. The easiest would be to define a 
second fadvise advice, where the stronger advice would be file + inode. 
Another option would be changing the file approach to use fcntl(), and 
keeping the fadvise for the inode. I'll be happy to take input on what 
people would prefer here.

>>>> Device has limited number of streams, for example, 16 streams.
>>>> There are 2 APIs to open/close the stream.
>>>
>>> What's to stop me writing something for DM-thinp that understands
>>> write streams in bios and uses it to separate out the write streams
>>> into different regions of the thinp device to improve locality of
>>> it's data placement and hence reduce fragmentation?
>>
>> Absolutely nothing, in fact that's one of the use cases that I had
>> in mind. Or for for caching software.
>
> *nod*. We are on the same page, then :)

Yes completely, basically just wanted to clarify that.
diff mbox

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..5d2750346451 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -77,6 +77,7 @@  struct dio_submit {
 	int reap_counter;		/* rate limit reaping */
 	sector_t final_block_in_request;/* doesn't change */
 	int boundary;			/* prev block is at a boundary */
+	int streamid;			/* Write stream ID */
 	get_block_t *get_block;		/* block mapping function */
 	dio_submit_t *submit_io;	/* IO submition function */
 
@@ -372,6 +373,8 @@  dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 
 	sdio->bio = bio;
 	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
+
+	bio_set_streamid(bio, sdio->streamid);
 }
 
 /*
@@ -1205,6 +1208,7 @@  do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	sdio.blkbits = blkbits;
 	sdio.blkfactor = i_blkbits - blkbits;
 	sdio.block_in_file = offset >> blkbits;
+	sdio.streamid = iocb->ki_filp->f_streamid;
 
 	sdio.get_block = get_block;
 	dio->end_io = end_io;