diff mbox

[v2] fs: Add aio iopriority support for block_dev

Message ID 20180503182114.2797-1-adam.manzanares@wdc.com (mailing list archive)
State New, archived
Headers show

Commit Message

Adam Manzanares May 3, 2018, 6:21 p.m. UTC
From: Adam Manzanares <adam.manzanares@wdc.com>

This is the per-I/O equivalent of the ioprio_set system call.

When IOCB_FLAG_IOPRIO is set on the iocb aio_flags field, then we set the
newly added kiocb ki_ioprio field to the value in the iocb aio_reqprio field.

When a bio is created for an aio request by the block dev we set the priority
value of the bio to the user supplied value.

See the following link for performance implications on a SATA HDD:
https://lkml.org/lkml/2016/12/6/495

Given that WRR support for NVME devices has patches floating around and it was
discussed at LSFMM, we may soon have a lower latency storage device that can 
make use of iopriorities. A per command iopriority interface seems timely 
given these developments. 

If we want to avoid bloating struct kiocb, I suggest we turn the private field 
into a union of the private and ki_ioprio field. It seems like the users of 
the private field all use it at a point where we can yank the priority from 
the kiocb before the private field is used. Comments and suggestions welcome.

v2: merge patches
    use IOCB_FLAG_IOPRIO
    validate intended use with IOCB_IOPRIO
    add linux-api and linux-block to cc

Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
---
 fs/aio.c                     | 10 ++++++++++
 fs/block_dev.c               |  2 ++
 include/linux/fs.h           |  2 ++
 include/uapi/linux/aio_abi.h |  1 +
 4 files changed, 15 insertions(+)

Comments

Matthew Wilcox May 3, 2018, 6:33 p.m. UTC | #1
On Thu, May 03, 2018 at 11:21:14AM -0700, adam.manzanares@wdc.com wrote:
> If we want to avoid bloating struct kiocb, I suggest we turn the private field 
> into a union of the private and ki_ioprio field. It seems like the users of 
> the private field all use it at a point where we can yank the priority from 
> the kiocb before the private field is used. Comments and suggestions welcome.

Or we could just make ki_hint a u8 or u16 ... seems unlikely we'll need
32 bits of ki_hint.  (currently defined values are 1-5)

> @@ -300,6 +301,7 @@ struct kiocb {
>  	void			*private;
>  	int			ki_flags;
>  	enum rw_hint		ki_hint;
> +	u16			ki_ioprio; /* See linux/ioprio.h */
>  } __randomize_layout;
>  
>  static inline bool is_sync_kiocb(struct kiocb *kiocb)
Jeff Moyer May 3, 2018, 6:36 p.m. UTC | #2
Hi, Adam,

adam.manzanares@wdc.com writes:

> From: Adam Manzanares <adam.manzanares@wdc.com>
>
> This is the per-I/O equivalent of the ioprio_set system call.
>
> When IOCB_FLAG_IOPRIO is set on the iocb aio_flags field, then we set the
> newly added kiocb ki_ioprio field to the value in the iocb aio_reqprio field.
>
> When a bio is created for an aio request by the block dev we set the priority
> value of the bio to the user supplied value.
>
> See the following link for performance implications on a SATA HDD:
> https://lkml.org/lkml/2016/12/6/495
>
> Given that WRR support for NVME devices has patches floating around and it was
> discussed at LSFMM, we may soon have a lower latency storage device that can 
> make use of iopriorities. A per command iopriority interface seems timely 
> given these developments. 
>
> If we want to avoid bloating struct kiocb, I suggest we turn the private field 
> into a union of the private and ki_ioprio field. It seems like the users of 
> the private field all use it at a point where we can yank the priority from 
> the kiocb before the private field is used. Comments and suggestions welcome.

The ioprio_set system call requires CAP_SYS_ADMIN for setting
IOPRIO_CLASS_RT.  I think we need similar checks here.

-Jeff

>
> v2: merge patches
>     use IOCB_FLAG_IOPRIO
>     validate intended use with IOCB_IOPRIO
>     add linux-api and linux-block to cc
>
> Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
> ---
>  fs/aio.c                     | 10 ++++++++++
>  fs/block_dev.c               |  2 ++
>  include/linux/fs.h           |  2 ++
>  include/uapi/linux/aio_abi.h |  1 +
>  4 files changed, 15 insertions(+)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 88d7927ffbc6..f36636d8ff2c 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1597,6 +1597,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
>  		req->common.ki_flags |= IOCB_EVENTFD;
>  	}
>  
> +	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
> +		/*
> +		 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
> +		 * aio_reqprio is interpreted as an I/O scheduling
> +		 * class and priority.
> +		 */
> +		req->common.ki_ioprio = iocb->aio_reqprio;
> +		req->common.ki_flags |= IOCB_IOPRIO;
> +	}
> +
>  	ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
>  	if (unlikely(ret)) {
>  		pr_debug("EINVAL: aio_rw_flags\n");
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 7ec920e27065..970bef79caa6 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -355,6 +355,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  		bio->bi_write_hint = iocb->ki_hint;
>  		bio->bi_private = dio;
>  		bio->bi_end_io = blkdev_bio_end_io;
> +		if (iocb->ki_flags & IOCB_IOPRIO)
> +			bio->bi_ioprio = iocb->ki_ioprio;
>  
>  		ret = bio_iov_iter_get_pages(bio, iter);
>  		if (unlikely(ret)) {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 760d8da1b6c7..ab63ce720305 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -292,6 +292,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_IOPRIO		(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -300,6 +301,7 @@ struct kiocb {
>  	void			*private;
>  	int			ki_flags;
>  	enum rw_hint		ki_hint;
> +	u16			ki_ioprio; /* See linux/ioprio.h */
>  } __randomize_layout;
>  
>  static inline bool is_sync_kiocb(struct kiocb *kiocb)
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index a04adbc70ddf..d4593a6062ef 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -54,6 +54,7 @@ enum {
>   *                   is valid.
>   */
>  #define IOCB_FLAG_RESFD		(1 << 0)
> +#define IOCB_FLAG_IOPRIO	(1 << 1)
>  
>  /* read() from /dev/aio returns these structures. */
>  struct io_event {
Adam Manzanares May 3, 2018, 8:15 p.m. UTC | #3
On 5/3/18 11:33 AM, Matthew Wilcox wrote:
> On Thu, May 03, 2018 at 11:21:14AM -0700, adam.manzanares@wdc.com wrote:

>> If we want to avoid bloating struct kiocb, I suggest we turn the private field

>> into a union of the private and ki_ioprio field. It seems like the users of

>> the private field all use it at a point where we can yank the priority from

>> the kiocb before the private field is used. Comments and suggestions welcome.

> 

> Or we could just make ki_hint a u8 or u16 ... seems unlikely we'll need

> 32 bits of ki_hint.  (currently defined values are 1-5)


I like the approach of using a u16 for the ki_hint. I'll update and 
resubmit.

>> @@ -300,6 +301,7 @@ struct kiocb {

>>   	void			*private;

>>   	int			ki_flags;

>>   	enum rw_hint		ki_hint;

>> +	u16			ki_ioprio; /* See linux/ioprio.h */

>>   } __randomize_layout;

>>   

>>   static inline bool is_sync_kiocb(struct kiocb *kiocb)
Adam Manzanares May 3, 2018, 8:24 p.m. UTC | #4
On 5/3/18 11:36 AM, Jeff Moyer wrote:
> Hi, Adam,


Hello Jeff,

> 

> adam.manzanares@wdc.com writes:

> 

>> From: Adam Manzanares <adam.manzanares@wdc.com>

>>

>> This is the per-I/O equivalent of the ioprio_set system call.

>>

>> When IOCB_FLAG_IOPRIO is set on the iocb aio_flags field, then we set the

>> newly added kiocb ki_ioprio field to the value in the iocb aio_reqprio field.

>>

>> When a bio is created for an aio request by the block dev we set the priority

>> value of the bio to the user supplied value.

>>

>> See the following link for performance implications on a SATA HDD:

>> https://lkml.org/lkml/2016/12/6/495

>>

>> Given that WRR support for NVME devices has patches floating around and it was

>> discussed at LSFMM, we may soon have a lower latency storage device that can

>> make use of iopriorities. A per command iopriority interface seems timely

>> given these developments.

>>

>> If we want to avoid bloating struct kiocb, I suggest we turn the private field

>> into a union of the private and ki_ioprio field. It seems like the users of

>> the private field all use it at a point where we can yank the priority from

>> the kiocb before the private field is used. Comments and suggestions welcome.

> 

> The ioprio_set system call requires CAP_SYS_ADMIN for setting

> IOPRIO_CLASS_RT.  I think we need similar checks here.


I forgot how dangerous IOPRIO_CLASS_RT can be :). I will make a new 
function based on the checks in ioprio.c and reuse.

Thanks,
Adam

> 

> -Jeff

> 

>>

>> v2: merge patches

>>      use IOCB_FLAG_IOPRIO

>>      validate intended use with IOCB_IOPRIO

>>      add linux-api and linux-block to cc

>>

>> Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>

>> ---

>>   fs/aio.c                     | 10 ++++++++++

>>   fs/block_dev.c               |  2 ++

>>   include/linux/fs.h           |  2 ++

>>   include/uapi/linux/aio_abi.h |  1 +

>>   4 files changed, 15 insertions(+)

>>

>> diff --git a/fs/aio.c b/fs/aio.c

>> index 88d7927ffbc6..f36636d8ff2c 100644

>> --- a/fs/aio.c

>> +++ b/fs/aio.c

>> @@ -1597,6 +1597,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,

>>   		req->common.ki_flags |= IOCB_EVENTFD;

>>   	}

>>   

>> +	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {

>> +		/*

>> +		 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then

>> +		 * aio_reqprio is interpreted as an I/O scheduling

>> +		 * class and priority.

>> +		 */

>> +		req->common.ki_ioprio = iocb->aio_reqprio;

>> +		req->common.ki_flags |= IOCB_IOPRIO;

>> +	}

>> +

>>   	ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);

>>   	if (unlikely(ret)) {

>>   		pr_debug("EINVAL: aio_rw_flags\n");

>> diff --git a/fs/block_dev.c b/fs/block_dev.c

>> index 7ec920e27065..970bef79caa6 100644

>> --- a/fs/block_dev.c

>> +++ b/fs/block_dev.c

>> @@ -355,6 +355,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)

>>   		bio->bi_write_hint = iocb->ki_hint;

>>   		bio->bi_private = dio;

>>   		bio->bi_end_io = blkdev_bio_end_io;

>> +		if (iocb->ki_flags & IOCB_IOPRIO)

>> +			bio->bi_ioprio = iocb->ki_ioprio;

>>   

>>   		ret = bio_iov_iter_get_pages(bio, iter);

>>   		if (unlikely(ret)) {

>> diff --git a/include/linux/fs.h b/include/linux/fs.h

>> index 760d8da1b6c7..ab63ce720305 100644

>> --- a/include/linux/fs.h

>> +++ b/include/linux/fs.h

>> @@ -292,6 +292,7 @@ enum rw_hint {

>>   #define IOCB_SYNC		(1 << 5)

>>   #define IOCB_WRITE		(1 << 6)

>>   #define IOCB_NOWAIT		(1 << 7)

>> +#define IOCB_IOPRIO		(1 << 8)

>>   

>>   struct kiocb {

>>   	struct file		*ki_filp;

>> @@ -300,6 +301,7 @@ struct kiocb {

>>   	void			*private;

>>   	int			ki_flags;

>>   	enum rw_hint		ki_hint;

>> +	u16			ki_ioprio; /* See linux/ioprio.h */

>>   } __randomize_layout;

>>   

>>   static inline bool is_sync_kiocb(struct kiocb *kiocb)

>> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h

>> index a04adbc70ddf..d4593a6062ef 100644

>> --- a/include/uapi/linux/aio_abi.h

>> +++ b/include/uapi/linux/aio_abi.h

>> @@ -54,6 +54,7 @@ enum {

>>    *                   is valid.

>>    */

>>   #define IOCB_FLAG_RESFD		(1 << 0)

>> +#define IOCB_FLAG_IOPRIO	(1 << 1)

>>   

>>   /* read() from /dev/aio returns these structures. */

>>   struct io_event {
Jens Axboe May 3, 2018, 8:24 p.m. UTC | #5
On 5/3/18 2:15 PM, Adam Manzanares wrote:
> 
> 
> On 5/3/18 11:33 AM, Matthew Wilcox wrote:
>> On Thu, May 03, 2018 at 11:21:14AM -0700, adam.manzanares@wdc.com wrote:
>>> If we want to avoid bloating struct kiocb, I suggest we turn the private field
>>> into a union of the private and ki_ioprio field. It seems like the users of
>>> the private field all use it at a point where we can yank the priority from
>>> the kiocb before the private field is used. Comments and suggestions welcome.
>>
>> Or we could just make ki_hint a u8 or u16 ... seems unlikely we'll need
>> 32 bits of ki_hint.  (currently defined values are 1-5)
> 
> I like the approach of using a u16 for the ki_hint. I'll update and 
> resubmit.

It's intended to be a mask. If you do shrink it for now, then we need some
guard code to ensure it can always carry what it needs to.
Adam Manzanares May 3, 2018, 8:58 p.m. UTC | #6
On 5/3/18 1:24 PM, Jens Axboe wrote:
> On 5/3/18 2:15 PM, Adam Manzanares wrote:

>>

>>

>> On 5/3/18 11:33 AM, Matthew Wilcox wrote:

>>> On Thu, May 03, 2018 at 11:21:14AM -0700, adam.manzanares@wdc.com wrote:

>>>> If we want to avoid bloating struct kiocb, I suggest we turn the private field

>>>> into a union of the private and ki_ioprio field. It seems like the users of

>>>> the private field all use it at a point where we can yank the priority from

>>>> the kiocb before the private field is used. Comments and suggestions welcome.

>>>

>>> Or we could just make ki_hint a u8 or u16 ... seems unlikely we'll need

>>> 32 bits of ki_hint.  (currently defined values are 1-5)

>>

>> I like the approach of using a u16 for the ki_hint. I'll update and

>> resubmit.

> 

> It's intended to be a mask. If you do shrink it for now, then we need some

> guard code to ensure it can always carry what it needs to.

> 


Got it, I'll add the guard to rw_hint_valid along with a comment about 
being limited by the size of ki_hint in case we get to a situation where 
16 bits is not enough.
Jens Axboe May 3, 2018, 9:20 p.m. UTC | #7
On 5/3/18 2:58 PM, Adam Manzanares wrote:
> 
> 
> On 5/3/18 1:24 PM, Jens Axboe wrote:
>> On 5/3/18 2:15 PM, Adam Manzanares wrote:
>>>
>>>
>>> On 5/3/18 11:33 AM, Matthew Wilcox wrote:
>>>> On Thu, May 03, 2018 at 11:21:14AM -0700, adam.manzanares@wdc.com wrote:
>>>>> If we want to avoid bloating struct kiocb, I suggest we turn the private field
>>>>> into a union of the private and ki_ioprio field. It seems like the users of
>>>>> the private field all use it at a point where we can yank the priority from
>>>>> the kiocb before the private field is used. Comments and suggestions welcome.
>>>>
>>>> Or we could just make ki_hint a u8 or u16 ... seems unlikely we'll need
>>>> 32 bits of ki_hint.  (currently defined values are 1-5)
>>>
>>> I like the approach of using a u16 for the ki_hint. I'll update and
>>> resubmit.
>>
>> It's intended to be a mask. If you do shrink it for now, then we need some
>> guard code to ensure it can always carry what it needs to.
>>
> 
> Got it, I'll add the guard to rw_hint_valid along with a comment about 
> being limited by the size of ki_hint in case we get to a situation where 
> 16 bits is not enough.

Other way around - the API should not be limited by the fact that some
smaller type was chosen for an internal structure. Hence the guard/check
should not be in rw_hint_valid, but rather around where you assign
ki_hint. If/when we do extend the read/write hints, then we'll
potentially need to bump ki_hint to accommodate it.
Matthew Wilcox May 3, 2018, 10:43 p.m. UTC | #8
On Thu, May 03, 2018 at 02:24:58PM -0600, Jens Axboe wrote:
> On 5/3/18 2:15 PM, Adam Manzanares wrote:
> > On 5/3/18 11:33 AM, Matthew Wilcox wrote:
> >> Or we could just make ki_hint a u8 or u16 ... seems unlikely we'll need
> >> 32 bits of ki_hint.  (currently defined values are 1-5)
> > 
> > I like the approach of using a u16 for the ki_hint. I'll update and 
> > resubmit.
> 
> It's intended to be a mask. If you do shrink it for now, then we need some
> guard code to ensure it can always carry what it needs to.

ummm ...

enum rw_hint {
        WRITE_LIFE_NOT_SET      = 0,
        WRITE_LIFE_NONE         = RWH_WRITE_LIFE_NONE,
        WRITE_LIFE_SHORT        = RWH_WRITE_LIFE_SHORT,
...

                .ki_hint = file_write_hint(filp),

static inline enum rw_hint file_write_hint(struct file *file)

#define RWF_WRITE_LIFE_NOT_SET  0
#define RWH_WRITE_LIFE_NONE     1
#define RWH_WRITE_LIFE_SHORT    2
#define RWH_WRITE_LIFE_MEDIUM   3
#define RWH_WRITE_LIFE_LONG     4
#define RWH_WRITE_LIFE_EXTREME  5

It doesn't look like it's being used as a mask.
Jens Axboe May 3, 2018, 10:53 p.m. UTC | #9
On 5/3/18 4:43 PM, Matthew Wilcox wrote:
> On Thu, May 03, 2018 at 02:24:58PM -0600, Jens Axboe wrote:
>> On 5/3/18 2:15 PM, Adam Manzanares wrote:
>>> On 5/3/18 11:33 AM, Matthew Wilcox wrote:
>>>> Or we could just make ki_hint a u8 or u16 ... seems unlikely we'll need
>>>> 32 bits of ki_hint.  (currently defined values are 1-5)
>>>
>>> I like the approach of using a u16 for the ki_hint. I'll update and 
>>> resubmit.
>>
>> It's intended to be a mask. If you do shrink it for now, then we need some
>> guard code to ensure it can always carry what it needs to.
> 
> ummm ...
> 
> enum rw_hint {
>         WRITE_LIFE_NOT_SET      = 0,
>         WRITE_LIFE_NONE         = RWH_WRITE_LIFE_NONE,
>         WRITE_LIFE_SHORT        = RWH_WRITE_LIFE_SHORT,
> ...
> 
>                 .ki_hint = file_write_hint(filp),
> 
> static inline enum rw_hint file_write_hint(struct file *file)
> 
> #define RWF_WRITE_LIFE_NOT_SET  0
> #define RWH_WRITE_LIFE_NONE     1
> #define RWH_WRITE_LIFE_SHORT    2
> #define RWH_WRITE_LIFE_MEDIUM   3
> #define RWH_WRITE_LIFE_LONG     4
> #define RWH_WRITE_LIFE_EXTREME  5
> 
> It doesn't look like it's being used as a mask.

Right, currently it only supports the life time hint. I'm saying the
intent is, for when it's being expanded to cover other hints, is to
split up the value into maskable sections.
diff mbox

Patch

diff --git a/fs/aio.c b/fs/aio.c
index 88d7927ffbc6..f36636d8ff2c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1597,6 +1597,16 @@  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		req->common.ki_flags |= IOCB_EVENTFD;
 	}
 
+	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
+		/*
+		 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
+		 * aio_reqprio is interpreted as an I/O scheduling
+		 * class and priority.
+		 */
+		req->common.ki_ioprio = iocb->aio_reqprio;
+		req->common.ki_flags |= IOCB_IOPRIO;
+	}
+
 	ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
 	if (unlikely(ret)) {
 		pr_debug("EINVAL: aio_rw_flags\n");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7ec920e27065..970bef79caa6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -355,6 +355,8 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		bio->bi_write_hint = iocb->ki_hint;
 		bio->bi_private = dio;
 		bio->bi_end_io = blkdev_bio_end_io;
+		if (iocb->ki_flags & IOCB_IOPRIO)
+			bio->bi_ioprio = iocb->ki_ioprio;
 
 		ret = bio_iov_iter_get_pages(bio, iter);
 		if (unlikely(ret)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 760d8da1b6c7..ab63ce720305 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -292,6 +292,7 @@  enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_IOPRIO		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -300,6 +301,7 @@  struct kiocb {
 	void			*private;
 	int			ki_flags;
 	enum rw_hint		ki_hint;
+	u16			ki_ioprio; /* See linux/ioprio.h */
 } __randomize_layout;
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index a04adbc70ddf..d4593a6062ef 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -54,6 +54,7 @@  enum {
  *                   is valid.
  */
 #define IOCB_FLAG_RESFD		(1 << 0)
+#define IOCB_FLAG_IOPRIO	(1 << 1)
 
 /* read() from /dev/aio returns these structures. */
 struct io_event {