
[RFC] io_uring: stop only support read/write for ctx with IORING_SETUP_IOPOLL

Message ID 20191104085608.44816-1-yangerkun@huawei.com (mailing list archive)
State New, archived
Series [RFC] io_uring: stop only support read/write for ctx with IORING_SETUP_IOPOLL

Commit Message

Yang Erkun Nov. 4, 2019, 8:56 a.m. UTC
There is no problem with supporting other request types for a ctx with
IORING_SETUP_IOPOLL. All we need to do is insert only read/write
requests into the poll list, and protect the cqes with completion_lock
in io_iopoll_complete, since other requests can now complete at the
same time as io_iopoll_complete runs.

Signed-off-by: yangerkun <yangerkun@huawei.com>
---
 fs/io_uring.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

Comments

Bob Liu Nov. 4, 2019, 10:09 a.m. UTC | #1
On 11/4/19 4:56 PM, yangerkun wrote:
> There is no problem with supporting other request types for a ctx with
> IORING_SETUP_IOPOLL.

Could you describe the benefit of doing this?

> All we need to do is insert only read/write
> requests into the poll list, and protect the cqes with completion_lock
> in io_iopoll_complete, since other requests can now complete at the
> same time as io_iopoll_complete runs.
> 
> Signed-off-by: yangerkun <yangerkun@huawei.com>
> ---
>  fs/io_uring.c | 29 +++++++++--------------------
>  1 file changed, 9 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index f9a38998f2fc..8bf52d9a2c55 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -753,9 +753,11 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>  {
>  	void *reqs[IO_IOPOLL_BATCH];
>  	struct io_kiocb *req;
> +	unsigned long flags;
>  	int to_free;
>  
>  	to_free = 0;
> +	spin_lock_irqsave(&ctx->completion_lock, flags);
>  	while (!list_empty(done)) {
>  		req = list_first_entry(done, struct io_kiocb, list);
>  		list_del(&req->list);
> @@ -781,6 +783,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>  	}
>  
>  	io_commit_cqring(ctx);
> +	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>  	io_free_req_many(ctx, reqs, &to_free);
>  }
>  
> @@ -1517,9 +1520,6 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
>  	struct io_ring_ctx *ctx = req->ctx;
>  	long err = 0;
>  
> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
> -		return -EINVAL;
> -
>  	io_cqring_add_event(ctx, user_data, err);
>  	io_put_req(req);
>  	return 0;
> @@ -1527,13 +1527,9 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
>  
>  static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
> -	struct io_ring_ctx *ctx = req->ctx;
> -
>  	if (!req->file)
>  		return -EBADF;
>  
> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
> -		return -EINVAL;
>  	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
>  		return -EINVAL;
>  
> @@ -1574,14 +1570,11 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  
>  static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
> -	struct io_ring_ctx *ctx = req->ctx;
>  	int ret = 0;
>  
>  	if (!req->file)
>  		return -EBADF;
>  
> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
> -		return -EINVAL;
>  	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
>  		return -EINVAL;
>  
> @@ -1627,9 +1620,6 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  	struct socket *sock;
>  	int ret;
>  
> -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> -		return -EINVAL;
> -
>  	sock = sock_from_file(req->file, &ret);
>  	if (sock) {
>  		struct user_msghdr __user *msg;
> @@ -1712,8 +1702,6 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	struct io_kiocb *poll_req, *next;
>  	int ret = -ENOENT;
>  
> -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> -		return -EINVAL;
>  	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
>  	    sqe->poll_events)
>  		return -EINVAL;
> @@ -1833,8 +1821,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	__poll_t mask;
>  	u16 events;
>  
> -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> -		return -EINVAL;
>  	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
>  		return -EINVAL;
>  	if (!poll->file)
> @@ -1932,8 +1918,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	struct timespec64 ts;
>  	unsigned span = 0;
>  
> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
> -		return -EINVAL;
>  	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
>  	    sqe->len != 1)
>  		return -EINVAL;
> @@ -2032,6 +2016,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  			   const struct sqe_submit *s, bool force_nonblock)
>  {
>  	int ret, opcode;
> +	bool poll = false;
>  
>  	req->user_data = READ_ONCE(s->sqe->user_data);
>  
> @@ -2046,17 +2031,21 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  	case IORING_OP_READV:
>  		if (unlikely(s->sqe->buf_index))
>  			return -EINVAL;
> +		poll = true;
>  		ret = io_read(req, s, force_nonblock);
>  		break;
>  	case IORING_OP_WRITEV:
>  		if (unlikely(s->sqe->buf_index))
>  			return -EINVAL;
> +		poll = true;
>  		ret = io_write(req, s, force_nonblock);
>  		break;
>  	case IORING_OP_READ_FIXED:
> +		poll = true;
>  		ret = io_read(req, s, force_nonblock);
>  		break;
>  	case IORING_OP_WRITE_FIXED:
> +		poll = true;
>  		ret = io_write(req, s, force_nonblock);
>  		break;
>  	case IORING_OP_FSYNC:
> @@ -2088,7 +2077,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  	if (ret)
>  		return ret;
>  
> -	if (ctx->flags & IORING_SETUP_IOPOLL) {
> +	if ((ctx->flags & IORING_SETUP_IOPOLL) && poll) {
>  		if (req->result == -EAGAIN)
>  			return -EAGAIN;
>  
>
Yang Erkun Nov. 4, 2019, 11:46 a.m. UTC | #2
On 2019/11/4 18:09, Bob Liu wrote:
> On 11/4/19 4:56 PM, yangerkun wrote:
>> There is no problem with supporting other request types for a ctx with
>> IORING_SETUP_IOPOLL.
> 
> Could you describe the benefit of doing this?

Hi,

I am trying to replace libaio with io_uring in InnoDB/MariaDB (which
runs on xfs/nvme). To simulate a timeout mechanism like io_getevents,
I first chose to poll io_uring's fd, and that did work. But when I
enable IORING_SETUP_IOPOLL (since xfs implements the iopoll interface),
that mechanism fails, because io_uring_poll compares cq.head against
cached_cq_tail, which is not updated until we call io_uring_enter and
do the polling. So instead I decided to use timeout requests in
io_uring, but they return -EINVAL because IORING_SETUP_IOPOLL is
enabled at the same time. I think this combination is a normal use
case, as are the other combinations described in this patch. I am not
sure this is a good solution to the problem, and maybe there is a
better way.
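
For illustration, a minimal userspace sketch of the usage I am aiming
for, assuming liburing and assuming the kernel accepts IORING_OP_TIMEOUT
on a ring created with IORING_SETUP_IOPOLL (the very combination under
discussion here). The helper name wait_events_timeout is hypothetical
and not part of any posted patch:

#include <liburing.h>

/*
 * Sketch only: emulate an io_getevents()-style bounded wait by queueing
 * an IORING_OP_TIMEOUT that completes after 'want' CQEs have been
 * posted or after 'timeout_ms', whichever comes first. With
 * IORING_SETUP_IOPOLL, io_uring_wait_cqe() ends up in
 * io_uring_enter(IORING_ENTER_GETEVENTS), which does the polling.
 */
static int wait_events_timeout(struct io_uring *ring, unsigned want,
			       long timeout_ms)
{
	struct __kernel_timespec ts = {
		.tv_sec  = timeout_ms / 1000,
		.tv_nsec = (timeout_ms % 1000) * 1000000L,
	};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_timeout(sqe, &ts, want, 0);

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	/* The first CQE may be an I/O completion or the timeout itself. */
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	io_uring_cqe_seen(ring, cqe);
	return 0;
}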

Thanks,
Kun.



> 
>> All we need to do is insert only read/write
>> requests into the poll list, and protect the cqes with completion_lock
>> in io_iopoll_complete, since other requests can now complete at the
>> same time as io_iopoll_complete runs.
>>
>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>> ---
>>   fs/io_uring.c | 29 +++++++++--------------------
>>   1 file changed, 9 insertions(+), 20 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index f9a38998f2fc..8bf52d9a2c55 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -753,9 +753,11 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>>   {
>>   	void *reqs[IO_IOPOLL_BATCH];
>>   	struct io_kiocb *req;
>> +	unsigned long flags;
>>   	int to_free;
>>   
>>   	to_free = 0;
>> +	spin_lock_irqsave(&ctx->completion_lock, flags);
>>   	while (!list_empty(done)) {
>>   		req = list_first_entry(done, struct io_kiocb, list);
>>   		list_del(&req->list);
>> @@ -781,6 +783,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>>   	}
>>   
>>   	io_commit_cqring(ctx);
>> +	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>>   	io_free_req_many(ctx, reqs, &to_free);
>>   }
>>   
>> @@ -1517,9 +1520,6 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
>>   	struct io_ring_ctx *ctx = req->ctx;
>>   	long err = 0;
>>   
>> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
>> -		return -EINVAL;
>> -
>>   	io_cqring_add_event(ctx, user_data, err);
>>   	io_put_req(req);
>>   	return 0;
>> @@ -1527,13 +1527,9 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
>>   
>>   static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   {
>> -	struct io_ring_ctx *ctx = req->ctx;
>> -
>>   	if (!req->file)
>>   		return -EBADF;
>>   
>> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
>> -		return -EINVAL;
>>   	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
>>   		return -EINVAL;
>>   
>> @@ -1574,14 +1570,11 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>>   
>>   static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   {
>> -	struct io_ring_ctx *ctx = req->ctx;
>>   	int ret = 0;
>>   
>>   	if (!req->file)
>>   		return -EBADF;
>>   
>> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
>> -		return -EINVAL;
>>   	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
>>   		return -EINVAL;
>>   
>> @@ -1627,9 +1620,6 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>>   	struct socket *sock;
>>   	int ret;
>>   
>> -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>> -		return -EINVAL;
>> -
>>   	sock = sock_from_file(req->file, &ret);
>>   	if (sock) {
>>   		struct user_msghdr __user *msg;
>> @@ -1712,8 +1702,6 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   	struct io_kiocb *poll_req, *next;
>>   	int ret = -ENOENT;
>>   
>> -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>> -		return -EINVAL;
>>   	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
>>   	    sqe->poll_events)
>>   		return -EINVAL;
>> @@ -1833,8 +1821,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   	__poll_t mask;
>>   	u16 events;
>>   
>> -	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>> -		return -EINVAL;
>>   	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
>>   		return -EINVAL;
>>   	if (!poll->file)
>> @@ -1932,8 +1918,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   	struct timespec64 ts;
>>   	unsigned span = 0;
>>   
>> -	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
>> -		return -EINVAL;
>>   	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
>>   	    sqe->len != 1)
>>   		return -EINVAL;
>> @@ -2032,6 +2016,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
>>   			   const struct sqe_submit *s, bool force_nonblock)
>>   {
>>   	int ret, opcode;
>> +	bool poll = false;
>>   
>>   	req->user_data = READ_ONCE(s->sqe->user_data);
>>   
>> @@ -2046,17 +2031,21 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
>>   	case IORING_OP_READV:
>>   		if (unlikely(s->sqe->buf_index))
>>   			return -EINVAL;
>> +		poll = true;
>>   		ret = io_read(req, s, force_nonblock);
>>   		break;
>>   	case IORING_OP_WRITEV:
>>   		if (unlikely(s->sqe->buf_index))
>>   			return -EINVAL;
>> +		poll = true;
>>   		ret = io_write(req, s, force_nonblock);
>>   		break;
>>   	case IORING_OP_READ_FIXED:
>> +		poll = true;
>>   		ret = io_read(req, s, force_nonblock);
>>   		break;
>>   	case IORING_OP_WRITE_FIXED:
>> +		poll = true;
>>   		ret = io_write(req, s, force_nonblock);
>>   		break;
>>   	case IORING_OP_FSYNC:
>> @@ -2088,7 +2077,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
>>   	if (ret)
>>   		return ret;
>>   
>> -	if (ctx->flags & IORING_SETUP_IOPOLL) {
>> +	if ((ctx->flags & IORING_SETUP_IOPOLL) && poll) {
>>   		if (req->result == -EAGAIN)
>>   			return -EAGAIN;
>>   
>>
> 
> 
> .
>
Jens Axboe Nov. 4, 2019, 2:01 p.m. UTC | #3
On 11/4/19 4:46 AM, yangerkun wrote:
> 
> 
> On 2019/11/4 18:09, Bob Liu wrote:
>> On 11/4/19 4:56 PM, yangerkun wrote:
>>> There is no problem with supporting other request types for a ctx with
>>> IORING_SETUP_IOPOLL.
>>
>> Could you describe the benefit of doing this?
> 
> Hi,
> 
> I am trying to replace libaio with io_uring in InnoDB/MariaDB (which
> runs on xfs/nvme). To simulate a timeout mechanism like io_getevents,
> I first chose to poll io_uring's fd, and that did work. But when I
> enable IORING_SETUP_IOPOLL (since xfs implements the iopoll interface),
> that mechanism fails, because io_uring_poll compares cq.head against
> cached_cq_tail, which is not updated until we call io_uring_enter and
> do the polling. So instead I decided to use timeout requests in
> io_uring, but they return -EINVAL because IORING_SETUP_IOPOLL is
> enabled at the same time. I think this combination is a normal use
> case, as are the other combinations described in this patch. I am not
> sure this is a good solution to the problem, and maybe there is a
> better way.

I think we can support timeouts pretty easily with SETUP_IOPOLL, but we
can't mix and match everything. Pretty sure I've written at length about
that before, but the tldr is that for purely polled commands, we won't
have an IRQ event. That's the case for nvme if it's misconfigured, but
for an optimal setup where nvme is loaded with poll queues, there will
never be an interrupt for the command. This means that we can never wait
in io_cqring_wait(), we must always call the iopoll poller, because if
we wait we might very well be waiting for events that will never happen
unless we actively poll for them.

This could be supported if we accounted requests, but I don't want to
add that kind of overhead. Same with the lock+irqdisable you had to add
for this, it's not acceptable overhead.

Sounds like you just need timeout support for polling? If so, then that
is supportable as we know that these events will trigger an async event
when they happen. Either that, or it triggers when we poll for
completions. So it's safe to support, and we can definitely do that.

But don't mix polled IO with "normal" IO, it just won't work without
extra additions that I'm not willing to pay the cost of.
Yang Erkun Nov. 6, 2019, noon UTC | #4
On 2019/11/4 22:01, Jens Axboe wrote:
> On 11/4/19 4:46 AM, yangerkun wrote:
>>
>>
>> On 2019/11/4 18:09, Bob Liu wrote:
>>> On 11/4/19 4:56 PM, yangerkun wrote:
>>>> There is no problem with supporting other request types for a ctx with
>>>> IORING_SETUP_IOPOLL.
>>>
>>> Could you describe the benefit of doing this?
>>
>> Hi,
>>
>> I am trying to replace libaio with io_uring in InnoDB/MariaDB (which
>> runs on xfs/nvme). To simulate a timeout mechanism like io_getevents,
>> I first chose to poll io_uring's fd, and that did work. But when I
>> enable IORING_SETUP_IOPOLL (since xfs implements the iopoll interface),
>> that mechanism fails, because io_uring_poll compares cq.head against
>> cached_cq_tail, which is not updated until we call io_uring_enter and
>> do the polling. So instead I decided to use timeout requests in
>> io_uring, but they return -EINVAL because IORING_SETUP_IOPOLL is
>> enabled at the same time. I think this combination is a normal use
>> case, as are the other combinations described in this patch. I am not
>> sure this is a good solution to the problem, and maybe there is a
>> better way.
> 
> I think we can support timeouts pretty easily with SETUP_IOPOLL, but we
> can't mix and match everything. Pretty sure I've written at length about
> that before, but the tldr is that for purely polled commands, we won't
> have an IRQ event. That's the case for nvme if it's misconfigured, but
> for an optimal setup where nvme is loaded with poll queues, there will
> never be an interrupt for the command. This means that we can never wait
> in io_cqring_wait(), we must always call the iopoll poller, because if
> we wait we might very well be waiting for events that will never happen
> unless we actively poll for them.
> 
> This could be supported if we accounted requests, but I don't want to
> add that kind of overhead. Same with the lock+irqdisable you had to add
> for this, it's not acceptable overhead.
> 
> Sounds like you just need timeout support for polling? If so, then that

Hi,

Yeah, the thing I need to add is timeout support for polling.

> is supportable as we know that these events will trigger an async event
> when they happen. Either that, or it triggers when we poll for
> completions. So it's safe to support, and we can definitely do that.

I am a little confused. Are you saying that once SETUP_IOPOLL is
enabled and io_timeout_fn is called asynchronously, we should not call
io_cqring_fill_event directly, but leave io_iopoll_check to do it?
Otherwise, io_iopoll_check instances running in parallel may trigger
problems, since both paths can end up in io_cqring_fill_event.

Thanks,
Kun.

> 
> But don't mix polled IO with "normal" IO, it just won't work without
> extra additions that I'm not willing to pay the cost of.
> 
>
Jens Axboe Nov. 6, 2019, 2:14 p.m. UTC | #5
On 11/6/19 5:00 AM, yangerkun wrote:
> 
> 
> On 2019/11/4 22:01, Jens Axboe wrote:
>> On 11/4/19 4:46 AM, yangerkun wrote:
>>>
>>>
>>> On 2019/11/4 18:09, Bob Liu wrote:
>>>> On 11/4/19 4:56 PM, yangerkun wrote:
>>>>> There is no problem with supporting other request types for a ctx with
>>>>> IORING_SETUP_IOPOLL.
>>>>
>>>> Could you describe the benefit of doing this?
>>>
>>> Hi,
>>>
>>> I am trying to replace libaio with io_uring in InnoDB/MariaDB (which
>>> runs on xfs/nvme). To simulate a timeout mechanism like io_getevents,
>>> I first chose to poll io_uring's fd, and that did work. But when I
>>> enable IORING_SETUP_IOPOLL (since xfs implements the iopoll interface),
>>> that mechanism fails, because io_uring_poll compares cq.head against
>>> cached_cq_tail, which is not updated until we call io_uring_enter and
>>> do the polling. So instead I decided to use timeout requests in
>>> io_uring, but they return -EINVAL because IORING_SETUP_IOPOLL is
>>> enabled at the same time. I think this combination is a normal use
>>> case, as are the other combinations described in this patch. I am not
>>> sure this is a good solution to the problem, and maybe there is a
>>> better way.
>>
>> I think we can support timeouts pretty easily with SETUP_IOPOLL, but we
>> can't mix and match everything. Pretty sure I've written at length about
>> that before, but the tldr is that for purely polled commands, we won't
>> have an IRQ event. That's the case for nvme if it's misconfigured, but
>> for an optimal setup where nvme is loaded with poll queues, there will
>> never be an interrupt for the command. This means that we can never wait
>> in io_cqring_wait(), we must always call the iopoll poller, because if
>> we wait we might very well be waiting for events that will never happen
>> unless we actively poll for them.
>>
>> This could be supported if we accounted requests, but I don't want to
>> add that kind of overhead. Same with the lock+irqdisable you had to add
>> for this, it's not acceptable overhead.
>>
>> Sounds like you just need timeout support for polling? If so, then that
> 
> Hi,
> 
> Yeah, the thing I need to add is timeout support for polling.
> 
>> is supportable as we know that these events will trigger an async event
>> when they happen. Either that, or it triggers when we poll for
>> completions. So it's safe to support, and we can definitely do that.
> 
> I am a little confused. Are you saying that once SETUP_IOPOLL is
> enabled and io_timeout_fn is called asynchronously, we should not call
> io_cqring_fill_event directly, but leave io_iopoll_check to do it?
> Otherwise, io_iopoll_check instances running in parallel may trigger
> problems, since both paths can end up in io_cqring_fill_event.

Maybe that wasn't quite clear, what I'm trying to say is that there are
two outcomes with IORING_OP_TIMEOUT:

1) The number of events specified in the timeout is met. This happens
   through the normal poll command checks when we complete commands.
2) The timer fires. When this happens, we just increment ->cq_timeouts.
   You'd have to make a note of that in the poll loop just like we do in
   cqring_wait() to know to abort if that value is different from when
   we started the loop.

All that's needed to support timeouts with IORING_SETUP_IOPOLL is to
have that ->cq_timeouts check in place. With that, the restriction could
be lifted.
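
Concretely, that check could look something like the following sketch
(an illustration only, not actual mainline code; the function and field
names follow the 5.4-era fs/io_uring.c):

static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
			   long min)
{
	/* Snapshot the timeout count before we start polling, as
	 * io_cqring_wait() does before it sleeps. */
	unsigned int start_timeouts = atomic_read(&ctx->cq_timeouts);
	int ret = 0;

	do {
		int tmin = 0;

		/*
		 * io_timeout_fn() bumps ->cq_timeouts from its hrtimer
		 * callback; if that has happened since we started
		 * spinning, stop polling and let the caller reap the
		 * timeout CQE.
		 */
		if (atomic_read(&ctx->cq_timeouts) != start_timeouts)
			break;

		if (*nr_events < min)
			tmin = min - *nr_events;

		ret = io_iopoll_getevents(ctx, nr_events, tmin);
		if (ret <= 0)
			break;
		ret = 0;
	} while (min && !*nr_events && !need_resched());

	return ret;
}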
Yang Erkun Nov. 7, 2019, 1:27 a.m. UTC | #6
On 2019/11/6 22:14, Jens Axboe wrote:
> On 11/6/19 5:00 AM, yangerkun wrote:
>>
>>
>> On 2019/11/4 22:01, Jens Axboe wrote:
>>> On 11/4/19 4:46 AM, yangerkun wrote:
>>>>
>>>>
>>>> On 2019/11/4 18:09, Bob Liu wrote:
>>>>> On 11/4/19 4:56 PM, yangerkun wrote:
>>>>>> There is no problem with supporting other request types for a ctx with
>>>>>> IORING_SETUP_IOPOLL.
>>>>>
>>>>> Could you describe the benefit of doing this?
>>>>
>>>> Hi,
>>>>
>>>> I am trying to replace libaio with io_uring in InnoDB/MariaDB (which
>>>> runs on xfs/nvme). To simulate a timeout mechanism like io_getevents,
>>>> I first chose to poll io_uring's fd, and that did work. But when I
>>>> enable IORING_SETUP_IOPOLL (since xfs implements the iopoll interface),
>>>> that mechanism fails, because io_uring_poll compares cq.head against
>>>> cached_cq_tail, which is not updated until we call io_uring_enter and
>>>> do the polling. So instead I decided to use timeout requests in
>>>> io_uring, but they return -EINVAL because IORING_SETUP_IOPOLL is
>>>> enabled at the same time. I think this combination is a normal use
>>>> case, as are the other combinations described in this patch. I am not
>>>> sure this is a good solution to the problem, and maybe there is a
>>>> better way.
>>>
>>> I think we can support timeouts pretty easily with SETUP_IOPOLL, but we
>>> can't mix and match everything. Pretty sure I've written at length about
>>> that before, but the tldr is that for purely polled commands, we won't
>>> have an IRQ event. That's the case for nvme if it's misconfigured, but
>>> for an optimal setup where nvme is loaded with poll queues, there will
>>> never be an interrupt for the command. This means that we can never wait
>>> in io_cqring_wait(), we must always call the iopoll poller, because if
>>> we wait we might very well be waiting for events that will never happen
>>> unless we actively poll for them.
>>>
>>> This could be supported if we accounted requests, but I don't want to
>>> add that kind of overhead. Same with the lock+irqdisable you had to add
>>> for this, it's not acceptable overhead.
>>>
>>> Sounds like you just need timeout support for polling? If so, then that
>>
>> Hi,
>>
>> Yeah, the thing I need to add is timeout support for polling.
>>
>>> is supportable as we know that these events will trigger an async event
>>> when they happen. Either that, or it triggers when we poll for
>>> completions. So it's safe to support, and we can definitely do that.
>>
>> I am a little confused. Are you saying that once SETUP_IOPOLL is
>> enabled and io_timeout_fn is called asynchronously, we should not call
>> io_cqring_fill_event directly, but leave io_iopoll_check to do it?
>> Otherwise, io_iopoll_check instances running in parallel may trigger
>> problems, since both paths can end up in io_cqring_fill_event.
> 
> Maybe that wasn't quite clear, what I'm trying to say is that there are
> two outcomes with IORING_OP_TIMEOUT:
> 
> 1) The number of events specified in the timeout is met. This happens
>     through the normal poll command checks when we complete commands.
> 2) The timer fires. When this happens, we just increment ->cq_timeouts.
>     You'd have to make a note of that in the poll loop just like we do in
>     cqring_wait() to know to abort if that value is different from when
>     we started the loop.
> 
> All that's needed to support timeouts with IORING_SETUP_IOPOLL is to
> have that ->cq_timeouts check in place. With that, the restriction could
> be lifted.
> 


Thank you for your detailed explanation! I will try the solution you
suggested instead of this patch!

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f9a38998f2fc..8bf52d9a2c55 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -753,9 +753,11 @@  static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 {
 	void *reqs[IO_IOPOLL_BATCH];
 	struct io_kiocb *req;
+	unsigned long flags;
 	int to_free;
 
 	to_free = 0;
+	spin_lock_irqsave(&ctx->completion_lock, flags);
 	while (!list_empty(done)) {
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
@@ -781,6 +783,7 @@  static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	}
 
 	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_free_req_many(ctx, reqs, &to_free);
 }
 
@@ -1517,9 +1520,6 @@  static int io_nop(struct io_kiocb *req, u64 user_data)
 	struct io_ring_ctx *ctx = req->ctx;
 	long err = 0;
 
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-
 	io_cqring_add_event(ctx, user_data, err);
 	io_put_req(req);
 	return 0;
@@ -1527,13 +1527,9 @@  static int io_nop(struct io_kiocb *req, u64 user_data)
 
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_ring_ctx *ctx = req->ctx;
-
 	if (!req->file)
 		return -EBADF;
 
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
@@ -1574,14 +1570,11 @@  static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	int ret = 0;
 
 	if (!req->file)
 		return -EBADF;
 
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
@@ -1627,9 +1620,6 @@  static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	struct socket *sock;
 	int ret;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-
 	sock = sock_from_file(req->file, &ret);
 	if (sock) {
 		struct user_msghdr __user *msg;
@@ -1712,8 +1702,6 @@  static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_kiocb *poll_req, *next;
 	int ret = -ENOENT;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
 	    sqe->poll_events)
 		return -EINVAL;
@@ -1833,8 +1821,6 @@  static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	__poll_t mask;
 	u16 events;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
 		return -EINVAL;
 	if (!poll->file)
@@ -1932,8 +1918,6 @@  static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct timespec64 ts;
 	unsigned span = 0;
 
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
 	    sqe->len != 1)
 		return -EINVAL;
@@ -2032,6 +2016,7 @@  static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock)
 {
 	int ret, opcode;
+	bool poll = false;
 
 	req->user_data = READ_ONCE(s->sqe->user_data);
 
@@ -2046,17 +2031,21 @@  static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_READV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
+		poll = true;
 		ret = io_read(req, s, force_nonblock);
 		break;
 	case IORING_OP_WRITEV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
+		poll = true;
 		ret = io_write(req, s, force_nonblock);
 		break;
 	case IORING_OP_READ_FIXED:
+		poll = true;
 		ret = io_read(req, s, force_nonblock);
 		break;
 	case IORING_OP_WRITE_FIXED:
+		poll = true;
 		ret = io_write(req, s, force_nonblock);
 		break;
 	case IORING_OP_FSYNC:
@@ -2088,7 +2077,7 @@  static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	if (ret)
 		return ret;
 
-	if (ctx->flags & IORING_SETUP_IOPOLL) {
+	if ((ctx->flags & IORING_SETUP_IOPOLL) && poll) {
 		if (req->result == -EAGAIN)
 			return -EAGAIN;