[12/12] io_uring: support true async buffered reads, if file provides it

Message ID 20200523185755.8494-13-axboe@kernel.dk
State New
Series
  • Add support for async buffered reads

Commit Message

Jens Axboe May 23, 2020, 6:57 p.m. UTC
If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
the buffered read to an io-wq worker. Instead we can rely on page
unlocking callbacks to support retry based async IO. This is a lot more
efficient than doing async thread offload.

The retry is done similarly to how we handle poll based retry. From
the unlock callback, we simply queue the retry to a task_work based
handler.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
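
For orientation, the core of the mechanism condenses to roughly the following (a sketch distilled from the full patch at the bottom of this page, not the literal code: refcounting details, the wake_page_match() check and the cancelation fallback are omitted):

/* Sketch only, condensed from the patch below. */

/* Before punting a buffered read that returned -EAGAIN, arm a page-unlock
 * callback if the file supports it. Only attempted once per request
 * (IOCB_WAITQ) and never for O_DIRECT. */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct kiocb *kiocb = &req->rw.kiocb;

	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
		return false;
	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
		return false;
	/* registers io_async_buf_func as the wait_page_queue callback */
	return kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
					  io_async_buf_func, req) == 0;
}

/* Page-unlock callback: instead of blocking, queue task_work that
 * re-submits the request from the original task's context
 * (io_async_buf_retry() then calls __io_queue_sqe() under uring_lock). */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct io_kiocb *req = wait->private;
	struct io_async_rw *rw = &req->io->rw;

	list_del_init(&wait->entry);
	init_task_work(&rw->task_work, io_async_buf_retry);
	refcount_inc(&req->refs);	/* submit ref gets dropped */
	task_work_add(req->task, &rw->task_work, true);
	wake_up_process(req->task);
	return 1;
}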

Comments

Pavel Begunkov May 25, 2020, 7:29 a.m. UTC | #1
On 23/05/2020 21:57, Jens Axboe wrote:
> If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
> the buffered read to an io-wq worker. Instead we can rely on page
> unlocking callbacks to support retry based async IO. This is a lot more
> efficient than doing async thread offload.
> 
> The retry is done similarly to how we handle poll based retry. From
> the unlock callback, we simply queue the retry to a task_work based
> handler.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/io_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 99 insertions(+)
> 
...
> +
> +	init_task_work(&rw->task_work, io_async_buf_retry);
> +	/* submit ref gets dropped, acquire a new one */
> +	refcount_inc(&req->refs);
> +	tsk = req->task;
> +	ret = task_work_add(tsk, &rw->task_work, true);
> +	if (unlikely(ret)) {
> +		/* queue just for cancelation */
> +		init_task_work(&rw->task_work, io_async_buf_cancel);
> +		tsk = io_wq_get_task(req->ctx->io_wq);

IIRC, task will be put somewhere around io_free_req(). Then shouldn't there be
some juggling here, reassigning req->task with task_{get,put}()?

> +		task_work_add(tsk, &rw->task_work, true);
> +	}
> +	wake_up_process(tsk);
> +	return 1;
> +}
...
>  static int io_read(struct io_kiocb *req, bool force_nonblock)
>  {
>  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
> @@ -2601,6 +2696,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>  	if (!ret) {
>  		ssize_t ret2;
>  
> +retry:
>  		if (req->file->f_op->read_iter)
>  			ret2 = call_read_iter(req->file, kiocb, &iter);
>  		else
> @@ -2619,6 +2715,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>  			if (!(req->flags & REQ_F_NOWAIT) &&
>  			    !file_can_poll(req->file))
>  				req->flags |= REQ_F_MUST_PUNT;
> +			if (io_rw_should_retry(req))

It looks like a state machine with IOCB_WAITQ and gotos. Wouldn't it be cleaner
to call call_read_iter()/loop_rw_iter() here directly instead of "goto retry" ?

BTW, can this async stuff return -EAGAIN ?

> +				goto retry;
> +			kiocb->ki_flags &= ~IOCB_WAITQ;
>  			return -EAGAIN;
>  		}
>  	}
>
Jens Axboe May 25, 2020, 7:59 p.m. UTC | #2
On 5/25/20 1:29 AM, Pavel Begunkov wrote:
> On 23/05/2020 21:57, Jens Axboe wrote:
>> If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
>> the buffered read to an io-wq worker. Instead we can rely on page
>> unlocking callbacks to support retry based async IO. This is a lot more
>> efficient than doing async thread offload.
>>
>> The retry is done similarly to how we handle poll based retry. From
>> the unlock callback, we simply queue the retry to a task_work based
>> handler.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  fs/io_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 99 insertions(+)
>>
> ...
>> +
>> +	init_task_work(&rw->task_work, io_async_buf_retry);
>> +	/* submit ref gets dropped, acquire a new one */
>> +	refcount_inc(&req->refs);
>> +	tsk = req->task;
>> +	ret = task_work_add(tsk, &rw->task_work, true);
>> +	if (unlikely(ret)) {
>> +		/* queue just for cancelation */
>> +		init_task_work(&rw->task_work, io_async_buf_cancel);
>> +		tsk = io_wq_get_task(req->ctx->io_wq);
> 
> IIRC, task will be put somewhere around io_free_req(). Then shouldn't there be
> some juggling here, reassigning req->task with task_{get,put}()?

Not sure I follow? Yes, we'll put this task again when the request
is freed, but not sure what you mean with juggling?

>> +		task_work_add(tsk, &rw->task_work, true);
>> +	}
>> +	wake_up_process(tsk);
>> +	return 1;
>> +}
> ...
>>  static int io_read(struct io_kiocb *req, bool force_nonblock)
>>  {
>>  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
>> @@ -2601,6 +2696,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>  	if (!ret) {
>>  		ssize_t ret2;
>>  
>> +retry:
>>  		if (req->file->f_op->read_iter)
>>  			ret2 = call_read_iter(req->file, kiocb, &iter);
>>  		else
>> @@ -2619,6 +2715,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>  			if (!(req->flags & REQ_F_NOWAIT) &&
>>  			    !file_can_poll(req->file))
>>  				req->flags |= REQ_F_MUST_PUNT;
>> +			if (io_rw_should_retry(req))
> 
> It looks like a state machine with IOCB_WAITQ and gotos. Wouldn't it be cleaner
> to call call_read_iter()/loop_rw_iter() here directly instead of "goto retry" ?

We could, probably making that part a separate helper then. How about the
below incremental?

> BTW, can this async stuff return -EAGAIN ?

Probably? Prefer not to make any definitive calls on that being possible or
not, as it's sure to disappoint. If it does and IOCB_WAITQ is already set,
then we'll punt to a thread like before.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index a5a4d9602915..669dccd81207 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2677,6 +2677,13 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 	return false;
 }
 
+static int __io_read(struct io_kiocb *req, struct iov_iter *iter)
+{
+	if (req->file->f_op->read_iter)
+		return call_read_iter(req->file, &req->rw.kiocb, iter);
+	return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+}
+
 static int io_read(struct io_kiocb *req, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -2710,11 +2717,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 	if (!ret) {
 		ssize_t ret2;
 
-retry:
-		if (req->file->f_op->read_iter)
-			ret2 = call_read_iter(req->file, kiocb, &iter);
-		else
-			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
+		ret2 = __io_read(req, &iter);
 
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN) {
@@ -2729,8 +2732,11 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 			if (!(req->flags & REQ_F_NOWAIT) &&
 			    !file_can_poll(req->file))
 				req->flags |= REQ_F_MUST_PUNT;
-			if (io_rw_should_retry(req))
-				goto retry;
+			if (io_rw_should_retry(req)) {
+				ret2 = __io_read(req, &iter);
+				if (ret2 != -EAGAIN)
+					goto out_free;
+			}
 			kiocb->ki_flags &= ~IOCB_WAITQ;
 			return -EAGAIN;
 		}
Pavel Begunkov May 26, 2020, 7:38 a.m. UTC | #3
On 23/05/2020 21:57, Jens Axboe wrote:
> If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
> the buffered read to an io-wq worker. Instead we can rely on page
> unlocking callbacks to support retry based async IO. This is a lot more
> efficient than doing async thread offload.
> 
> The retry is done similarly to how we handle poll based retry. From
> the unlock callback, we simply queue the retry to a task_work based
> handler.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/io_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 99 insertions(+)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index e95481c552ff..dd532d2634c2 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -498,6 +498,8 @@ struct io_async_rw {
>  	struct iovec			*iov;
>  	ssize_t				nr_segs;
>  	ssize_t				size;
> +	struct wait_page_queue		wpq;
> +	struct callback_head		task_work;
>  };
>  
>  struct io_async_ctx {
> @@ -2568,6 +2570,99 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  	return 0;
>  }
>  
> +static void io_async_buf_cancel(struct callback_head *cb)
> +{
> +	struct io_async_rw *rw;
> +	struct io_ring_ctx *ctx;
> +	struct io_kiocb *req;
> +
> +	rw = container_of(cb, struct io_async_rw, task_work);
> +	req = rw->wpq.wait.private;
> +	ctx = req->ctx;
> +
> +	spin_lock_irq(&ctx->completion_lock);
> +	io_cqring_fill_event(req, -ECANCELED);

It seems like it should go through kiocb_done()/io_complete_rw_common().
My concern is missing io_put_kbuf().

> +	io_commit_cqring(ctx);
> +	spin_unlock_irq(&ctx->completion_lock);
> +
> +	io_cqring_ev_posted(ctx);
> +	req_set_fail_links(req);
> +	io_double_put_req(req);
> +}
Pavel Begunkov May 26, 2020, 7:44 a.m. UTC | #4
On 25/05/2020 22:59, Jens Axboe wrote:
> On 5/25/20 1:29 AM, Pavel Begunkov wrote:
>> On 23/05/2020 21:57, Jens Axboe wrote:
>>> If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
>>> the buffered read to an io-wq worker. Instead we can rely on page
>>> unlocking callbacks to support retry based async IO. This is a lot more
>>> efficient than doing async thread offload.
>>>
>>> The retry is done similarly to how we handle poll based retry. From
>>> the unlock callback, we simply queue the retry to a task_work based
>>> handler.
>>>
>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>> ---
>>>  fs/io_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 99 insertions(+)
>>>
>> ...
>>> +
>>> +	init_task_work(&rw->task_work, io_async_buf_retry);
>>> +	/* submit ref gets dropped, acquire a new one */
>>> +	refcount_inc(&req->refs);
>>> +	tsk = req->task;
>>> +	ret = task_work_add(tsk, &rw->task_work, true);
>>> +	if (unlikely(ret)) {
>>> +		/* queue just for cancelation */
>>> +		init_task_work(&rw->task_work, io_async_buf_cancel);
>>> +		tsk = io_wq_get_task(req->ctx->io_wq);
>>
>> IIRC, task will be put somewhere around io_free_req(). Then shouldn't there be
>> some juggling here, reassigning req->task with task_{get,put}()?
> 
> Not sure I follow? Yes, we'll put this task again when the request
> is freed, but not sure what you mean with juggling?

I meant something like:

...
/* queue just for cancelation */
init_task_work(&rw->task_work, io_async_buf_cancel);
+ put_task_struct(req->task);
+ req->task = get_task_struct(io_wq_task);


but, thinking twice, if I got the whole idea right, it should be ok as is --
io-wq won't go away before the request anyway, and leaving req->task pinned down
for a bit is not a problem.

>>> +		task_work_add(tsk, &rw->task_work, true);
>>> +	}
>>> +	wake_up_process(tsk);
>>> +	return 1;
>>> +}
>> ...
>>>  static int io_read(struct io_kiocb *req, bool force_nonblock)
>>>  {
>>>  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
>>> @@ -2601,6 +2696,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>>  	if (!ret) {
>>>  		ssize_t ret2;
>>>  
>>> +retry:
>>>  		if (req->file->f_op->read_iter)
>>>  			ret2 = call_read_iter(req->file, kiocb, &iter);
>>>  		else
>>> @@ -2619,6 +2715,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>>  			if (!(req->flags & REQ_F_NOWAIT) &&
>>>  			    !file_can_poll(req->file))
>>>  				req->flags |= REQ_F_MUST_PUNT;
>>> +			if (io_rw_should_retry(req))
>>
>> It looks like a state machine with IOCB_WAITQ and gotos. Wouldn't it be cleaner
>> to call call_read_iter()/loop_rw_iter() here directly instead of "goto retry" ?
> 
> We could, probably making that part a separate helper then. How about the
> below incremental?

IMHO, it was easy to get lost with such implicit state switching.
Looks better now! See a small comment below.

> 
>> BTW, can this async stuff return -EAGAIN ?
> 
> Probably? Prefer not to make any definitive calls on that being possible or
> not, as it's sure to disappoint. If it does and IOCB_WAITQ is already set,
> then we'll punt to a thread like before.

Sounds reasonable

> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index a5a4d9602915..669dccd81207 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2677,6 +2677,13 @@ static bool io_rw_should_retry(struct io_kiocb *req)
>  	return false;
>  }
>  
> +static int __io_read(struct io_kiocb *req, struct iov_iter *iter)
> +{
> +	if (req->file->f_op->read_iter)
> +		return call_read_iter(req->file, &req->rw.kiocb, iter);
> +	return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
> +}
> +
>  static int io_read(struct io_kiocb *req, bool force_nonblock)
>  {
>  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
> @@ -2710,11 +2717,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>  	if (!ret) {
>  		ssize_t ret2;
>  
> -retry:
> -		if (req->file->f_op->read_iter)
> -			ret2 = call_read_iter(req->file, kiocb, &iter);
> -		else
> -			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
> +		ret2 = __io_read(req, &iter);
>  
>  		/* Catch -EAGAIN return for forced non-blocking submission */
>  		if (!force_nonblock || ret2 != -EAGAIN) {
> @@ -2729,8 +2732,11 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>  			if (!(req->flags & REQ_F_NOWAIT) &&
>  			    !file_can_poll(req->file))
>  				req->flags |= REQ_F_MUST_PUNT;
> -			if (io_rw_should_retry(req))
> -				goto retry;
> +			if (io_rw_should_retry(req)) {
> +				ret2 = __io_read(req, &iter);
> +				if (ret2 != -EAGAIN)
> +					goto out_free;

"goto out_free" returns ret=0, so someone should add a cqe

if (ret2 != -EAGAIN) {
	kiocb_done(kiocb, ret2);
	goto out_free;
}


> +			}
>  			kiocb->ki_flags &= ~IOCB_WAITQ;
>  			return -EAGAIN;
>  		}
>
Jens Axboe May 26, 2020, 1:47 p.m. UTC | #5
On 5/26/20 1:38 AM, Pavel Begunkov wrote:
> On 23/05/2020 21:57, Jens Axboe wrote:
>> If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
>> the buffered read to an io-wq worker. Instead we can rely on page
>> unlocking callbacks to support retry based async IO. This is a lot more
>> efficient than doing async thread offload.
>>
>> The retry is done similarly to how we handle poll based retry. From
>> the unlock callback, we simply queue the retry to a task_work based
>> handler.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  fs/io_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 99 insertions(+)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index e95481c552ff..dd532d2634c2 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -498,6 +498,8 @@ struct io_async_rw {
>>  	struct iovec			*iov;
>>  	ssize_t				nr_segs;
>>  	ssize_t				size;
>> +	struct wait_page_queue		wpq;
>> +	struct callback_head		task_work;
>>  };
>>  
>>  struct io_async_ctx {
>> @@ -2568,6 +2570,99 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>>  	return 0;
>>  }
>>  
>> +static void io_async_buf_cancel(struct callback_head *cb)
>> +{
>> +	struct io_async_rw *rw;
>> +	struct io_ring_ctx *ctx;
>> +	struct io_kiocb *req;
>> +
>> +	rw = container_of(cb, struct io_async_rw, task_work);
>> +	req = rw->wpq.wait.private;
>> +	ctx = req->ctx;
>> +
>> +	spin_lock_irq(&ctx->completion_lock);
>> +	io_cqring_fill_event(req, -ECANCELED);
> 
> It seems like it should go through kiocb_done()/io_complete_rw_common().
> My concern is missing io_put_kbuf().

Yeah, I noticed that too after sending it out. If you look at the
current one that I updated yesterday, it does add that (and also
renames the iter read helper):

https://git.kernel.dk/cgit/linux-block/commit/?h=async-buffered.5&id=6f4e3a4066d0db3e3478e58cc250afb16d8d4d91
Jens Axboe May 26, 2020, 1:50 p.m. UTC | #6
On 5/26/20 1:44 AM, Pavel Begunkov wrote:
> On 25/05/2020 22:59, Jens Axboe wrote:
>> On 5/25/20 1:29 AM, Pavel Begunkov wrote:
>>> On 23/05/2020 21:57, Jens Axboe wrote:
>>>> If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
>>>> the buffered read to an io-wq worker. Instead we can rely on page
>>>> unlocking callbacks to support retry based async IO. This is a lot more
>>>> efficient than doing async thread offload.
>>>>
>>>> The retry is done similarly to how we handle poll based retry. From
>>>> the unlock callback, we simply queue the retry to a task_work based
>>>> handler.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>>  fs/io_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 99 insertions(+)
>>>>
>>> ...
>>>> +
>>>> +	init_task_work(&rw->task_work, io_async_buf_retry);
>>>> +	/* submit ref gets dropped, acquire a new one */
>>>> +	refcount_inc(&req->refs);
>>>> +	tsk = req->task;
>>>> +	ret = task_work_add(tsk, &rw->task_work, true);
>>>> +	if (unlikely(ret)) {
>>>> +		/* queue just for cancelation */
>>>> +		init_task_work(&rw->task_work, io_async_buf_cancel);
>>>> +		tsk = io_wq_get_task(req->ctx->io_wq);
>>>
>>> IIRC, task will be put somewhere around io_free_req(). Then shouldn't there be
>>> some juggling here, reassigning req->task with task_{get,put}()?
>>
>> Not sure I follow? Yes, we'll put this task again when the request
>> is freed, but not sure what you mean with juggling?
> 
> I meant something like:
> 
> ...
> /* queue just for cancelation */
> init_task_work(&rw->task_work, io_async_buf_cancel);
> + put_task_struct(req->task);
> + req->task = get_task_struct(io_wq_task);
> 
> 
> but, thinking twice, if I got the whole idea right, it should be ok as
> is -- io-wq won't go away before the request anyway, and leaving
> req->task pinned down for a bit is not a problem.

OK good, then I think we agree it's fine.

>>>> +		task_work_add(tsk, &rw->task_work, true);
>>>> +	}
>>>> +	wake_up_process(tsk);
>>>> +	return 1;
>>>> +}
>>> ...
>>>>  static int io_read(struct io_kiocb *req, bool force_nonblock)
>>>>  {
>>>>  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
>>>> @@ -2601,6 +2696,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>>>  	if (!ret) {
>>>>  		ssize_t ret2;
>>>>  
>>>> +retry:
>>>>  		if (req->file->f_op->read_iter)
>>>>  			ret2 = call_read_iter(req->file, kiocb, &iter);
>>>>  		else
>>>> @@ -2619,6 +2715,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>>>  			if (!(req->flags & REQ_F_NOWAIT) &&
>>>>  			    !file_can_poll(req->file))
>>>>  				req->flags |= REQ_F_MUST_PUNT;
>>>> +			if (io_rw_should_retry(req))
>>>
>>> It looks like a state machine with IOCB_WAITQ and gotos. Wouldn't it be cleaner
>>> to call call_read_iter()/loop_rw_iter() here directly instead of "goto retry" ?
>>
>> We could, probably making that part a separate helper then. How about the
>> below incremental?
> 
> IMHO, it was easy to get lost with such implicit state switching.
> Looks better now! See a small comment below.

Agree, that is cleaner.

>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index a5a4d9602915..669dccd81207 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -2677,6 +2677,13 @@ static bool io_rw_should_retry(struct io_kiocb *req)
>>  	return false;
>>  }
>>  
>> +static int __io_read(struct io_kiocb *req, struct iov_iter *iter)
>> +{
>> +	if (req->file->f_op->read_iter)
>> +		return call_read_iter(req->file, &req->rw.kiocb, iter);
>> +	return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
>> +}
>> +
>>  static int io_read(struct io_kiocb *req, bool force_nonblock)
>>  {
>>  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
>> @@ -2710,11 +2717,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>  	if (!ret) {
>>  		ssize_t ret2;
>>  
>> -retry:
>> -		if (req->file->f_op->read_iter)
>> -			ret2 = call_read_iter(req->file, kiocb, &iter);
>> -		else
>> -			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
>> +		ret2 = __io_read(req, &iter);
>>  
>>  		/* Catch -EAGAIN return for forced non-blocking submission */
>>  		if (!force_nonblock || ret2 != -EAGAIN) {
>> @@ -2729,8 +2732,11 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
>>  			if (!(req->flags & REQ_F_NOWAIT) &&
>>  			    !file_can_poll(req->file))
>>  				req->flags |= REQ_F_MUST_PUNT;
>> -			if (io_rw_should_retry(req))
>> -				goto retry;
>> +			if (io_rw_should_retry(req)) {
>> +				ret2 = __io_read(req, &iter);
>> +				if (ret2 != -EAGAIN)
>> +					goto out_free;
> 
> "goto out_free" returns ret=0, so someone should add a cqe
> 
> if (ret2 != -EAGAIN) {
> 	kiocb_done(kiocb, ret2);
> 	goto out_free;
> }

Fixed up in the current one.

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e95481c552ff..dd532d2634c2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -498,6 +498,8 @@  struct io_async_rw {
 	struct iovec			*iov;
 	ssize_t				nr_segs;
 	ssize_t				size;
+	struct wait_page_queue		wpq;
+	struct callback_head		task_work;
 };
 
 struct io_async_ctx {
@@ -2568,6 +2570,99 @@  static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static void io_async_buf_cancel(struct callback_head *cb)
+{
+	struct io_async_rw *rw;
+	struct io_ring_ctx *ctx;
+	struct io_kiocb *req;
+
+	rw = container_of(cb, struct io_async_rw, task_work);
+	req = rw->wpq.wait.private;
+	ctx = req->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	io_cqring_fill_event(req, -ECANCELED);
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	req_set_fail_links(req);
+	io_double_put_req(req);
+}
+
+static void io_async_buf_retry(struct callback_head *cb)
+{
+	struct io_async_rw *rw;
+	struct io_ring_ctx *ctx;
+	struct io_kiocb *req;
+
+	rw = container_of(cb, struct io_async_rw, task_work);
+	req = rw->wpq.wait.private;
+	ctx = req->ctx;
+
+	__set_current_state(TASK_RUNNING);
+	mutex_lock(&ctx->uring_lock);
+	__io_queue_sqe(req, NULL);
+	mutex_unlock(&ctx->uring_lock);
+}
+
+static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
+			     int sync, void *arg)
+{
+	struct wait_page_queue *wpq;
+	struct io_kiocb *req = wait->private;
+	struct io_async_rw *rw = &req->io->rw;
+	struct wait_page_key *key = arg;
+	struct task_struct *tsk;
+	int ret;
+
+	wpq = container_of(wait, struct wait_page_queue, wait);
+
+	ret = wake_page_match(wpq, key);
+	if (ret != 1)
+		return ret;
+
+	list_del_init(&wait->entry);
+
+	init_task_work(&rw->task_work, io_async_buf_retry);
+	/* submit ref gets dropped, acquire a new one */
+	refcount_inc(&req->refs);
+	tsk = req->task;
+	ret = task_work_add(tsk, &rw->task_work, true);
+	if (unlikely(ret)) {
+		/* queue just for cancelation */
+		init_task_work(&rw->task_work, io_async_buf_cancel);
+		tsk = io_wq_get_task(req->ctx->io_wq);
+		task_work_add(tsk, &rw->task_work, true);
+	}
+	wake_up_process(tsk);
+	return 1;
+}
+
+static bool io_rw_should_retry(struct io_kiocb *req)
+{
+	struct kiocb *kiocb = &req->rw.kiocb;
+	int ret;
+
+	/* already tried, or we're doing O_DIRECT */
+	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+		return false;
+	/*
+	 * just use poll if we can, and don't attempt if the fs doesn't
+	 * support callback based unlocks
+	 */
+	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+		return false;
+
+	ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
+						io_async_buf_func, req);
+	if (ret)
+		return false;
+	get_task_struct(current);
+	req->task = current;
+	return true;
+}
+
 static int io_read(struct io_kiocb *req, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -2601,6 +2696,7 @@  static int io_read(struct io_kiocb *req, bool force_nonblock)
 	if (!ret) {
 		ssize_t ret2;
 
+retry:
 		if (req->file->f_op->read_iter)
 			ret2 = call_read_iter(req->file, kiocb, &iter);
 		else
@@ -2619,6 +2715,9 @@  static int io_read(struct io_kiocb *req, bool force_nonblock)
 			if (!(req->flags & REQ_F_NOWAIT) &&
 			    !file_can_poll(req->file))
 				req->flags |= REQ_F_MUST_PUNT;
+			if (io_rw_should_retry(req))
+				goto retry;
+			kiocb->ki_flags &= ~IOCB_WAITQ;
 			return -EAGAIN;
 		}
 	}
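
The kernel-side change is transparent to applications. For illustration only (not part of this patch), a plain buffered read submitted through liburing, like the one sketched below, would exercise the new path on a filesystem that sets FMODE_BUF_RASYNC: on a page cache miss the read is retried from the page-unlock callback rather than being punted to an io-wq thread.

/*
 * Minimal liburing example (illustration, assumes a "testfile" exists):
 * a buffered read that can complete via the retry path added here.
 */
#include <fcntl.h>
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[4096];
	int fd;

	fd = open("testfile", O_RDONLY);	/* buffered: no O_DIRECT */
	if (fd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
	io_uring_submit(&ring);

	if (!io_uring_wait_cqe(&ring, &cqe)) {
		printf("read returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}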