
[v2] fs, USB gadget: Rework kiocb cancellation

Message ID 20240208215518.1361570-1-bvanassche@acm.org

Commit Message

Bart Van Assche Feb. 8, 2024, 9:55 p.m. UTC
Calling kiocb_set_cancel_fn() without knowing whether the caller
submitted a struct kiocb or a struct aio_kiocb is unsafe. Fix this by
introducing the cancel_kiocb() method in struct file_operations. The
following call trace illustrates that without this patch an
out-of-bounds write happens if I/O is submitted by io_uring (from a
phone with an ARM CPU and kernel 6.1):

WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
 kiocb_set_cancel_fn+0x9c/0xa8
 ffs_epfile_read_iter+0x144/0x1d0
 io_read+0x19c/0x498
 io_issue_sqe+0x118/0x27c
 io_submit_sqes+0x25c/0x5fc
 __arm64_sys_io_uring_enter+0x104/0xab0
 invoke_syscall+0x58/0x11c
 el0_svc_common+0xb4/0xf4
 do_el0_svc+0x2c/0xb0
 el0_svc+0x2c/0xa4
 el0t_64_sync_handler+0x68/0xb4
 el0t_64_sync+0x1a4/0x1a8
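
For reference, the function that triggers this warning looks roughly as
follows (abridged from the fs/aio.c hunk below that removes it; the
comments are annotations, not part of the kernel source). The
container_of() is only valid when the kiocb is embedded in an aio_kiocb,
i.e. when the I/O was submitted through fs/aio.c:

void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
	/*
	 * Invalid for io_uring: there the kiocb is embedded in io_uring's
	 * own request structure, so "req" points into unrelated memory.
	 */
	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
	struct kioctx *ctx = req->ki_ctx;
	unsigned long flags;

	/* Presumably the WARN_ON_ONCE reported at fs/aio.c:598 above. */
	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
		return;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	/* These stores write outside the kiocb submitted by io_uring. */
	list_add_tail(&req->ki_list, &ctx->active_reqs);
	req->ki_cancel = cancel;
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}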

Several ideas in this patch come from the following patch: Christoph
Hellwig, "[PATCH 08/32] aio: replace kiocb_set_cancel_fn with a
cancel_kiocb file operation", May 2018
(https://lore.kernel.org/all/20180515194833.6906-9-hch@lst.de/).

Cc: Christoph Hellwig <hch@lst.de>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Sandeep Dhavale <dhavale@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 drivers/usb/gadget/function/f_fs.c |  19 +----
 drivers/usb/gadget/legacy/inode.c  |  12 +--
 fs/aio.c                           | 129 +++++++++++++++++++----------
 include/linux/aio.h                |   7 --
 include/linux/fs.h                 |   1 +
 5 files changed, 90 insertions(+), 78 deletions(-)

Changes compared to v1:
 - Fixed a race between request completion and addition to the list of
   active requests.
 - Changed the return type of .cancel_kiocb() from int into void.
 - Simplified the .cancel_kiocb() implementations.
 - Introduced the ki_opcode member in struct aio_kiocb.
 - aio_cancel_and_del() checks .ki_opcode before accessing union members.
 - Left out the include/include/mm changes.

Comments

Jens Axboe Feb. 8, 2024, 10:14 p.m. UTC | #1
On 2/8/24 2:55 PM, Bart Van Assche wrote:
> diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
> index 6bff6cb93789..4837e3071263 100644
> --- a/drivers/usb/gadget/function/f_fs.c
> +++ b/drivers/usb/gadget/function/f_fs.c
> @@ -31,7 +31,6 @@
>  #include <linux/usb/composite.h>
>  #include <linux/usb/functionfs.h>
>  
> -#include <linux/aio.h>
>  #include <linux/kthread.h>
>  #include <linux/poll.h>
>  #include <linux/eventfd.h>
> @@ -1157,23 +1156,16 @@ ffs_epfile_open(struct inode *inode, struct file *file)
>  	return stream_open(inode, file);
>  }
>  
> -static int ffs_aio_cancel(struct kiocb *kiocb)
> +static void ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
>  {
>  	struct ffs_io_data *io_data = kiocb->private;
>  	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
>  	unsigned long flags;
> -	int value;
>  
>  	spin_lock_irqsave(&epfile->ffs->eps_lock, flags);
> -
>  	if (io_data && io_data->ep && io_data->req)
> -		value = usb_ep_dequeue(io_data->ep, io_data->req);
> -	else
> -		value = -EINVAL;
> -
> +		usb_ep_dequeue(io_data->ep, io_data->req);
>  	spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags);
> -
> -	return value;
>  }

I'm assuming the NULL checks can go because it's just the async parts
now?

> @@ -634,6 +619,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
>  	queue_rcu_work(system_wq, &ctx->free_rwork);
>  }
>  
> +static void aio_cancel_and_del(struct aio_kiocb *req);
> +

Just move the function higher up? It doesn't have any dependencies.

> @@ -1552,6 +1538,24 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
>  	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
>  }
>  
> +static void aio_add_rw_to_active_reqs(struct kiocb *req)
> +{
> +	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
> +	struct kioctx *ctx = aio->ki_ctx;
> +	unsigned long flags;
> +
> +	/*
> +	 * If the .cancel_kiocb() callback has been set, add the request
> +	 * to the list of active requests.
> +	 */
> +	if (!req->ki_filp->f_op->cancel_kiocb)
> +		return;
> +
> +	spin_lock_irqsave(&ctx->ctx_lock, flags);
> +	list_add_tail(&aio->ki_list, &ctx->active_reqs);
> +	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> +}

This can use spin_lock_irq(), always called from process context.
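
That is, something like this (untested sketch of the suggested change):

	spin_lock_irq(&ctx->ctx_lock);
	list_add_tail(&aio->ki_list, &ctx->active_reqs);
	spin_unlock_irq(&ctx->ctx_lock);

with the "unsigned long flags" local removed, since the submission path
never runs with interrupts already disabled.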

> +/* Must be called only for IOCB_CMD_POLL requests. */
> +static void aio_poll_cancel(struct aio_kiocb *aiocb)
> +{
> +	struct poll_iocb *req = &aiocb->poll;
> +	struct kioctx *ctx = aiocb->ki_ctx;
> +
> +	lockdep_assert_held(&ctx->ctx_lock);
> +
> +	if (!poll_iocb_lock_wq(req))
> +		return;
> +
> +	WRITE_ONCE(req->cancelled, true);
> +	if (!req->work_scheduled) {
> +		schedule_work(&aiocb->poll.work);
> +		req->work_scheduled = true;
> +	}
> +	poll_iocb_unlock_wq(req);
> +}

Not your code, it's just moved, but this looks racy. Might not matter,
and obviously beyond the scope of this change.

> +{
> +	void (*cancel_kiocb)(struct kiocb *) =
> +		req->rw.ki_filp->f_op->cancel_kiocb;
> +	struct kioctx *ctx = req->ki_ctx;
> +
> +	lockdep_assert_held(&ctx->ctx_lock);
> +
> +	switch (req->ki_opcode) {
> +	case IOCB_CMD_PREAD:
> +	case IOCB_CMD_PWRITE:
> +	case IOCB_CMD_PREADV:
> +	case IOCB_CMD_PWRITEV:
> +		if (cancel_kiocb)
> +			cancel_kiocb(&req->rw);
> +		break;
> +	case IOCB_CMD_FSYNC:
> +	case IOCB_CMD_FDSYNC:
> +		break;
> +	case IOCB_CMD_POLL:
> +		aio_poll_cancel(req);
> +		break;
> +	default:
> +		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
> +	}
> +
> +	list_del_init(&req->ki_list);
> +}

Why don't you just keep ki_cancel() and just change it to a void return
that takes an aio_kiocb? Then you don't need this odd switch, or adding
an opcode field just for this. That seems cleaner.

Outside of these little nits, looks alright. I'd still love to kill the
silly cancel code just for the gadget bits, but that's for another day.
And since the gadget and aio code basically never changes, a cleaned-up
variant of this patch should be trivial enough to backport to
stable, so I don't think we need to worry about doing a fixup first.
Bart Van Assche Feb. 8, 2024, 10:41 p.m. UTC | #2
On 2/8/24 14:14, Jens Axboe wrote:
> On 2/8/24 2:55 PM, Bart Van Assche wrote:
>> -static int ffs_aio_cancel(struct kiocb *kiocb)
>> +static void ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
>>   {
>>   	struct ffs_io_data *io_data = kiocb->private;
>>   	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
>>   	unsigned long flags;
>> -	int value;
>>   
>>   	spin_lock_irqsave(&epfile->ffs->eps_lock, flags);
>> -
>>   	if (io_data && io_data->ep && io_data->req)
>> -		value = usb_ep_dequeue(io_data->ep, io_data->req);
>> -	else
>> -		value = -EINVAL;
>> -
>> +		usb_ep_dequeue(io_data->ep, io_data->req);
>>   	spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags);
>> -
>> -	return value;
>>   }
> 
> I'm assuming the NULL checks can go because it's just the async parts
> now?

Probably. I will look into this.

>> +static void aio_cancel_and_del(struct aio_kiocb *req);
>> +
> 
> Just move the function higher up? It doesn't have any dependencies.

aio_cancel_and_del() calls aio_poll_cancel(). aio_poll_cancel() calls
poll_iocb_lock_wq(). poll_iocb_lock_wq() is defined below the first call of
aio_cancel_and_del(). It's probably possible to get rid of that function
declaration but a nontrivial amount of code would have to be moved.

>> @@ -1552,6 +1538,24 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
>>   	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
>>   }
>>   
>> +static void aio_add_rw_to_active_reqs(struct kiocb *req)
>> +{
>> +	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
>> +	struct kioctx *ctx = aio->ki_ctx;
>> +	unsigned long flags;
>> +
>> +	/*
>> +	 * If the .cancel_kiocb() callback has been set, add the request
>> +	 * to the list of active requests.
>> +	 */
>> +	if (!req->ki_filp->f_op->cancel_kiocb)
>> +		return;
>> +
>> +	spin_lock_irqsave(&ctx->ctx_lock, flags);
>> +	list_add_tail(&aio->ki_list, &ctx->active_reqs);
>> +	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
>> +}
> 
> This can use spin_lock_irq(), always called from process context.

I will make this change.

>> +{
>> +	void (*cancel_kiocb)(struct kiocb *) =
>> +		req->rw.ki_filp->f_op->cancel_kiocb;
>> +	struct kioctx *ctx = req->ki_ctx;
>> +
>> +	lockdep_assert_held(&ctx->ctx_lock);
>> +
>> +	switch (req->ki_opcode) {
>> +	case IOCB_CMD_PREAD:
>> +	case IOCB_CMD_PWRITE:
>> +	case IOCB_CMD_PREADV:
>> +	case IOCB_CMD_PWRITEV:
>> +		if (cancel_kiocb)
>> +			cancel_kiocb(&req->rw);
>> +		break;
>> +	case IOCB_CMD_FSYNC:
>> +	case IOCB_CMD_FDSYNC:
>> +		break;
>> +	case IOCB_CMD_POLL:
>> +		aio_poll_cancel(req);
>> +		break;
>> +	default:
>> +		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
>> +	}
>> +
>> +	list_del_init(&req->ki_list);
>> +}
> 
> Why don't you just keep ki_cancel() and just change it to a void return
> that takes an aio_kiocb? Then you don't need this odd switch, or adding
> an opcode field just for this. That seems cleaner.

Keeping .ki_cancel() means that it must be set before I/O starts and only
if the I/O is submitted by libaio. That would require an approach to
recognize whether or not a struct kiocb is embedded in struct aio_kiocb,
e.g. the patch that you posted as a reply on version one of this patch.
Does anyone else want to comment on this?

Thanks,

Bart.
Jens Axboe Feb. 8, 2024, 11:05 p.m. UTC | #3
On 2/8/24 3:41 PM, Bart Van Assche wrote:
>> Just move the function higher up? It doesn't have any dependencies.
> 
> aio_cancel_and_del() calls aio_poll_cancel(). aio_poll_cancel() calls
> poll_iocb_lock_wq(). poll_iocb_lock_wq() is defined below the first call of
> aio_cancel_and_del(). It's probably possible to get rid of that function
> declaration but a nontrivial amount of code would have to be moved.

Ah yes, I mixed it up with the cancel add helper. Forward decl is fine
then, keeps the patch smaller for backporting too.

>>> +{
>>> +    void (*cancel_kiocb)(struct kiocb *) =
>>> +        req->rw.ki_filp->f_op->cancel_kiocb;
>>> +    struct kioctx *ctx = req->ki_ctx;
>>> +
>>> +    lockdep_assert_held(&ctx->ctx_lock);
>>> +
>>> +    switch (req->ki_opcode) {
>>> +    case IOCB_CMD_PREAD:
>>> +    case IOCB_CMD_PWRITE:
>>> +    case IOCB_CMD_PREADV:
>>> +    case IOCB_CMD_PWRITEV:
>>> +        if (cancel_kiocb)
>>> +            cancel_kiocb(&req->rw);
>>> +        break;
>>> +    case IOCB_CMD_FSYNC:
>>> +    case IOCB_CMD_FDSYNC:
>>> +        break;
>>> +    case IOCB_CMD_POLL:
>>> +        aio_poll_cancel(req);
>>> +        break;
>>> +    default:
>>> +        WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
>>> +    }
>>> +
>>> +    list_del_init(&req->ki_list);
>>> +}
>>
>> Why don't you just keep ki_cancel() and just change it to a void return
>> that takes an aio_kiocb? Then you don't need this odd switch, or adding
>> an opcode field just for this. That seems cleaner.
> 
> Keeping .ki_cancel() means that it must be set before I/O starts and
> only if the I/O is submitted by libaio. That would require an approach
> to recognize whether or not a struct kiocb is embedded in struct
> aio_kiocb, e.g. the patch that you posted as a reply on version one of
> this patch. Does anyone else want to comment on this?

Maybe I wasn't clear, but this is in aio_req. You already add an opcode
in there, only to then add a switch here based on that opcode. Just have
a cancel callback which takes aio_req as an argument. For POLL, this can
be aio_poll_cancel(). Add a wrapper for read/write which then calls 
req->rw.ki_filp->f_op->cancel_kiocb(&req->rw); Then the above can
become:

aio_rw_cancel(req)
{
	void (*cancel_kiocb)(struct kiocb *) =
		req->rw.ki_filp->f_op->cancel_kiocb;

	cancel_kiocb(&req->rw);
}

aio_read()
{
	...
	req->cancel = aio_rw_cancel;
	...
}

static void aio_cancel_and_del(struct aio_kiocb *req)
{
	void (*cancel_kiocb)(struct kiocb *) =
		req->rw.ki_filp->f_op->cancel_kiocb;
	struct kioctx *ctx = req->ki_ctx;

	lockdep_assert_held(&ctx->ctx_lock);
	if (req->cancel)
		req->cancel(req);
	list_del_init(&req->ki_list);
}

or something like that. fsync/fdsync clears ->cancel() to NULL, poll
sets it to aio_poll_cancel(), and read/write like the above.
Jens Axboe Feb. 8, 2024, 11:16 p.m. UTC | #4
On 2/8/24 4:05 PM, Jens Axboe wrote:
> On 2/8/24 3:41 PM, Bart Van Assche wrote:
>>> Just move the function higher up? It doesn't have any dependencies.
>>
>> aio_cancel_and_del() calls aio_poll_cancel(). aio_poll_cancel() calls
>> poll_iocb_lock_wq(). poll_iocb_lock_wq() is defined below the first call of
>> aio_cancel_and_del(). It's probably possible to get rid of that function
>> declaration but a nontrivial amount of code would have to be moved.
> 
> Ah yes, I mixed it up with the cancel add helper. Forward decl is fine
> then, keeps the patch smaller for backporting too.
> 
>>>> +{
>>>> +    void (*cancel_kiocb)(struct kiocb *) =
>>>> +        req->rw.ki_filp->f_op->cancel_kiocb;
>>>> +    struct kioctx *ctx = req->ki_ctx;
>>>> +
>>>> +    lockdep_assert_held(&ctx->ctx_lock);
>>>> +
>>>> +    switch (req->ki_opcode) {
>>>> +    case IOCB_CMD_PREAD:
>>>> +    case IOCB_CMD_PWRITE:
>>>> +    case IOCB_CMD_PREADV:
>>>> +    case IOCB_CMD_PWRITEV:
>>>> +        if (cancel_kiocb)
>>>> +            cancel_kiocb(&req->rw);
>>>> +        break;
>>>> +    case IOCB_CMD_FSYNC:
>>>> +    case IOCB_CMD_FDSYNC:
>>>> +        break;
>>>> +    case IOCB_CMD_POLL:
>>>> +        aio_poll_cancel(req);
>>>> +        break;
>>>> +    default:
>>>> +        WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
>>>> +    }
>>>> +
>>>> +    list_del_init(&req->ki_list);
>>>> +}
>>>
>>> Why don't you just keep ki_cancel() and just change it to a void return
>>> that takes an aio_kiocb? Then you don't need this odd switch, or adding
>>> an opcode field just for this. That seems cleaner.
>>
>> Keeping .ki_cancel() means that it must be set before I/O starts and
>> only if the I/O is submitted by libaio. That would require an approach
>> to recognize whether or not a struct kiocb is embedded in struct
>> aio_kiocb, e.g. the patch that you posted as a reply on version one of
>> this patch. Does anyone else want to comment on this?
> 
> Maybe I wasn't clear, but this is in aio_req. You already add an opcode
> in there, only to then add a switch here based on that opcode. Just have
> a cancel callback which takes aio_req as an argument. For POLL, this can
> be aio_poll_cancel(). Add a wrapper for read/write which then calls 
> req->rw.ki_filp->f_op->cancel_kiocb(&req->rw); Then the above can
> become:
> 
> aio_rw_cancel(req)
> {
> 	void (*cancel_kiocb)(struct kiocb *) =
> 		req->rw.ki_filp->f_op->cancel_kiocb;
> 
> 	cancel_kiocb(&req->rw);
> }
> 
> aio_read()
> {
> 	...
> 	req->cancel = aio_rw_cancel;
> 	...
> }
> 
> static void aio_cancel_and_del(struct aio_kiocb *req)
> {
> 	void (*cancel_kiocb)(struct kiocb *) =
> 		req->rw.ki_filp->f_op->cancel_kiocb;
> 	struct kioctx *ctx = req->ki_ctx;
> 
> 	lockdep_assert_held(&ctx->ctx_lock);
> 	if (req->cancel)
> 		req->cancel(req);
> 	list_del_init(&req->ki_list);
> }
> 
> or something like that. fsync/fdsync clears ->cancel() to NULL, poll
> sets it to aio_poll_cancel(), and read/write like the above.

Totally untested incremental. I think this is cleaner, and it's less
code too.

diff --git a/fs/aio.c b/fs/aio.c
index 9dc0be703aa6..a7770f59269f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -202,6 +202,8 @@ struct aio_kiocb {
 		struct poll_iocb	poll;
 	};
 
+	void (*ki_cancel)(struct aio_kiocb *);
+
 	struct kioctx		*ki_ctx;
 
 	struct io_event		ki_res;
@@ -210,8 +212,6 @@ struct aio_kiocb {
 						 * for cancellation */
 	refcount_t		ki_refcnt;
 
-	u16			ki_opcode;	/* IOCB_CMD_* */
-
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
 	 * this is the underlying eventfd context to deliver events to.
@@ -1576,6 +1576,11 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
 	}
 }
 
+static void aio_rw_cancel(struct aio_kiocb *req)
+{
+	req->rw.ki_filp->f_op->cancel_kiocb(&req->rw);
+}
+
 static int aio_read(struct kiocb *req, const struct iocb *iocb,
 			bool vectored, bool compat)
 {
@@ -1722,50 +1727,14 @@ static void poll_iocb_unlock_wq(struct poll_iocb *req)
 	rcu_read_unlock();
 }
 
-/* Must be called only for IOCB_CMD_POLL requests. */
-static void aio_poll_cancel(struct aio_kiocb *aiocb)
-{
-	struct poll_iocb *req = &aiocb->poll;
-	struct kioctx *ctx = aiocb->ki_ctx;
-
-	lockdep_assert_held(&ctx->ctx_lock);
-
-	if (!poll_iocb_lock_wq(req))
-		return;
-
-	WRITE_ONCE(req->cancelled, true);
-	if (!req->work_scheduled) {
-		schedule_work(&aiocb->poll.work);
-		req->work_scheduled = true;
-	}
-	poll_iocb_unlock_wq(req);
-}
-
 static void aio_cancel_and_del(struct aio_kiocb *req)
 {
-	void (*cancel_kiocb)(struct kiocb *) =
-		req->rw.ki_filp->f_op->cancel_kiocb;
 	struct kioctx *ctx = req->ki_ctx;
 
 	lockdep_assert_held(&ctx->ctx_lock);
 
-	switch (req->ki_opcode) {
-	case IOCB_CMD_PREAD:
-	case IOCB_CMD_PWRITE:
-	case IOCB_CMD_PREADV:
-	case IOCB_CMD_PWRITEV:
-		if (cancel_kiocb)
-			cancel_kiocb(&req->rw);
-		break;
-	case IOCB_CMD_FSYNC:
-	case IOCB_CMD_FDSYNC:
-		break;
-	case IOCB_CMD_POLL:
-		aio_poll_cancel(req);
-		break;
-	default:
-		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
-	}
+	if (req->ki_cancel)
+		req->ki_cancel(req);
 
 	list_del_init(&req->ki_list);
 }
@@ -1922,6 +1891,25 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	add_wait_queue(head, &pt->iocb->poll.wait);
 }
 
+/* Must be called only for IOCB_CMD_POLL requests. */
+static void aio_poll_cancel(struct aio_kiocb *aiocb)
+{
+	struct poll_iocb *req = &aiocb->poll;
+	struct kioctx *ctx = aiocb->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	if (!poll_iocb_lock_wq(req))
+		return;
+
+	WRITE_ONCE(req->cancelled, true);
+	if (!req->work_scheduled) {
+		schedule_work(&aiocb->poll.work);
+		req->work_scheduled = true;
+	}
+	poll_iocb_unlock_wq(req);
+}
+
 static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 {
 	struct kioctx *ctx = aiocb->ki_ctx;
@@ -2028,23 +2016,27 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 	req->ki_res.data = iocb->aio_data;
 	req->ki_res.res = 0;
 	req->ki_res.res2 = 0;
-
-	req->ki_opcode = iocb->aio_lio_opcode;
+	req->ki_cancel = NULL;
 
 	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_read(&req->rw, iocb, false, compat);
 	case IOCB_CMD_PWRITE:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_write(&req->rw, iocb, false, compat);
 	case IOCB_CMD_PREADV:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_read(&req->rw, iocb, true, compat);
 	case IOCB_CMD_PWRITEV:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_write(&req->rw, iocb, true, compat);
 	case IOCB_CMD_FSYNC:
 		return aio_fsync(&req->fsync, iocb, false);
 	case IOCB_CMD_FDSYNC:
 		return aio_fsync(&req->fsync, iocb, true);
 	case IOCB_CMD_POLL:
+		req->ki_cancel = aio_poll_cancel;
 		return aio_poll(req, iocb);
 	default:
 		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
Christian Brauner Feb. 9, 2024, 9:34 a.m. UTC | #5
On Thu, Feb 08, 2024 at 03:14:43PM -0700, Jens Axboe wrote:
> On 2/8/24 2:55 PM, Bart Van Assche wrote:
> > diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
> > index 6bff6cb93789..4837e3071263 100644
> > --- a/drivers/usb/gadget/function/f_fs.c
> > +++ b/drivers/usb/gadget/function/f_fs.c
> > @@ -31,7 +31,6 @@
> >  #include <linux/usb/composite.h>
> >  #include <linux/usb/functionfs.h>
> >  
> > -#include <linux/aio.h>
> >  #include <linux/kthread.h>
> >  #include <linux/poll.h>
> >  #include <linux/eventfd.h>
> > @@ -1157,23 +1156,16 @@ ffs_epfile_open(struct inode *inode, struct file *file)
> >  	return stream_open(inode, file);
> >  }
> >  
> > -static int ffs_aio_cancel(struct kiocb *kiocb)
> > +static void ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
> >  {
> >  	struct ffs_io_data *io_data = kiocb->private;
> >  	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
> >  	unsigned long flags;
> > -	int value;
> >  
> >  	spin_lock_irqsave(&epfile->ffs->eps_lock, flags);
> > -
> >  	if (io_data && io_data->ep && io_data->req)
> > -		value = usb_ep_dequeue(io_data->ep, io_data->req);
> > -	else
> > -		value = -EINVAL;
> > -
> > +		usb_ep_dequeue(io_data->ep, io_data->req);
> >  	spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags);
> > -
> > -	return value;
> >  }
> 
> I'm assuming the NULL checks can go because it's just the async parts
> now?
> 
> > @@ -634,6 +619,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
> >  	queue_rcu_work(system_wq, &ctx->free_rwork);
> >  }
> >  
> > +static void aio_cancel_and_del(struct aio_kiocb *req);
> > +
> 
> Just move the function higher up? It doesn't have any dependencies.
> 
> > @@ -1552,6 +1538,24 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
> >  	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
> >  }
> >  
> > +static void aio_add_rw_to_active_reqs(struct kiocb *req)
> > +{
> > +	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
> > +	struct kioctx *ctx = aio->ki_ctx;
> > +	unsigned long flags;
> > +
> > +	/*
> > +	 * If the .cancel_kiocb() callback has been set, add the request
> > +	 * to the list of active requests.
> > +	 */
> > +	if (!req->ki_filp->f_op->cancel_kiocb)
> > +		return;
> > +
> > +	spin_lock_irqsave(&ctx->ctx_lock, flags);
> > +	list_add_tail(&aio->ki_list, &ctx->active_reqs);
> > +	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> > +}
> 
> This can use spin_lock_irq(), always called from process context.
> 
> > +/* Must be called only for IOCB_CMD_POLL requests. */
> > +static void aio_poll_cancel(struct aio_kiocb *aiocb)
> > +{
> > +	struct poll_iocb *req = &aiocb->poll;
> > +	struct kioctx *ctx = aiocb->ki_ctx;
> > +
> > +	lockdep_assert_held(&ctx->ctx_lock);
> > +
> > +	if (!poll_iocb_lock_wq(req))
> > +		return;
> > +
> > +	WRITE_ONCE(req->cancelled, true);
> > +	if (!req->work_scheduled) {
> > +		schedule_work(&aiocb->poll.work);
> > +		req->work_scheduled = true;
> > +	}
> > +	poll_iocb_unlock_wq(req);
> > +}
> 
> Not your code, it's just moved, but this looks racy. Might not matter,
> and obviously beyond the scope of this change.
> 
> > +{
> > +	void (*cancel_kiocb)(struct kiocb *) =
> > +		req->rw.ki_filp->f_op->cancel_kiocb;
> > +	struct kioctx *ctx = req->ki_ctx;
> > +
> > +	lockdep_assert_held(&ctx->ctx_lock);
> > +
> > +	switch (req->ki_opcode) {
> > +	case IOCB_CMD_PREAD:
> > +	case IOCB_CMD_PWRITE:
> > +	case IOCB_CMD_PREADV:
> > +	case IOCB_CMD_PWRITEV:
> > +		if (cancel_kiocb)
> > +			cancel_kiocb(&req->rw);
> > +		break;
> > +	case IOCB_CMD_FSYNC:
> > +	case IOCB_CMD_FDSYNC:
> > +		break;
> > +	case IOCB_CMD_POLL:
> > +		aio_poll_cancel(req);
> > +		break;
> > +	default:
> > +		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
> > +	}
> > +
> > +	list_del_init(&req->ki_list);
> > +}
> 
> Why don't you just keep ki_cancel() and just change it to a void return
> that takes an aio_kiocb? Then you don't need this odd switch, or adding
> an opcode field just for this. That seems cleaner.
> 
> Outside of these little nits, looks alright. I'd still love to kill the
> silly cancel code just for the gadget bits, but that's for another day.

Well, I'd prefer to kill it if we can asap. Because then we can lose
that annoying file_operations addition. That really rubs me the wrong way.

> And since the gadget and aio code basically never changes, a cleaned-up
> variant of this patch should be trivial enough to backport to
> stable, so I don't think we need to worry about doing a fixup first.
Jens Axboe Feb. 9, 2024, 6:12 p.m. UTC | #6
On 2/9/24 2:34 AM, Christian Brauner wrote:
>> Why don't you just keep ki_cancel() and just change it to a void return
>> that takes an aio_kiocb? Then you don't need this odd switch, or adding
>> an opcode field just for this. That seems cleaner.
>>
>> Outside of these little nits, looks alright. I'd still love to kill the
>> silly cancel code just for the gadget bits, but that's for another day.
> 
> Well, I'd prefer to kill it if we can asap. Because then we can lose
> that annoying file_operations addition. That really rubs me the wrong way.

Greg, can you elaborate on how useful cancel is for gadgets? Is it one
of those things that was wired up "just because", or does it have
actually useful cases?

Because cancel, internally in aio, makes sense on eg a poll request. But
we don't need extra support for that, that's all internal to aio. It
doesn't make sense for O_DIRECT IO on a regular file, as there's no way
to cancel that anyway.

Reason I'm asking is that we have this broken cancel infrastructure that
we can either attempt to make work, at a cost of adding an operation to
the file_operations struct, or we can just get rid of it.
Bart Van Assche Feb. 12, 2024, 7:28 p.m. UTC | #7
On 2/9/24 10:12, Jens Axboe wrote:
> Greg, can you elaborate on how useful cancel is for gadgets? Is it one
> of those things that was wired up "just because", or does it have
> actually useful cases?

I found two use cases in the Android Open Source Project and have submitted
CLs that request to remove the io_cancel() calls from that code. Although I
think I understand why these calls were added, the race conditions that
these io_cancel() calls try to address cannot be addressed completely by
calling io_cancel().

Bart.
Bart Van Assche Feb. 13, 2024, 9:01 p.m. UTC | #8
On 2/12/24 11:28, Bart Van Assche wrote:
> On 2/9/24 10:12, Jens Axboe wrote:
>> Greg, can you elaborate on how useful cancel is for gadgets? Is it one
>> of those things that was wired up "just because", or does it have
>> actually useful cases?
> 
> I found two use cases in the Android Open Source Project and have submitted
> CLs that request to remove the io_cancel() calls from that code. Although I
> think I understand why these calls were added, the race conditions that
> these io_cancel() calls try to address cannot be addressed completely by
> calling io_cancel().

(replying to my own e-mail) The adb daemon (adbd) maintainers asked me to
preserve the I/O cancellation code in adbd because it was introduced recently
in that code to fix an important bug. Does everyone agree with the approach of
the untested patches below?

Thanks,

Bart.

-----------------------------------------------------------------------------
[PATCH 1/2] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted
  by libaio

If kiocb_set_cancel_fn() is called for I/O submitted by io_uring, the
following kernel warning appears:

WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
  kiocb_set_cancel_fn+0x9c/0xa8
  ffs_epfile_read_iter+0x144/0x1d0
  io_read+0x19c/0x498
  io_issue_sqe+0x118/0x27c
  io_submit_sqes+0x25c/0x5fc
  __arm64_sys_io_uring_enter+0x104/0xab0
  invoke_syscall+0x58/0x11c
  el0_svc_common+0xb4/0xf4
  do_el0_svc+0x2c/0xb0
  el0_svc+0x2c/0xa4
  el0t_64_sync_handler+0x68/0xb4
  el0t_64_sync+0x1a4/0x1a8

Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is
submitted by libaio.

Suggested-by: Jens Axboe <axboe@kernel.dk>
---
  fs/aio.c           | 9 ++++++++-
  include/linux/fs.h | 2 ++
  2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..da18dbcfcb22 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
  	struct kioctx *ctx = req->ki_ctx;
  	unsigned long flags;

+	/*
+	 * kiocb didn't come from aio or is neither a read nor a write, hence
+	 * ignore it.
+	 */
+	if (!(iocb->ki_flags & IOCB_AIO_RW))
+		return;
+
  	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
  		return;

@@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
  	req->ki_complete = aio_complete_rw;
  	req->private = NULL;
  	req->ki_pos = iocb->aio_offset;
-	req->ki_flags = req->ki_filp->f_iocb_flags;
+	req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
  	if (iocb->aio_flags & IOCB_FLAG_RESFD)
  		req->ki_flags |= IOCB_EVENTFD;
  	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..c2dcc98cb4c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@ enum rw_hint {
   * unrelated IO (like cache flushing, new IO generation, etc).
   */
  #define IOCB_DIO_CALLER_COMP	(1 << 22)
+/* kiocb is a read or write operation submitted by fs/aio.c. */
+#define IOCB_AIO_RW		(1 << 23)

  /* for use in trace events */
  #define TRACE_IOCB_STRINGS \

-----------------------------------------------------------------------------

[PATCH 2/2] fs/aio: Make io_cancel() generate completions again

The following patch accidentally removed the code for delivering
completions for cancelled reads and writes to user space: "[PATCH 04/33]
aio: remove retry-based AIO"
(https://lore.kernel.org/all/1363883754-27966-5-git-send-email-koverstreet@google.com/)
From that patch:

-	if (kiocbIsCancelled(iocb)) {
-		ret = -EINTR;
-		aio_complete(iocb, ret, 0);
-		/* must not access the iocb after this */
-		goto out;
-	}

This leads to a leak in user space of a struct iocb. Hence this patch
that restores the code that reports to user space that a read or write
has been cancelled successfully.
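
In user-space terms, the behavior this patch restores looks roughly like
this (an untested sketch using libaio; the setup and submission code is
assumed):

#include <libaio.h>

static void cancel_one(io_context_t ctx, struct iocb *iocb)
{
	struct io_event ev;

	/*
	 * Before this patch: io_cancel() returns -EINPROGRESS, yet no
	 * io_event for the cancelled read or write is ever placed in
	 * the completion ring, so the application never learns when it
	 * may free or reuse the iocb (the leak described above).
	 *
	 * With this patch: io_cancel() returns 0 and an -EINTR
	 * completion is delivered through the ring as usual.
	 */
	io_cancel(ctx, iocb, &ev);
}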
---
  fs/aio.c | 27 +++++++++++----------------
  1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index da18dbcfcb22..28223f511931 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2165,14 +2165,11 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
  #endif

  /* sys_io_cancel:
- *	Attempts to cancel an iocb previously passed to io_submit.  If
- *	the operation is successfully cancelled, the resulting event is
- *	copied into the memory pointed to by result without being placed
- *	into the completion queue and 0 is returned.  May fail with
- *	-EFAULT if any of the data structures pointed to are invalid.
- *	May fail with -EINVAL if aio_context specified by ctx_id is
- *	invalid.  May fail with -EAGAIN if the iocb specified was not
- *	cancelled.  Will fail with -ENOSYS if not implemented.
+ *	Attempts to cancel an iocb previously passed to io_submit(). If the
+ *	operation is successfully cancelled 0 is returned. May fail with
+ *	-EFAULT if any of the data structures pointed to are invalid. May
+ *	fail with -EINVAL if aio_context specified by ctx_id is invalid. Will
+ *	fail with -ENOSYS if not implemented.
   */
  SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  		struct io_event __user *, result)
@@ -2203,14 +2200,12 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  	}
  	spin_unlock_irq(&ctx->ctx_lock);

-	if (!ret) {
-		/*
-		 * The result argument is no longer used - the io_event is
-		 * always delivered via the ring buffer. -EINPROGRESS indicates
-		 * cancellation is progress:
-		 */
-		ret = -EINPROGRESS;
-	}
+	/*
+	 * The result argument is no longer used - the io_event is always
+	 * delivered via the ring buffer.
+	 */
+	if (ret == 0 && kiocb->rw.ki_flags & IOCB_AIO_RW)
+		aio_complete_rw(&kiocb->rw, -EINTR);

  	percpu_ref_put(&ctx->users);


-----------------------------------------------------------------------------
Christian Brauner Feb. 16, 2024, 3 p.m. UTC | #9
On Tue, Feb 13, 2024 at 01:01:51PM -0800, Bart Van Assche wrote:
> On 2/12/24 11:28, Bart Van Assche wrote:
> > On 2/9/24 10:12, Jens Axboe wrote:
> > > Greg, can you elaborate on how useful cancel is for gadgets? Is it one
> > > of those things that was wired up "just because", or does it have
> > > actually useful cases?
> > 
> > I found two use cases in the Android Open Source Project and have submitted
> > CLs that request to remove the io_cancel() calls from that code. Although I
> > think I understand why these calls were added, the race conditions that
> > these io_cancel() calls try to address cannot be addressed completely by
> > calling io_cancel().
> 
> (replying to my own e-mail) The adb daemon (adbd) maintainers asked me to
> preserve the I/O cancellation code in adbd because it was introduced recently
> in that code to fix an important bug. Does everyone agree with the approach of
> the untested patches below?

But I mean, the cancellation code has seemingly been broken since
forever according to your patch description. So their fix which relies
on it actually fixes nothing? And they seemingly didn't notice until you
told them that it's broken. Can we get a link to that stuff, please?

Btw, you should probably provide that context in the patch series you
sent that Christoph and I responded to. Because I just stumbled upon
this here and it provided context I was missing.

Patch

diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 6bff6cb93789..4837e3071263 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -31,7 +31,6 @@ 
 #include <linux/usb/composite.h>
 #include <linux/usb/functionfs.h>
 
-#include <linux/aio.h>
 #include <linux/kthread.h>
 #include <linux/poll.h>
 #include <linux/eventfd.h>
@@ -1157,23 +1156,16 @@  ffs_epfile_open(struct inode *inode, struct file *file)
 	return stream_open(inode, file);
 }
 
-static int ffs_aio_cancel(struct kiocb *kiocb)
+static void ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
 {
 	struct ffs_io_data *io_data = kiocb->private;
 	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
 	unsigned long flags;
-	int value;
 
 	spin_lock_irqsave(&epfile->ffs->eps_lock, flags);
-
 	if (io_data && io_data->ep && io_data->req)
-		value = usb_ep_dequeue(io_data->ep, io_data->req);
-	else
-		value = -EINVAL;
-
+		usb_ep_dequeue(io_data->ep, io_data->req);
 	spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags);
-
-	return value;
 }
 
 static ssize_t ffs_epfile_write_iter(struct kiocb *kiocb, struct iov_iter *from)
@@ -1198,9 +1190,6 @@  static ssize_t ffs_epfile_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 
 	kiocb->private = p;
 
-	if (p->aio)
-		kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
-
 	res = ffs_epfile_io(kiocb->ki_filp, p);
 	if (res == -EIOCBQUEUED)
 		return res;
@@ -1242,9 +1231,6 @@  static ssize_t ffs_epfile_read_iter(struct kiocb *kiocb, struct iov_iter *to)
 
 	kiocb->private = p;
 
-	if (p->aio)
-		kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
-
 	res = ffs_epfile_io(kiocb->ki_filp, p);
 	if (res == -EIOCBQUEUED)
 		return res;
@@ -1356,6 +1342,7 @@  static const struct file_operations ffs_epfile_operations = {
 	.release =	ffs_epfile_release,
 	.unlocked_ioctl =	ffs_epfile_ioctl,
 	.compat_ioctl = compat_ptr_ioctl,
+	.cancel_kiocb = ffs_epfile_cancel_kiocb,
 };
 
 
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 03179b1880fd..c2cf7fca6937 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -22,7 +22,6 @@ 
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/kthread.h>
-#include <linux/aio.h>
 #include <linux/uio.h>
 #include <linux/refcount.h>
 #include <linux/delay.h>
@@ -446,23 +445,18 @@  struct kiocb_priv {
 	unsigned		actual;
 };
 
-static int ep_aio_cancel(struct kiocb *iocb)
+static void ep_cancel_kiocb(struct kiocb *iocb)
 {
 	struct kiocb_priv	*priv = iocb->private;
 	struct ep_data		*epdata;
-	int			value;
 
 	local_irq_disable();
 	epdata = priv->epdata;
 	// spin_lock(&epdata->dev->lock);
 	if (likely(epdata && epdata->ep && priv->req))
-		value = usb_ep_dequeue (epdata->ep, priv->req);
-	else
-		value = -EINVAL;
+		usb_ep_dequeue(epdata->ep, priv->req);
 	// spin_unlock(&epdata->dev->lock);
 	local_irq_enable();
-
-	return value;
 }
 
 static void ep_user_copy_worker(struct work_struct *work)
@@ -537,7 +531,6 @@  static ssize_t ep_aio(struct kiocb *iocb,
 	iocb->private = priv;
 	priv->iocb = iocb;
 
-	kiocb_set_cancel_fn(iocb, ep_aio_cancel);
 	get_ep(epdata);
 	priv->epdata = epdata;
 	priv->actual = 0;
@@ -709,6 +702,7 @@  static const struct file_operations ep_io_operations = {
 	.unlocked_ioctl = ep_ioctl,
 	.read_iter =	ep_read_iter,
 	.write_iter =	ep_write_iter,
+	.cancel_kiocb = ep_cancel_kiocb,
 };
 
 /* ENDPOINT INITIALIZATION
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..9dc0be703aa6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -203,7 +203,6 @@  struct aio_kiocb {
 	};
 
 	struct kioctx		*ki_ctx;
-	kiocb_cancel_fn		*ki_cancel;
 
 	struct io_event		ki_res;
 
@@ -211,6 +210,8 @@  struct aio_kiocb {
 						 * for cancellation */
 	refcount_t		ki_refcnt;
 
+	u16			ki_opcode;	/* IOCB_CMD_* */
+
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
 	 * this is the underlying eventfd context to deliver events to.
@@ -587,22 +588,6 @@  static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
-{
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
-	struct kioctx *ctx = req->ki_ctx;
-	unsigned long flags;
-
-	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
-		return;
-
-	spin_lock_irqsave(&ctx->ctx_lock, flags);
-	list_add_tail(&req->ki_list, &ctx->active_reqs);
-	req->ki_cancel = cancel;
-	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
-}
-EXPORT_SYMBOL(kiocb_set_cancel_fn);
-
 /*
  * free_ioctx() should be RCU delayed to synchronize against the RCU
  * protected lookup_ioctx() and also needs process context to call
@@ -634,6 +619,8 @@  static void free_ioctx_reqs(struct percpu_ref *ref)
 	queue_rcu_work(system_wq, &ctx->free_rwork);
 }
 
+static void aio_cancel_and_del(struct aio_kiocb *req);
+
 /*
  * When this function runs, the kioctx has been removed from the "hash table"
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
@@ -649,8 +636,7 @@  static void free_ioctx_users(struct percpu_ref *ref)
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
 				       struct aio_kiocb, ki_list);
-		req->ki_cancel(&req->rw);
-		list_del_init(&req->ki_list);
+		aio_cancel_and_del(req);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -1552,6 +1538,24 @@  static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
 }
 
+static void aio_add_rw_to_active_reqs(struct kiocb *req)
+{
+	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
+	struct kioctx *ctx = aio->ki_ctx;
+	unsigned long flags;
+
+	/*
+	 * If the .cancel_kiocb() callback has been set, add the request
+	 * to the list of active requests.
+	 */
+	if (!req->ki_filp->f_op->cancel_kiocb)
+		return;
+
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	list_add_tail(&aio->ki_list, &ctx->active_reqs);
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+
 static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
 {
 	switch (ret) {
@@ -1593,8 +1597,10 @@  static int aio_read(struct kiocb *req, const struct iocb *iocb,
 	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
-	if (!ret)
+	if (!ret) {
+		aio_add_rw_to_active_reqs(req);
 		aio_rw_done(req, call_read_iter(file, req, &iter));
+	}
 	kfree(iovec);
 	return ret;
 }
@@ -1622,6 +1628,7 @@  static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
+		aio_add_rw_to_active_reqs(req);
 		if (S_ISREG(file_inode(file)->i_mode))
 			kiocb_start_write(req);
 		req->ki_flags |= IOCB_WRITE;
@@ -1715,6 +1722,54 @@  static void poll_iocb_unlock_wq(struct poll_iocb *req)
 	rcu_read_unlock();
 }
 
+/* Must be called only for IOCB_CMD_POLL requests. */
+static void aio_poll_cancel(struct aio_kiocb *aiocb)
+{
+	struct poll_iocb *req = &aiocb->poll;
+	struct kioctx *ctx = aiocb->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	if (!poll_iocb_lock_wq(req))
+		return;
+
+	WRITE_ONCE(req->cancelled, true);
+	if (!req->work_scheduled) {
+		schedule_work(&aiocb->poll.work);
+		req->work_scheduled = true;
+	}
+	poll_iocb_unlock_wq(req);
+}
+
+static void aio_cancel_and_del(struct aio_kiocb *req)
+{
+	void (*cancel_kiocb)(struct kiocb *) =
+		req->rw.ki_filp->f_op->cancel_kiocb;
+	struct kioctx *ctx = req->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	switch (req->ki_opcode) {
+	case IOCB_CMD_PREAD:
+	case IOCB_CMD_PWRITE:
+	case IOCB_CMD_PREADV:
+	case IOCB_CMD_PWRITEV:
+		if (cancel_kiocb)
+			cancel_kiocb(&req->rw);
+		break;
+	case IOCB_CMD_FSYNC:
+	case IOCB_CMD_FDSYNC:
+		break;
+	case IOCB_CMD_POLL:
+		aio_poll_cancel(req);
+		break;
+	default:
+		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
+	}
+
+	list_del_init(&req->ki_list);
+}
+
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1727,11 +1782,11 @@  static void aio_poll_complete_work(struct work_struct *work)
 		mask = vfs_poll(req->file, &pt) & req->events;
 
 	/*
-	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
-	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
-	 * synchronize with them.  In the cancellation case the list_del_init
-	 * itself is not actually needed, but harmless so we keep it in to
-	 * avoid further branches in the fast path.
+	 * aio_cancel_and_del() deletes the iocb from the active_reqs list. We
+	 * need the ctx_lock here to synchronize with aio_cancel_and_del(). In
+	 * the cancellation case the list_del_init itself is not actually
+	 * needed, but harmless so we keep it in to avoid further branches in
+	 * the fast path.
 	 */
 	spin_lock_irq(&ctx->ctx_lock);
 	if (poll_iocb_lock_wq(req)) {
@@ -1760,24 +1815,6 @@  static void aio_poll_complete_work(struct work_struct *work)
 	iocb_put(iocb);
 }
 
-/* assumes we are called with irqs disabled */
-static int aio_poll_cancel(struct kiocb *iocb)
-{
-	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
-	struct poll_iocb *req = &aiocb->poll;
-
-	if (poll_iocb_lock_wq(req)) {
-		WRITE_ONCE(req->cancelled, true);
-		if (!req->work_scheduled) {
-			schedule_work(&aiocb->poll.work);
-			req->work_scheduled = true;
-		}
-		poll_iocb_unlock_wq(req);
-	} /* else, the request was force-cancelled by POLLFREE already */
-
-	return 0;
-}
-
 static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		void *key)
 {
@@ -1945,7 +1982,6 @@  static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 			 * active_reqs so that it can be cancelled if needed.
 			 */
 			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
-			aiocb->ki_cancel = aio_poll_cancel;
 		}
 		if (on_queue)
 			poll_iocb_unlock_wq(req);
@@ -1993,6 +2029,8 @@  static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 	req->ki_res.res = 0;
 	req->ki_res.res2 = 0;
 
+	req->ki_opcode = iocb->aio_lio_opcode;
+
 	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
 		return aio_read(&req->rw, iocb, false, compat);
@@ -2189,8 +2227,7 @@  SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
 		if (kiocb->ki_res.obj == obj) {
-			ret = kiocb->ki_cancel(&kiocb->rw);
-			list_del_init(&kiocb->ki_list);
+			aio_cancel_and_del(kiocb);
 			break;
 		}
 	}
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 86892a4fe7c8..2aa6d0be3171 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -4,20 +4,13 @@ 
 
 #include <linux/aio_abi.h>
 
-struct kioctx;
-struct kiocb;
 struct mm_struct;
 
-typedef int (kiocb_cancel_fn)(struct kiocb *);
-
 /* prototypes */
 #ifdef CONFIG_AIO
 extern void exit_aio(struct mm_struct *mm);
-void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
 #else
 static inline void exit_aio(struct mm_struct *mm) { }
-static inline void kiocb_set_cancel_fn(struct kiocb *req,
-				       kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
 #endif /* __LINUX__AIO_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..7ec714878637 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2021,6 +2021,7 @@  struct file_operations {
 	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
 				unsigned int poll_flags);
+	void (*cancel_kiocb)(struct kiocb *);
 } __randomize_layout;
 
 /* Wrap a directory iterator that needs exclusive inode access */