
[2/2] nvme: use blk-mq polling for uring commands

Message ID 20230324212803.1837554-2-kbusch@meta.com (mailing list archive)
State New, archived
Series [1/2] blk-mq: export request polling helpers

Commit Message

Keith Busch March 24, 2023, 9:28 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

The first advantage is that unshared and multipath namespaces can use
the same polling callback.

The other advantage is that we don't need a bio payload in order to
poll, allowing commands like 'flush' and 'write zeroes' to be submitted
on the same high priority queue as read and write commands.

This can also allow for a future optimization for the driver since we no
longer need to create special hidden block devices to back nvme-generic
char dev's with unsupported command sets.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/ioctl.c     | 79 ++++++++++++-----------------------
 drivers/nvme/host/multipath.c |  2 +-
 drivers/nvme/host/nvme.h      |  2 -
 3 files changed, 28 insertions(+), 55 deletions(-)

Comments

kernel test robot March 25, 2023, 2:50 a.m. UTC | #1
Hi Keith,

I love your patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[cannot apply to linus/master v6.3-rc3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/nvme-use-blk-mq-polling-for-uring-commands/20230325-052914
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20230324212803.1837554-2-kbusch%40meta.com
patch subject: [PATCH 2/2] nvme: use blk-mq polling for uring commands
config: m68k-allmodconfig (https://download.01.org/0day-ci/archive/20230325/202303251022.c5FPQHwt-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/62483898b54af99f656f5a16a65ff26940b817a8
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Keith-Busch/nvme-use-blk-mq-polling-for-uring-commands/20230325-052914
        git checkout 62483898b54af99f656f5a16a65ff26940b817a8
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303251022.c5FPQHwt-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "blk_queue_exit" [drivers/nvme/host/nvme-core.ko] undefined!
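
The undefined reference is the new nvme_ns_chr_uring_cmd_iopoll() calling
blk_queue_exit() from the modular nvme-core while that symbol is not
exported. A minimal sketch of the extra change patch 1/2 would need (an
assumption, since that patch is not shown in this thread):

    /* block/blk-core.c already defines the put side of q_usage_counter: */
    void blk_queue_exit(struct request_queue *q)
    {
            percpu_ref_put(&q->q_usage_counter);
    }

    /* sketch: export it, and add a declaration in a driver-visible header
     * such as include/linux/blk-mq.h, so modular nvme-core can link */
    EXPORT_SYMBOL_GPL(blk_queue_exit);
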
Sagi Grimberg March 26, 2023, 1:01 p.m. UTC | #2
On 3/25/23 00:28, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The first advantage is that unshared and multipath namespaces can use
> the same polling callback.
> 
> The other advantage is that we don't need a bio payload in order to
> poll, allowing commands like 'flush' and 'write zeroes' to be submitted
> on the same high priority queue as read and write commands.
> 
> This can also allow for a future optimization for the driver since we no
> longer need to create special hidden block devices to back nvme-generic
> char dev's with unsupported command sets.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/nvme/host/ioctl.c     | 79 ++++++++++++-----------------------
>   drivers/nvme/host/multipath.c |  2 +-
>   drivers/nvme/host/nvme.h      |  2 -
>   3 files changed, 28 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 723e7d5b778f2..369e8519b87a2 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -503,7 +503,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
>   {
>   	struct io_uring_cmd *ioucmd = req->end_io_data;
>   	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> -	void *cookie = READ_ONCE(ioucmd->cookie);
>   
>   	req->bio = pdu->bio;
>   	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> @@ -516,9 +515,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
>   	 * For iopoll, complete it directly.
>   	 * Otherwise, move the completion to task work.
>   	 */
> -	if (cookie != NULL && blk_rq_is_poll(req))
> +	if (blk_rq_is_poll(req)) {
> +		WRITE_ONCE(ioucmd->cookie, NULL);
>   		nvme_uring_task_cb(ioucmd);
> -	else
> +	} else
>   		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
>   
>   	return RQ_END_IO_FREE;
> @@ -529,7 +529,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
>   {
>   	struct io_uring_cmd *ioucmd = req->end_io_data;
>   	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> -	void *cookie = READ_ONCE(ioucmd->cookie);
>   
>   	req->bio = pdu->bio;
>   	pdu->req = req;
> @@ -538,9 +537,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
>   	 * For iopoll, complete it directly.
>   	 * Otherwise, move the completion to task work.
>   	 */
> -	if (cookie != NULL && blk_rq_is_poll(req))
> +	if (blk_rq_is_poll(req)) {
> +		WRITE_ONCE(ioucmd->cookie, NULL);
>   		nvme_uring_task_meta_cb(ioucmd);
> -	else
> +	} else
>   		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
>   
>   	return RQ_END_IO_NONE;
> @@ -597,7 +597,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>   	if (issue_flags & IO_URING_F_IOPOLL)
>   		rq_flags |= REQ_POLLED;
>   
> -retry:
>   	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
>   	if (IS_ERR(req))
>   		return PTR_ERR(req);
> @@ -611,17 +610,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>   			return ret;
>   	}
>   
> -	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
> -		if (unlikely(!req->bio)) {
> -			/* we can't poll this, so alloc regular req instead */
> -			blk_mq_free_request(req);
> -			rq_flags &= ~REQ_POLLED;
> -			goto retry;
> -		} else {
> -			WRITE_ONCE(ioucmd->cookie, req->bio);
> -			req->bio->bi_opf |= REQ_POLLED;
> -		}
> -	}
> +	if (blk_rq_is_poll(req))
> +		WRITE_ONCE(ioucmd->cookie, req);

Why aren't we always setting the cookie to point at the req?
Kanchan Joshi March 27, 2023, 1:58 p.m. UTC | #3
On Fri, Mar 24, 2023 at 02:28:03PM -0700, Keith Busch wrote:
>From: Keith Busch <kbusch@kernel.org>
>
>The first advantage is that unshared and multipath namespaces can use
>the same polling callback.
>
>The other advantage is that we don't need a bio payload in order to
>poll, allowing commands like 'flush' and 'write zeroes' to be submitted
>on the same high priority queue as read and write commands.
>
>This can also allow for a future optimization for the driver since we no
>longer need to create special hidden block devices to back nvme-generic
>char dev's with unsupported command sets.
>
>Signed-off-by: Keith Busch <kbusch@kernel.org>
>---
> drivers/nvme/host/ioctl.c     | 79 ++++++++++++-----------------------
> drivers/nvme/host/multipath.c |  2 +-
> drivers/nvme/host/nvme.h      |  2 -
> 3 files changed, 28 insertions(+), 55 deletions(-)
>
>diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
>index 723e7d5b778f2..369e8519b87a2 100644
>--- a/drivers/nvme/host/ioctl.c
>+++ b/drivers/nvme/host/ioctl.c
>@@ -503,7 +503,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
> {
> 	struct io_uring_cmd *ioucmd = req->end_io_data;
> 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>-	void *cookie = READ_ONCE(ioucmd->cookie);
>
> 	req->bio = pdu->bio;
> 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
>@@ -516,9 +515,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
> 	 * For iopoll, complete it directly.
> 	 * Otherwise, move the completion to task work.
> 	 */
>-	if (cookie != NULL && blk_rq_is_poll(req))
>+	if (blk_rq_is_poll(req)) {
>+		WRITE_ONCE(ioucmd->cookie, NULL);
> 		nvme_uring_task_cb(ioucmd);
>-	else
>+	} else
> 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
>
> 	return RQ_END_IO_FREE;
>@@ -529,7 +529,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
> {
> 	struct io_uring_cmd *ioucmd = req->end_io_data;
> 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>-	void *cookie = READ_ONCE(ioucmd->cookie);
>
> 	req->bio = pdu->bio;
> 	pdu->req = req;
>@@ -538,9 +537,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
> 	 * For iopoll, complete it directly.
> 	 * Otherwise, move the completion to task work.
> 	 */
>-	if (cookie != NULL && blk_rq_is_poll(req))
>+	if (blk_rq_is_poll(req)) {
>+		WRITE_ONCE(ioucmd->cookie, NULL);
> 		nvme_uring_task_meta_cb(ioucmd);
>-	else
>+	} else
> 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
>
> 	return RQ_END_IO_NONE;
>@@ -597,7 +597,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> 	if (issue_flags & IO_URING_F_IOPOLL)
> 		rq_flags |= REQ_POLLED;
>
>-retry:
> 	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
> 	if (IS_ERR(req))
> 		return PTR_ERR(req);
>@@ -611,17 +610,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> 			return ret;
> 	}
>
>-	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
>-		if (unlikely(!req->bio)) {
>-			/* we can't poll this, so alloc regular req instead */
>-			blk_mq_free_request(req);
>-			rq_flags &= ~REQ_POLLED;
>-			goto retry;
>-		} else {
>-			WRITE_ONCE(ioucmd->cookie, req->bio);
>-			req->bio->bi_opf |= REQ_POLLED;
>-		}
>-	}
>+	if (blk_rq_is_poll(req))
>+		WRITE_ONCE(ioucmd->cookie, req);

blk_rq_is_poll(req) warns on a null "req->bio" and returns false in that
case. That defeats one of the purposes of the series, i.e. polling on
no-payload commands such as flush/write-zeroes.


> 	/* to free bio on completion, as req->bio will be null at that time */
> 	pdu->bio = req->bio;
> 	pdu->meta_len = d.metadata_len;
>@@ -780,18 +771,27 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
> 				 struct io_comp_batch *iob,
> 				 unsigned int poll_flags)
> {
>-	struct bio *bio;
>+	struct request *req;
> 	int ret = 0;
>-	struct nvme_ns *ns;
>-	struct request_queue *q;
>
>+	/*
>+	 * The rcu lock ensures the 'req' in the command cookie will not be
>+	 * freed until after the unlock. The queue must be frozen to free the
>+	 * request, and the freeze requires an rcu grace period. The cookie is
>+	 * cleared before the request is completed, so we're fine even if a
>+	 * competing polling thread completes this thread's request.
>+	 */
> 	rcu_read_lock();
>-	bio = READ_ONCE(ioucmd->cookie);
>-	ns = container_of(file_inode(ioucmd->file)->i_cdev,
>-			struct nvme_ns, cdev);
>-	q = ns->queue;
>-	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
>-		ret = bio_poll(bio, iob, poll_flags);
>+	req = READ_ONCE(ioucmd->cookie);
>+	if (req) {

This is risky. We are not sure the cookie is actually a "req" at this
moment. If the driver is loaded without poll queues, we will not be able
to set req into ioucmd->cookie during submission (in nvme_uring_cmd_io).
Therefore, the original code checked QUEUE_FLAG_POLL before treating
ioucmd->cookie as a bio here.

This should handle it (on top of your patch):

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 369e8519b87a..89eef5da4b5c 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -773,6 +773,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 {
        struct request *req;
        int ret = 0;
+       struct request_queue *q;
+       struct nvme_ns *ns;

        /*
         * The rcu lock ensures the 'req' in the command cookie will not be
@@ -783,8 +785,10 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
         */
        rcu_read_lock();
        req = READ_ONCE(ioucmd->cookie);
-       if (req) {
-               struct request_queue *q = req->q;
+       ns = container_of(file_inode(ioucmd->file)->i_cdev, struct nvme_ns,
+                       cdev);
+       q = ns->queue;
+       if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && req) {

                if (percpu_ref_tryget(&q->q_usage_counter)) {
                        ret = blk_mq_poll(q, blk_rq_to_qc(req), iob,
Keith Busch March 27, 2023, 3:20 p.m. UTC | #4
On Mon, Mar 27, 2023 at 07:28:10PM +0530, Kanchan Joshi wrote:
> > -	}
> > +	if (blk_rq_is_poll(req))
> > +		WRITE_ONCE(ioucmd->cookie, req);
> 
> blk_rq_is_poll(req) warns for null "req->bio" and returns false if that
> is the case. That defeats one of the purpose of the series i.e. poll on
> no-payload commands such as flush/write-zeroes.

Sorry, I'm sending out various patches piecemeal. This patch here depends on
this one sent out earlier:

  https://lore.kernel.org/linux-block/3f670ca7-908d-db55-3da1-4090f116005d@nvidia.com/T/#mbc6174ce3f9dbae38ae2ca646518be4bf105f6e4
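
For reference, blk_rq_is_poll() in include/linux/blk-mq.h looks roughly like
the sketch below; the linked prerequisite is what drops the bio requirement,
which in turn lets no-payload commands take the polled path (a sketch, not
the verbatim upstream helper):

    static inline bool blk_rq_is_poll(struct request *rq)
    {
            if (!rq->mq_hctx)
                    return false;
            if (rq->mq_hctx->type != HCTX_TYPE_POLL)
                    return false;
            /* the bio check (and WARN) that the prerequisite patch removes */
            if (WARN_ON_ONCE(!rq->bio))
                    return false;
            return true;
    }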

> > 	rcu_read_lock();
> > -	bio = READ_ONCE(ioucmd->cookie);
> > -	ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > -			struct nvme_ns, cdev);
> > -	q = ns->queue;
> > -	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > -		ret = bio_poll(bio, iob, poll_flags);
> > +	req = READ_ONCE(ioucmd->cookie);
> > +	if (req) {
> 
> This is risky. We are not sure if the cookie is actually "req" at this
> moment.

What else could it be? It's either a real request from a polled hctx tag, or
NULL at this point.

It's safe to check the cookie like this and rely on its contents. The queue's
hctx's can't change within an rcu section, and the cookie is cleared in the
completion path prior to the request being freed. In the worst case, we're
racing another polling thread completing our request while the hctx's are
simultaneously being renumbered, but the request and the current hctx it
points to are reliable if we see a non-NULL cookie.

> If driver is loaded without the poll-queues, we will not be able
> to set req into ioucmd->cookie during the submission (in
> nvme_uring_cmd_io). Therefore, the original code checked for QUEUE_FLAG_POLL
> before treating ioucmd->cookie as bio here.

You don't need to check the queue's FLAG_POLL after the request is allocated.
The user can't change this directly, and this flag can't be changed with
requests in flight, so checking blk_rq_is_poll() is the only thing we need to
rely on.

> This should handle it (on top of your patch):

This doesn't work with multipath.
Keith Busch March 27, 2023, 3:29 p.m. UTC | #5
On Sun, Mar 26, 2023 at 04:01:46PM +0300, Sagi Grimberg wrote:
> On 3/25/23 00:28, Keith Busch wrote:
> > +	if (blk_rq_is_poll(req))
> > +		WRITE_ONCE(ioucmd->cookie, req);
> 
> Why aren't we always setting the cookie to point at the req?

As in unconditionally set the cookie even for non-polled requests? We need to
see that the cookie is NULL to prevent polling interrupt queues when an
io_uring polled command is dispatched to a device without polled hctx's.
Kanchan Joshi March 27, 2023, 5:20 p.m. UTC | #6
On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Mon, Mar 27, 2023 at 07:28:10PM +0530, Kanchan Joshi wrote:
> > > -   }
> > > +   if (blk_rq_is_poll(req))
> > > +           WRITE_ONCE(ioucmd->cookie, req);
> >
> > blk_rq_is_poll(req) warns for null "req->bio" and returns false if that
> > is the case. That defeats one of the purpose of the series i.e. poll on
> > no-payload commands such as flush/write-zeroes.
>
> Sorry, I'm sending out various patches piecemeal. This patch here depends on
> this one sent out earlier:
>
>   https://lore.kernel.org/linux-block/3f670ca7-908d-db55-3da1-4090f116005d@nvidia.com/T/#mbc6174ce3f9dbae38ae2ca646518be4bf105f6e4

That clears it up, thanks.

> > >     rcu_read_lock();
> > > -   bio = READ_ONCE(ioucmd->cookie);
> > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > > -                   struct nvme_ns, cdev);
> > > -   q = ns->queue;
> > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > > -           ret = bio_poll(bio, iob, poll_flags);
> > > +   req = READ_ONCE(ioucmd->cookie);
> > > +   if (req) {
> >
> > This is risky. We are not sure if the cookie is actually "req" at this
> > moment.
>
> What else could it be? It's either a real request from a polled hctx tag, or
> NULL at this point.

It can also be a function pointer that gets assigned on irq-driven completion.
See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
and task_work_cb share the storage.

> It's safe to check the cookie like this and rely on its contents.
Hence it's not safe. Please try running this without poll-queues (at the
nvme level); you'll see failures.
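
For reference, the shared storage is a union in struct io_uring_cmd; roughly
(a sketch of the relevant part, exact prototypes and layout may differ):

    struct io_uring_cmd {
            struct file     *file;
            const void      *cmd;
            union {
                    /* set when an irq-driven completion is punted to task work */
                    void (*task_work_cb)(struct io_uring_cmd *cmd,
                                         unsigned issue_flags);
                    /* set at submission time for polled requests */
                    void *cookie;
            };
            u32             cmd_op;
            u32             flags;
            u8              pdu[32];        /* available inline for free use */
    };

If the request is not on a polled hctx, io_uring_cmd_complete_in_task()
stores the callback in that same slot, so reading it back as a request
pointer is exactly the failure mode described above.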
Keith Busch March 28, 2023, 12:48 a.m. UTC | #7
On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
> On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
> > > >     rcu_read_lock();
> > > > -   bio = READ_ONCE(ioucmd->cookie);
> > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > > > -                   struct nvme_ns, cdev);
> > > > -   q = ns->queue;
> > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > > > -           ret = bio_poll(bio, iob, poll_flags);
> > > > +   req = READ_ONCE(ioucmd->cookie);
> > > > +   if (req) {
> > >
> > > This is risky. We are not sure if the cookie is actually "req" at this
> > > moment.
> >
> > What else could it be? It's either a real request from a polled hctx tag, or
> > NULL at this point.
> 
> It can also be a function pointer that gets assigned on irq-driven completion.
> See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
> and task_work_cb share the storage.
> 
> > It's safe to check the cookie like this and rely on its contents.
> Hence not safe. Please try running this without poll-queues (at nvme
> level), you'll see failures.

Okay, you have an io_uring polling instance used with a file that has poll
capabilities but doesn't have any polling hctx's. It would be nice to exclude
these from io_uring's polling since they're wasting CPU time, but that doesn't
look easily done. This simple patch atop should work, though.

---
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 369e8519b87a2..e3ff019404816 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 
 	if (blk_rq_is_poll(req))
 		WRITE_ONCE(ioucmd->cookie, req);
+	else if (issue_flags & IO_URING_F_IOPOLL)
+		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
 
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
@@ -774,6 +776,9 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 	struct request *req;
 	int ret = 0;
 
+	if (ioucmd->flags & IORING_URING_CMD_NOPOLL)
+		return 0;
+
 	/*
 	 * The rcu lock ensures the 'req' in the command cookie will not be
 	 * freed until after the unlock. The queue must be frozen to free the
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 934e5dd4ccc08..2abf45491b3df 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -22,6 +22,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_IOPOLL		= (1 << 10),
 };
 
+#define IORING_URING_CMD_NOPOLL	(1U << 31)
+
 struct io_uring_cmd {
 	struct file	*file;
 	const void	*cmd;
--
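
To exercise the case being discussed (an IOPOLL ring against a controller
with no polled queues), something like the following should do, assuming the
PCI transport's poll_queues module parameter and a recent fio with the
io_uring_cmd engine (don't unload nvme if your root filesystem lives on it):

    # no polled hardware queues at the nvme level
    modprobe -r nvme
    modprobe nvme poll_queues=0

    # uring passthrough reads on the generic char dev via an IOPOLL ring
    fio --name=pt --ioengine=io_uring_cmd --cmd_type=nvme --hipri=1 \
        --filename=/dev/ng0n1 --rw=randread --bs=4k --iodepth=4 --size=64M
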
Kanchan Joshi March 28, 2023, 7:49 a.m. UTC | #8
On Mon, Mar 27, 2023 at 06:48:30PM -0600, Keith Busch wrote:
>On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
>> On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
>> > > >     rcu_read_lock();
>> > > > -   bio = READ_ONCE(ioucmd->cookie);
>> > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
>> > > > -                   struct nvme_ns, cdev);
>> > > > -   q = ns->queue;
>> > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
>> > > > -           ret = bio_poll(bio, iob, poll_flags);
>> > > > +   req = READ_ONCE(ioucmd->cookie);
>> > > > +   if (req) {
>> > >
>> > > This is risky. We are not sure if the cookie is actually "req" at this
>> > > moment.
>> >
>> > What else could it be? It's either a real request from a polled hctx tag, or
>> > NULL at this point.
>>
>> It can also be a function pointer that gets assigned on irq-driven completion.
>> See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
>> and task_work_cb share the storage.
>>
>> > It's safe to check the cookie like this and rely on its contents.
>> Hence not safe. Please try running this without poll-queues (at nvme
>> level), you'll see failures.
>
>Okay, you have a iouring polling instance used with a file that has poll
>capabilities, but doesn't have any polling hctx's. It would be nice to exclude
>these from io_uring's polling since they're wasting CPU time, but that doesn't
>look easily done.

Do you mean having the ring with IOPOLL set, and yet skipping the attempt
to actively reap completions for certain IOs?

>This simple patch atop should work though.
>
>---
>diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
>index 369e8519b87a2..e3ff019404816 100644
>--- a/drivers/nvme/host/ioctl.c
>+++ b/drivers/nvme/host/ioctl.c
>@@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>
> 	if (blk_rq_is_poll(req))
> 		WRITE_ONCE(ioucmd->cookie, req);
>+	else if (issue_flags & IO_URING_F_IOPOLL)
>+		ioucmd->flags |= IORING_URING_CMD_NOPOLL;

If IO_URING_F_IOPOLL had come here as part of "ioucmd->flags", we could
have just cleared it here. That would avoid the need for the NOPOLL flag.
That said, I don't feel strongly about the new flag either. You decide.
Sagi Grimberg March 28, 2023, 8:35 a.m. UTC | #9
>>> +	if (blk_rq_is_poll(req))
>>> +		WRITE_ONCE(ioucmd->cookie, req);
>>
>> Why aren't we always setting the cookie to point at the req?
> 
> As in unconditionally set the cookie even for non-polled requests? We need to
> see that the cookie is NULL to prevent polling interrupt queues when an
> io_uring polled command is dispatched to a device without polled hctx's.

Why don't we export that to an explicit flag? It would make it cleaner
I think.
Keith Busch March 28, 2023, 2:52 p.m. UTC | #10
On Tue, Mar 28, 2023 at 01:19:39PM +0530, Kanchan Joshi wrote:
> On Mon, Mar 27, 2023 at 06:48:30PM -0600, Keith Busch wrote:
> > On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
> > > On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
> > > > > >     rcu_read_lock();
> > > > > > -   bio = READ_ONCE(ioucmd->cookie);
> > > > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > > > > > -                   struct nvme_ns, cdev);
> > > > > > -   q = ns->queue;
> > > > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > > > > > -           ret = bio_poll(bio, iob, poll_flags);
> > > > > > +   req = READ_ONCE(ioucmd->cookie);
> > > > > > +   if (req) {
> > > > >
> > > > > This is risky. We are not sure if the cookie is actually "req" at this
> > > > > moment.
> > > >
> > > > What else could it be? It's either a real request from a polled hctx tag, or
> > > > NULL at this point.
> > > 
> > > It can also be a function pointer that gets assigned on irq-driven completion.
> > > See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
> > > and task_work_cb share the storage.
> > > 
> > > > It's safe to check the cookie like this and rely on its contents.
> > > Hence not safe. Please try running this without poll-queues (at nvme
> > > level), you'll see failures.
> > 
> > Okay, you have a iouring polling instance used with a file that has poll
> > capabilities, but doesn't have any polling hctx's. It would be nice to exclude
> > these from io_uring's polling since they're wasting CPU time, but that doesn't
> > look easily done.
> 
> Do you mean having the ring with IOPOLL set, and yet skip the attempt of
> actively reaping the completion for certain IOs?

Yes, exactly. It'd be great if non-polled requests didn't get added to the
ctx->iopoll_list in the first place.
 
> > This simple patch atop should work though.
> > 
> > ---
> > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> > index 369e8519b87a2..e3ff019404816 100644
> > --- a/drivers/nvme/host/ioctl.c
> > +++ b/drivers/nvme/host/ioctl.c
> > @@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> > 
> > 	if (blk_rq_is_poll(req))
> > 		WRITE_ONCE(ioucmd->cookie, req);
> > +	else if (issue_flags & IO_URING_F_IOPOLL)
> > +		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
> 
> If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
> could have just cleared that here. That would avoid the need of NOPOLL flag.
> That said, I don't feel strongly about new flag too. You decide.

IO_URING_F_IOPOLL, while named in an enum that sounds suspiciously like it is
part of ioucmd->flags, is actually a ctx state flag, so it's a little
confusing. And we need to be a little careful here: the existing ioucmd->flags
is used for uapi flags.
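
For reference, the two flag namespaces in play, as a sketch of the relevant
header bits (lower-bit issue-state flags omitted; IORING_URING_CMD_FIXED is
the single uapi bit in use at this point):

    /* include/linux/io_uring.h: per-issue flags passed to ->uring_cmd().
     * IO_URING_F_IOPOLL reflects IORING_SETUP_IOPOLL on the ring (ctx
     * state); it is never stored in ioucmd->flags. */
    enum io_uring_cmd_flags {
            /* ... lower-bit issue-state flags omitted ... */
            IO_URING_F_SQE128       = (1 << 8),
            IO_URING_F_CQE32        = (1 << 9),
            IO_URING_F_IOPOLL       = (1 << 10),
    };

    /* include/uapi/linux/io_uring.h: sqe->uring_cmd_flags, copied into
     * ioucmd->flags at prep time and rejected if unknown bits are set. */
    #define IORING_URING_CMD_FIXED  (1U << 0)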
Kanchan Joshi March 29, 2023, 8:46 a.m. UTC | #11
On Tue, Mar 28, 2023 at 08:52:53AM -0600, Keith Busch wrote:
>On Tue, Mar 28, 2023 at 01:19:39PM +0530, Kanchan Joshi wrote:
>> On Mon, Mar 27, 2023 at 06:48:30PM -0600, Keith Busch wrote:
>> > On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
>> > > On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
>> > > > > >     rcu_read_lock();
>> > > > > > -   bio = READ_ONCE(ioucmd->cookie);
>> > > > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
>> > > > > > -                   struct nvme_ns, cdev);
>> > > > > > -   q = ns->queue;
>> > > > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
>> > > > > > -           ret = bio_poll(bio, iob, poll_flags);
>> > > > > > +   req = READ_ONCE(ioucmd->cookie);
>> > > > > > +   if (req) {
>> > > > >
>> > > > > This is risky. We are not sure if the cookie is actually "req" at this
>> > > > > moment.
>> > > >
>> > > > What else could it be? It's either a real request from a polled hctx tag, or
>> > > > NULL at this point.
>> > >
>> > > It can also be a function pointer that gets assigned on irq-driven completion.
>> > > See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
>> > > and task_work_cb share the storage.
>> > >
>> > > > It's safe to check the cookie like this and rely on its contents.
>> > > Hence not safe. Please try running this without poll-queues (at nvme
>> > > level), you'll see failures.
>> >
>> > Okay, you have a iouring polling instance used with a file that has poll
>> > capabilities, but doesn't have any polling hctx's. It would be nice to exclude
>> > these from io_uring's polling since they're wasting CPU time, but that doesn't
>> > look easily done.
>>
>> Do you mean having the ring with IOPOLL set, and yet skip the attempt of
>> actively reaping the completion for certain IOs?
>
>Yes, exactly. It'd be great if non-polled requests don't get added to the
>ctx->iopoll_list in the first place.
>
>> > This simple patch atop should work though.
>> >
>> > ---
>> > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
>> > index 369e8519b87a2..e3ff019404816 100644
>> > --- a/drivers/nvme/host/ioctl.c
>> > +++ b/drivers/nvme/host/ioctl.c
>> > @@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>> >
>> > 	if (blk_rq_is_poll(req))
>> > 		WRITE_ONCE(ioucmd->cookie, req);
>> > +	else if (issue_flags & IO_URING_F_IOPOLL)
>> > +		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
>>
>> If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
>> could have just cleared that here. That would avoid the need of NOPOLL flag.
>> That said, I don't feel strongly about new flag too. You decide.
>
>IO_URING_F_IOPOLL, while named in an enum that sounds suspiciouly like it is
>part of ioucmd->flags, is actually ctx flags, so a little confusing. And we
>need to be a litle careful here: the existing ioucmd->flags is used with uapi
>flags.

Indeed. If this is getting crufty, the series can just enable polling on
no-payload requests. Reducing the nvme handlers can wait for another day.
Keith Busch March 29, 2023, 4:11 p.m. UTC | #12
On Wed, Mar 29, 2023 at 02:16:18PM +0530, Kanchan Joshi wrote:
> On Tue, Mar 28, 2023 at 08:52:53AM -0600, Keith Busch wrote:
> > > > +	else if (issue_flags & IO_URING_F_IOPOLL)
> > > > +		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
> > > 
> > > If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
> > > could have just cleared that here. That would avoid the need of NOPOLL flag.
> > > That said, I don't feel strongly about new flag too. You decide.
> > 
> > IO_URING_F_IOPOLL, while named in an enum that sounds suspiciouly like it is
> > part of ioucmd->flags, is actually ctx flags, so a little confusing. And we
> > need to be a litle careful here: the existing ioucmd->flags is used with uapi
> > flags.
> 
> Indeed. If this is getting crufty, series can just enable polling on
> no-payload requests. Reducing nvme handlers - for another day.

Well, something needs to be done about multipath since it's broken today: if
the path changes between submission and poll, we'll consult the wrong queue
for whether polling is enabled. This could cause a missed polling opportunity,
polling a pointer that isn't a bio, or polling an irq-enabled cq. All are bad.
Kanchan Joshi April 3, 2023, 12:42 p.m. UTC | #13
On Wed, Mar 29, 2023 at 9:41 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Wed, Mar 29, 2023 at 02:16:18PM +0530, Kanchan Joshi wrote:
> > On Tue, Mar 28, 2023 at 08:52:53AM -0600, Keith Busch wrote:
> > > > > +       else if (issue_flags & IO_URING_F_IOPOLL)
> > > > > +               ioucmd->flags |= IORING_URING_CMD_NOPOLL;
> > > >
> > > > If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
> > > > could have just cleared that here. That would avoid the need of NOPOLL flag.
> > > > That said, I don't feel strongly about new flag too. You decide.
> > >
> > > IO_URING_F_IOPOLL, while named in an enum that sounds suspiciouly like it is
> > > part of ioucmd->flags, is actually ctx flags, so a little confusing. And we
> > > need to be a litle careful here: the existing ioucmd->flags is used with uapi
> > > flags.
> >
> > Indeed. If this is getting crufty, series can just enable polling on
> > no-payload requests. Reducing nvme handlers - for another day.
>
> Well something needs to be done about multipath since it's broken today: if the
> path changes between submission and poll, we'll consult the wrong queue for
> polling enabled. This could cause a missed polling opprotunity, polling a
> pointer that isn't a bio, or poll an irq enabled cq. All are bad.

I see, that is because "nvme_find_path" was used.
How about using the top 4 bits of "ioucmd->flags" for internal flags?
That means we can support a maximum of 28 flags for "sqe->uring_cmd_flags".
That is perhaps not too bad, given that we use only one bit at the moment?
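
A sketch of that partitioning idea (the names and the helper below are
hypothetical; only IORING_URING_CMD_FIXED exists in the uapi at this point):

    /* sqe->uring_cmd_flags: keep low bits uapi, claim top bits as internal */
    #define IORING_URING_CMD_FIXED          (1U << 0)       /* existing uapi bit */
    #define IORING_URING_CMD_POLLED         (1U << 31)      /* hypothetical internal bit */

    /* prep-side check keeps the internal bits unreachable from userspace */
    static int uring_cmd_flags_valid(u32 flags)
    {
            const u32 uapi_mask = IORING_URING_CMD_FIXED;

            return (flags & ~uapi_mask) ? -EINVAL : 0;
    }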

Patch

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 723e7d5b778f2..369e8519b87a2 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -503,7 +503,6 @@  static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	void *cookie = READ_ONCE(ioucmd->cookie);
 
 	req->bio = pdu->bio;
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -516,9 +515,10 @@  static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	 * For iopoll, complete it directly.
 	 * Otherwise, move the completion to task work.
 	 */
-	if (cookie != NULL && blk_rq_is_poll(req))
+	if (blk_rq_is_poll(req)) {
+		WRITE_ONCE(ioucmd->cookie, NULL);
 		nvme_uring_task_cb(ioucmd);
-	else
+	} else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
 
 	return RQ_END_IO_FREE;
@@ -529,7 +529,6 @@  static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	void *cookie = READ_ONCE(ioucmd->cookie);
 
 	req->bio = pdu->bio;
 	pdu->req = req;
@@ -538,9 +537,10 @@  static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
 	 * For iopoll, complete it directly.
 	 * Otherwise, move the completion to task work.
 	 */
-	if (cookie != NULL && blk_rq_is_poll(req))
+	if (blk_rq_is_poll(req)) {
+		WRITE_ONCE(ioucmd->cookie, NULL);
 		nvme_uring_task_meta_cb(ioucmd);
-	else
+	} else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
 
 	return RQ_END_IO_NONE;
@@ -597,7 +597,6 @@  static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (issue_flags & IO_URING_F_IOPOLL)
 		rq_flags |= REQ_POLLED;
 
-retry:
 	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
@@ -611,17 +610,9 @@  static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			return ret;
 	}
 
-	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
-		if (unlikely(!req->bio)) {
-			/* we can't poll this, so alloc regular req instead */
-			blk_mq_free_request(req);
-			rq_flags &= ~REQ_POLLED;
-			goto retry;
-		} else {
-			WRITE_ONCE(ioucmd->cookie, req->bio);
-			req->bio->bi_opf |= REQ_POLLED;
-		}
-	}
+	if (blk_rq_is_poll(req))
+		WRITE_ONCE(ioucmd->cookie, req);
+
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
 	pdu->meta_len = d.metadata_len;
@@ -780,18 +771,27 @@  int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 				 struct io_comp_batch *iob,
 				 unsigned int poll_flags)
 {
-	struct bio *bio;
+	struct request *req;
 	int ret = 0;
-	struct nvme_ns *ns;
-	struct request_queue *q;
 
+	/*
+	 * The rcu lock ensures the 'req' in the command cookie will not be
+	 * freed until after the unlock. The queue must be frozen to free the
+	 * request, and the freeze requires an rcu grace period. The cookie is
+	 * cleared before the request is completed, so we're fine even if a
+	 * competing polling thread completes this thread's request.
+	 */
 	rcu_read_lock();
-	bio = READ_ONCE(ioucmd->cookie);
-	ns = container_of(file_inode(ioucmd->file)->i_cdev,
-			struct nvme_ns, cdev);
-	q = ns->queue;
-	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
-		ret = bio_poll(bio, iob, poll_flags);
+	req = READ_ONCE(ioucmd->cookie);
+	if (req) {
+		struct request_queue *q = req->q;
+
+		if (percpu_ref_tryget(&q->q_usage_counter)) {
+			ret = blk_mq_poll(q, blk_rq_to_qc(req), iob,
+					  poll_flags);
+			blk_queue_exit(q);
+		}
+	}
 	rcu_read_unlock();
 	return ret;
 }
@@ -883,31 +883,6 @@  int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
-
-int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
-				      struct io_comp_batch *iob,
-				      unsigned int poll_flags)
-{
-	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
-	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
-	int srcu_idx = srcu_read_lock(&head->srcu);
-	struct nvme_ns *ns = nvme_find_path(head);
-	struct bio *bio;
-	int ret = 0;
-	struct request_queue *q;
-
-	if (ns) {
-		rcu_read_lock();
-		bio = READ_ONCE(ioucmd->cookie);
-		q = ns->queue;
-		if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio
-				&& bio->bi_bdev)
-			ret = bio_poll(bio, iob, poll_flags);
-		rcu_read_unlock();
-	}
-	srcu_read_unlock(&head->srcu, srcu_idx);
-	return ret;
-}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index fc39d01e7b63b..fcecb731c8bd9 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -470,7 +470,7 @@  static const struct file_operations nvme_ns_head_chr_fops = {
 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
-	.uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
+	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
 };
 
 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1e..ca4ea89333660 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -847,8 +847,6 @@  long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 		struct io_comp_batch *iob, unsigned int poll_flags);
-int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
-		struct io_comp_batch *iob, unsigned int poll_flags);
 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,