
blk-mq: Return invalid cookie if bio was split

Message ID 20161005165127.GA4812@localhost.localdomain (mailing list archive)
State New, archived

Commit Message

Keith Busch Oct. 5, 2016, 4:51 p.m. UTC
On Wed, Oct 05, 2016 at 11:19:39AM +0800, Ming Lei wrote:
> But .poll() need to check if the specific request is completed or not,
> then blk_poll() can set 'current' as RUNNING if it is completed.
> 
> blk_poll():
>                 ...
>                 ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
>                 if (ret > 0) {
>                         hctx->poll_success++;
>                         set_current_state(TASK_RUNNING);
>                         return true;
>                 }


Right, but the task could be waiting on a whole lot more than just that
one tag, so setting the task to running before knowing those all complete
doesn't sound right.
 
> I am glad to take a look the patch if you post it out.

Here's what I got for block + nvme. It relies on all the requests to
complete to set the task back to running.

---

Comments

Ming Lei Oct. 6, 2016, 4:06 p.m. UTC | #1
Hi Keith,

On Thu, Oct 6, 2016 at 12:51 AM, Keith Busch <keith.busch@intel.com> wrote:
> On Wed, Oct 05, 2016 at 11:19:39AM +0800, Ming Lei wrote:
>> But .poll() need to check if the specific request is completed or not,
>> then blk_poll() can set 'current' as RUNNING if it is completed.
>>
>> blk_poll():
>>                 ...
>>                 ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
>>                 if (ret > 0) {
>>                         hctx->poll_success++;
>>                         set_current_state(TASK_RUNNING);
>>                         return true;
>>                 }
>
>
> Right, but the task could be waiting on a whole lot more than just that
> one tag, so setting the task to running before knowing those all complete
> doesn't sound right.
>
>> I am glad to take a look the patch if you post it out.
>
> Here's what I got for block + nvme. It relies on all the requests to
> complete to set the task back to running.

Yeah, but your patch doesn't add that change, and it looks like the
'task_struct *' in the submission path needs to be stored in the request
or somewhere else.

There are some issues with the current polling approach:
- in dio, one dio may include lots of bios, but only the last submitted bio
is polled
- one bio can be split into several bios, but submit_bio() only returns
one cookie for polling

It looks like your approach of checking the current task's state can fix this issue.
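
For context, a blk_qc_t cookie can only name a single (hw queue, tag) pair,
which is why a dio spanning several bios, or a split bio, cannot be fully
described by the one cookie submit_bio() returns. The helpers in
include/linux/blk_types.h around this kernel version look roughly like the
sketch below (paraphrased from memory, so treat exact names and values as
approximate):

typedef unsigned int blk_qc_t;

#define BLK_QC_T_NONE	-1U	/* "no cookie": nothing pollable was issued */
#define BLK_QC_T_SHIFT	16	/* high bits: hw queue number, low bits: tag */

static inline bool blk_qc_t_valid(blk_qc_t cookie)
{
	return cookie != BLK_QC_T_NONE;
}

/* pack exactly one (tag, queue) pair into the cookie */
static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num)
{
	return tag | (queue_num << BLK_QC_T_SHIFT);
}

static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
{
	return cookie >> BLK_QC_T_SHIFT;
}

static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
{
	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
}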

>
> ---
> diff --git a/block/blk-core.c b/block/blk-core.c
> index b993f88..3c1cfbf 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -3342,6 +3342,8 @@ EXPORT_SYMBOL(blk_finish_plug);
>  bool blk_poll(struct request_queue *q, blk_qc_t cookie)
>  {
>         struct blk_plug *plug;
> +       struct blk_mq_hw_ctx *hctx;
> +       unsigned int queue_num;
>         long state;
>
>         if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
> @@ -3353,27 +3355,15 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
>                 blk_flush_plug_list(plug, false);
>
>         state = current->state;
> +       queue_num = blk_qc_t_to_queue_num(cookie);
> +       hctx = q->queue_hw_ctx[queue_num];
>         while (!need_resched()) {
> -               unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
> -               struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
> -               int ret;
> -
> -               hctx->poll_invoked++;
> -
> -               ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
> -               if (ret > 0) {
> -                       hctx->poll_success++;
> -                       set_current_state(TASK_RUNNING);
> -                       return true;
> -               }
> +               q->mq_ops->poll(hctx);
>
>                 if (signal_pending_state(state, current))
>                         set_current_state(TASK_RUNNING);
> -
>                 if (current->state == TASK_RUNNING)
>                         return true;
> -               if (ret < 0)
> -                       break;
>                 cpu_relax();
>         }

Then it looks like the whole hw queue is polled and only the queue num
in the cookie matters.

In theory, there might be one race (sketched in rough code below):

- one dio needs to submit several bios (suppose there are two bios: A and B)
- A is submitted to hardware queue M
- B is submitted to hardware queue N because the current task is migrated
to another CPU
- then only hardware queue N is polled
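
To make the race concrete, the dio submission path keeps only the most
recent cookie, roughly along the lines of the sketch below (hand-written,
not the literal fs/direct-io.c code; the helpers more_data_to_submit(),
next_dio_bio() and dio_has_inflight_bios() are illustrative):

	blk_qc_t cookie = BLK_QC_T_NONE;

	while (more_data_to_submit(dio)) {
		struct bio *bio = next_dio_bio(dio);

		/*
		 * Each submit_bio() overwrites the previous cookie, so only
		 * the (queue, tag) of the last bio survives.  If bio A went
		 * to hw queue M and bio B to hw queue N, the cookie only
		 * names N.
		 */
		cookie = submit_bio(bio);
	}

	/* wait side: only the queue named by the last cookie gets polled */
	while (dio_has_inflight_bios(dio)) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!blk_poll(bdev_get_queue(dio->bio_bdev), cookie))
			io_schedule();	/* fall back to the irq completion path */
	}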

>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index befac5b..2e359e0 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -649,7 +651,7 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
>         return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
>  }
>
> -static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
> +static void nvme_process_cq(struct nvme_queue *nvmeq)
>  {
>         u16 head, phase;
>
> @@ -665,9 +667,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
>                         phase = !phase;
>                 }
>
> -               if (tag && *tag == cqe.command_id)
> -                       *tag = -1;
> -
>                 if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
>                         dev_warn(nvmeq->dev->ctrl.device,
>                                 "invalid id %d completed on queue %d\n",
> @@ -711,11 +710,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
>         nvmeq->cqe_seen = 1;
>  }
>
> -static void nvme_process_cq(struct nvme_queue *nvmeq)
> -{
> -       __nvme_process_cq(nvmeq, NULL);
> -}
> -
>  static irqreturn_t nvme_irq(int irq, void *data)
>  {
>         irqreturn_t result;
> @@ -736,20 +730,15 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
>         return IRQ_NONE;
>  }
>
> -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
> +static void nvme_poll(struct blk_mq_hw_ctx *hctx)
>  {
>         struct nvme_queue *nvmeq = hctx->driver_data;
>
>         if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
>                 spin_lock_irq(&nvmeq->q_lock);
> -               __nvme_process_cq(nvmeq, &tag);
> +               nvme_process_cq(nvmeq);
>                 spin_unlock_irq(&nvmeq->q_lock);
> -
> -               if (tag == -1)
> -                       return 1;
>         }
> -
> -       return 0;
>  }
>
>  static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 2498fdf..a723af9 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -100,7 +100,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
>  typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
>                 bool);
>  typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
> -typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
> +typedef void (poll_fn)(struct blk_mq_hw_ctx *);
>
>
>  struct blk_mq_ops {
> --
Keith Busch Oct. 6, 2016, 4:39 p.m. UTC | #2
On Fri, Oct 07, 2016 at 12:06:27AM +0800, Ming Lei wrote:
> Hi Keith,
> 
> On Thu, Oct 6, 2016 at 12:51 AM, Keith Busch <keith.busch@intel.com> wrote:
> > On Wed, Oct 05, 2016 at 11:19:39AM +0800, Ming Lei wrote:
> >> But .poll() need to check if the specific request is completed or not,
> >> then blk_poll() can set 'current' as RUNNING if it is completed.
> >>
> >> blk_poll():
> >>                 ...
> >>                 ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
> >>                 if (ret > 0) {
> >>                         hctx->poll_success++;
> >>                         set_current_state(TASK_RUNNING);
> >>                         return true;
> >>                 }
> >
> >
> > Right, but the task could be waiting on a whole lot more than just that
> > one tag, so setting the task to running before knowing those all complete
> > doesn't sound right.
> >
> >> I am glad to take a look the patch if you post it out.
> >
> > Here's what I got for block + nvme. It relies on all the requests to
> > complete to set the task back to running.
> 
> Yeah, but your patch doesn't add that change, and looks 'task_struct *'
> in submission path need to be stored in request or somewhere else.

The polling function shouldn't have to set the task to running. The
task_struct is the dio's "waiter", and dio_bio_end_io sets it back to
normal running once every submitted bio and any split chained bios have
completed. Hopefully those all complete through ->poll(), and blk_poll
will automatically observe the state change.
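
For reference, the completion side that flips the task back looks roughly
like the dio_bio_end_io() pattern below (reconstructed from memory of
fs/direct-io.c, so details may differ slightly); wake_up_process() is what
sets the waiter back to TASK_RUNNING, which the blk_poll() loop then
observes:

static void dio_bio_end_io(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_lock, flags);
	/* queue the completed bio for the submitter to reap */
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	/* only the final completion wakes the waiter */
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);
}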

> Then looks the whole hw queue is polled and only the queue num
> in cookie matters.
>
> In theory, there might be one race:
> 
> - one dio need to submit several bios(suppose there are two bios: A and B)
> - A is submitted to hardware queue M
> - B is submitted to hardware queue N because the current task is migrated
> to another CPU
> - then only hardware queue N is polled

Yeah, in that case we'd rely on queue M's interrupt handler to do
the completion.

Avoiding the context switch was the biggest win for polling, as I
understand it. If the task is being migrated to other CPUs, I think
we've already lost the latency benefit we'd have got.

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index b993f88..3c1cfbf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3342,6 +3342,8 @@  EXPORT_SYMBOL(blk_finish_plug);
 bool blk_poll(struct request_queue *q, blk_qc_t cookie)
 {
 	struct blk_plug *plug;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int queue_num;
 	long state;
 
 	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
@@ -3353,27 +3355,15 @@  bool blk_poll(struct request_queue *q, blk_qc_t cookie)
 		blk_flush_plug_list(plug, false);
 
 	state = current->state;
+	queue_num = blk_qc_t_to_queue_num(cookie);
+	hctx = q->queue_hw_ctx[queue_num];
 	while (!need_resched()) {
-		unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
-		struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
-		int ret;
-
-		hctx->poll_invoked++;
-
-		ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
-		if (ret > 0) {
-			hctx->poll_success++;
-			set_current_state(TASK_RUNNING);
-			return true;
-		}
+		q->mq_ops->poll(hctx);
 
 		if (signal_pending_state(state, current))
 			set_current_state(TASK_RUNNING);
-
 		if (current->state == TASK_RUNNING)
 			return true;
-		if (ret < 0)
-			break;
 		cpu_relax();
 	}
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index befac5b..2e359e0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -649,7 +651,7 @@  static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
 	return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
 }
 
-static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
+static void nvme_process_cq(struct nvme_queue *nvmeq)
 {
 	u16 head, phase;
 
@@ -665,9 +667,6 @@  static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 			phase = !phase;
 		}
 
-		if (tag && *tag == cqe.command_id)
-			*tag = -1;
-
 		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
 			dev_warn(nvmeq->dev->ctrl.device,
 				"invalid id %d completed on queue %d\n",
@@ -711,11 +710,6 @@  static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 	nvmeq->cqe_seen = 1;
 }
 
-static void nvme_process_cq(struct nvme_queue *nvmeq)
-{
-	__nvme_process_cq(nvmeq, NULL);
-}
-
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	irqreturn_t result;
@@ -736,20 +730,15 @@  static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_NONE;
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static void nvme_poll(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
 
 	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
 		spin_lock_irq(&nvmeq->q_lock);
-		__nvme_process_cq(nvmeq, &tag);
+		nvme_process_cq(nvmeq);
 		spin_unlock_irq(&nvmeq->q_lock);
-
-		if (tag == -1)
-			return 1;
 	}
-
-	return 0;
 }
 
 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2498fdf..a723af9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -100,7 +100,7 @@  typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
 typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
-typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef void (poll_fn)(struct blk_mq_hw_ctx *);
 
 
 struct blk_mq_ops {