Message ID | 20161005165127.GA4812@localhost.localdomain (mailing list archive) |
---|---|
State | New, archived |
Hi Keith,

On Thu, Oct 6, 2016 at 12:51 AM, Keith Busch <keith.busch@intel.com> wrote:
> On Wed, Oct 05, 2016 at 11:19:39AM +0800, Ming Lei wrote:
>> But .poll() needs to check if the specific request is completed or not,
>> then blk_poll() can set 'current' as RUNNING if it is completed.
>>
>> blk_poll():
>>         ...
>>         ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
>>         if (ret > 0) {
>>                 hctx->poll_success++;
>>                 set_current_state(TASK_RUNNING);
>>                 return true;
>>         }
>
> Right, but the task could be waiting on a whole lot more than just that
> one tag, so setting the task to running before knowing those all complete
> doesn't sound right.
>
>> I am glad to take a look at the patch if you post it out.
>
> Here's what I got for block + nvme. It relies on all the requests
> completing to set the task back to running.

Yeah, but your patch doesn't add that change, and it looks like the
'task_struct *' in the submission path needs to be stored in the request
or somewhere else.

There are some issues with the current polling approach:

- in dio, one dio may include lots of bios, but only the last submitted
  bio is polled
- one bio can be split into several bios, but submit_bio() only returns
  one cookie for polling

It looks like your approach of polling the current task's state can fix
these issues.

>
> ---
> diff --git a/block/blk-core.c b/block/blk-core.c
> index b993f88..3c1cfbf 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -3342,6 +3342,8 @@ EXPORT_SYMBOL(blk_finish_plug);
>  bool blk_poll(struct request_queue *q, blk_qc_t cookie)
>  {
>  	struct blk_plug *plug;
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int queue_num;
>  	long state;
>
>  	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
> @@ -3353,27 +3355,15 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
>  		blk_flush_plug_list(plug, false);
>
>  	state = current->state;
> +	queue_num = blk_qc_t_to_queue_num(cookie);
> +	hctx = q->queue_hw_ctx[queue_num];
>  	while (!need_resched()) {
> -		unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
> -		struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
> -		int ret;
> -
> -		hctx->poll_invoked++;
> -
> -		ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
> -		if (ret > 0) {
> -			hctx->poll_success++;
> -			set_current_state(TASK_RUNNING);
> -			return true;
> -		}
> +		q->mq_ops->poll(hctx);
>
>  		if (signal_pending_state(state, current))
>  			set_current_state(TASK_RUNNING);
> -
>  		if (current->state == TASK_RUNNING)
>  			return true;
> -		if (ret < 0)
> -			break;
>  		cpu_relax();
>  	}

Then it looks like the whole hw queue is polled, and only the queue
number in the cookie matters.
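To make the first dio issue above concrete, here is a minimal sketch (hypothetical code, not from fs/direct-io.c; the struct and function names are invented for illustration) of why only the last submitted bio's cookie survives for polling:

/*
 * Hedged sketch: a dio spanning several bios keeps only ONE blk_qc_t.
 * Each submit_bio() overwrites the previously saved cookie, so a later
 * blk_poll(q, dio->bio_cookie) can only spin on the last bio's hw
 * queue/tag; earlier bios must complete via their interrupt handlers.
 */
struct dio_sketch {
	blk_qc_t bio_cookie;		/* single slot for the poll cookie */
};

static void dio_submit_all_sketch(struct dio_sketch *dio,
				  struct bio *bios[], int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		dio->bio_cookie = submit_bio(bios[i]);	/* clobbers prior cookie */
}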
In theory, there might be one race:

- one dio needs to submit several bios (suppose there are two bios: A and B)
- A is submitted to hardware queue M
- B is submitted to hardware queue N because the current task is migrated
  to another CPU
- then only hardware queue N is polled

>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index befac5b..2e359e0 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -649,7 +651,7 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
>  	return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
>  }
>
> -static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
> +static void nvme_process_cq(struct nvme_queue *nvmeq)
>  {
>  	u16 head, phase;
>
> @@ -665,9 +667,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
>  			phase = !phase;
>  		}
>
> -		if (tag && *tag == cqe.command_id)
> -			*tag = -1;
> -
>  		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
>  			dev_warn(nvmeq->dev->ctrl.device,
>  				"invalid id %d completed on queue %d\n",
> @@ -711,11 +710,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
>  	nvmeq->cqe_seen = 1;
>  }
>
> -static void nvme_process_cq(struct nvme_queue *nvmeq)
> -{
> -	__nvme_process_cq(nvmeq, NULL);
> -}
> -
>  static irqreturn_t nvme_irq(int irq, void *data)
>  {
>  	irqreturn_t result;
> @@ -736,20 +730,15 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
>  	return IRQ_NONE;
>  }
>
> -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
> +static void nvme_poll(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct nvme_queue *nvmeq = hctx->driver_data;
>
>  	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
>  		spin_lock_irq(&nvmeq->q_lock);
> -		__nvme_process_cq(nvmeq, &tag);
> +		nvme_process_cq(nvmeq);
>  		spin_unlock_irq(&nvmeq->q_lock);
> -
> -		if (tag == -1)
> -			return 1;
>  	}
> -
> -	return 0;
>  }
>
>  static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 2498fdf..a723af9 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -100,7 +100,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
>  typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
>  		bool);
>  typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
> -typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
> +typedef void (poll_fn)(struct blk_mq_hw_ctx *);
>
>
>  struct blk_mq_ops {
> --
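The race Ming describes is possible because a blk_qc_t cookie can name exactly one hardware queue. For reference, the cookie packs the hw queue number and tag into a single word, roughly as in include/linux/blk_types.h of that era (shown here for illustration; treat the exact form as approximate):

/* high bits: hw queue number; low BLK_QC_T_SHIFT bits: tag */
#define BLK_QC_T_SHIFT	16

static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num)
{
	return tag | (queue_num << BLK_QC_T_SHIFT);
}

static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
{
	return cookie >> BLK_QC_T_SHIFT;
}

static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
{
	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
}

Once bio B's cookie overwrites bio A's, queue M's number is gone; nothing in the cookie lets blk_poll() reach both queues.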
On Fri, Oct 07, 2016 at 12:06:27AM +0800, Ming Lei wrote:
> Hi Keith,
>
> On Thu, Oct 6, 2016 at 12:51 AM, Keith Busch <keith.busch@intel.com> wrote:
> > On Wed, Oct 05, 2016 at 11:19:39AM +0800, Ming Lei wrote:
> >> But .poll() needs to check if the specific request is completed or not,
> >> then blk_poll() can set 'current' as RUNNING if it is completed.
> >>
> >> blk_poll():
> >>         ...
> >>         ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
> >>         if (ret > 0) {
> >>                 hctx->poll_success++;
> >>                 set_current_state(TASK_RUNNING);
> >>                 return true;
> >>         }
> >
> > Right, but the task could be waiting on a whole lot more than just that
> > one tag, so setting the task to running before knowing those all complete
> > doesn't sound right.
> >
> >> I am glad to take a look at the patch if you post it out.
> >
> > Here's what I got for block + nvme. It relies on all the requests
> > completing to set the task back to running.
>
> Yeah, but your patch doesn't add that change, and it looks like the
> 'task_struct *' in the submission path needs to be stored in the request
> or somewhere else.

The polling function shouldn't have to set the task to running. The
task_struct is the dio's "waiter", and dio_bio_end_io sets its state back
to normal running once every submitted bio, including split chained bios,
has completed. Hopefully those all complete through ->poll(), and
blk_poll() will automatically observe the state change.

> Then it looks like the whole hw queue is polled, and only the queue
> number in the cookie matters.
>
> In theory, there might be one race:
>
> - one dio needs to submit several bios (suppose there are two bios: A and B)
> - A is submitted to hardware queue M
> - B is submitted to hardware queue N because the current task is migrated
>   to another CPU
> - then only hardware queue N is polled

Yeah, in that case we'd rely on queue M's interrupt handler to do the
completion. Avoiding the context switch was the biggest win for polling,
as I understand it. If the task is being migrated to other CPUs, I think
we've already lost the latency benefit we'd have got.
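The wakeup path Keith refers to lives in fs/direct-io.c. A sketch close to that era's code (abridged; treat the field details as approximate rather than authoritative):

/*
 * Roughly fs/direct-io.c around v4.8, abridged.  wake_up_process() moves
 * the dio's waiter to TASK_RUNNING, which is exactly the state change the
 * reworked blk_poll() loop observes via 'current->state == TASK_RUNNING'.
 */
static void dio_bio_end_io(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_lock, flags);
	bio->bi_private = dio->bio_list;	/* chain bio for later reaping */
	dio->bio_list = bio;
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);	/* last bio: wake the poller */
	spin_unlock_irqrestore(&dio->bio_lock, flags);
}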
diff --git a/block/blk-core.c b/block/blk-core.c
index b993f88..3c1cfbf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3342,6 +3342,8 @@ EXPORT_SYMBOL(blk_finish_plug);
 bool blk_poll(struct request_queue *q, blk_qc_t cookie)
 {
 	struct blk_plug *plug;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int queue_num;
 	long state;

 	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
@@ -3353,27 +3355,15 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
 		blk_flush_plug_list(plug, false);

 	state = current->state;
+	queue_num = blk_qc_t_to_queue_num(cookie);
+	hctx = q->queue_hw_ctx[queue_num];
 	while (!need_resched()) {
-		unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
-		struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
-		int ret;
-
-		hctx->poll_invoked++;
-
-		ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
-		if (ret > 0) {
-			hctx->poll_success++;
-			set_current_state(TASK_RUNNING);
-			return true;
-		}
+		q->mq_ops->poll(hctx);

 		if (signal_pending_state(state, current))
 			set_current_state(TASK_RUNNING);
-
 		if (current->state == TASK_RUNNING)
 			return true;
-		if (ret < 0)
-			break;
 		cpu_relax();
 	}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index befac5b..2e359e0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -649,7 +651,7 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
 	return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
 }

-static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
+static void nvme_process_cq(struct nvme_queue *nvmeq)
 {
 	u16 head, phase;

@@ -665,9 +667,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 			phase = !phase;
 		}

-		if (tag && *tag == cqe.command_id)
-			*tag = -1;
-
 		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
 			dev_warn(nvmeq->dev->ctrl.device,
 				"invalid id %d completed on queue %d\n",
@@ -711,11 +710,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 	nvmeq->cqe_seen = 1;
 }

-static void nvme_process_cq(struct nvme_queue *nvmeq)
-{
-	__nvme_process_cq(nvmeq, NULL);
-}
-
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	irqreturn_t result;
@@ -736,20 +730,15 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_NONE;
 }

-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static void nvme_poll(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;

 	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
 		spin_lock_irq(&nvmeq->q_lock);
-		__nvme_process_cq(nvmeq, &tag);
+		nvme_process_cq(nvmeq);
 		spin_unlock_irq(&nvmeq->q_lock);
-
-		if (tag == -1)
-			return 1;
 	}
-
-	return 0;
 }

 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2498fdf..a723af9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -100,7 +100,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
 typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
-typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef void (poll_fn)(struct blk_mq_hw_ctx *);


 struct blk_mq_ops {
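Under the reworked contract, a driver's ->poll() no longer reports whether a particular tag completed; it simply drains whatever completions are ready, and request completion (blk_mq_complete_request() -> bio end_io -> wake_up_process()) is what eventually stops the blk_poll() spin. A hypothetical non-NVMe implementation, with all foo_* names invented for illustration:

static void foo_poll(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *fq = hctx->driver_data;

	/* cheap lockless peek, mirroring nvme_cqe_valid() above */
	if (!foo_completion_pending(fq))
		return;

	spin_lock_irq(&fq->lock);
	foo_reap_completions(fq);	/* completes every ready request */
	spin_unlock_irq(&fq->lock);
}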