
[v3,6/6] nvme-rdma: implement polling queue map

Message ID: 20181213213410.9841-7-sagi@grimberg.me (mailing list archive)
State: Not Applicable
Series: restore nvme-rdma polling

Commit Message

Sagi Grimberg Dec. 13, 2018, 9:34 p.m. UTC
When nr_poll_queues is passed, set up the additional queues with the CQ
polling context IB_POLL_DIRECT (no interrupts) and make sure to set
QUEUE_FLAG_POLL on the connect_q. In addition, add a third queue mapping
for the polling queues.

The nvmf connect on these queues is polled for like any other request,
so make nvmf_connect_io_queue poll for polling queues.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/rdma.c | 58 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 6 deletions(-)
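
For illustration, here is a sketch of the resulting queue-index layout
under example values (the counts below are made up for the example and
are not taken from the patch):

/*
 * Example layout with nr_io_queues=4, nr_write_queues=2,
 * nr_poll_queues=2 (illustrative values only):
 *
 *   idx 0      admin queue
 *   idx 1..6   regular I/O queues (default + read maps, IB_POLL_SOFTIRQ)
 *   idx 7..8   polling queues (HCTX_TYPE_POLL map, IB_POLL_DIRECT)
 *
 * A queue is a polling queue when its index is greater than
 * nr_io_queues + nr_write_queues, which is what the new helper in the
 * patch below checks.
 */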

Comments

Christoph Hellwig Dec. 14, 2018, 3:46 p.m. UTC | #1
> +static bool nvme_rdma_poller_queue(struct nvme_rdma_queue *queue)

Can we please make this poll_queue?  or at least polled_queue?
poller sounds odd..

> -		set->nr_maps = 2 /* default + read */;
> +		set->nr_maps = HCTX_MAX_TYPES;
>  	}
>  
>  	ret = blk_mq_alloc_tag_set(set);
> @@ -864,6 +881,10 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
>  			ret = PTR_ERR(ctrl->ctrl.connect_q);
>  			goto out_free_tag_set;
>  		}
> +
> +		if (ctrl->ctrl.opts->nr_poll_queues)
> +			blk_queue_flag_set(QUEUE_FLAG_POLL,
> +				ctrl->ctrl.connect_q);

The block core is supposed to detect we can poll based on nr_maps > 2,
and then set QUEUE_FLAG_POLL automatically.  Although I got the details
wrong for PCI as well, I just sent a fix..

> +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct nvme_rdma_queue *queue = hctx->driver_data;
> +	struct ib_cq *cq = queue->ib_cq;
> +
> +	return ib_process_cq_direct(cq, -1);

I think we can skip the cq local variable here.

Otherwise this looks really nice and simple, thanks for looking into it!

Do you have any performance number, especially with Jens' ringbuffer
code?

Sagi Grimberg Dec. 14, 2018, 6:59 p.m. UTC | #2
>> +static bool nvme_rdma_poller_queue(struct nvme_rdma_queue *queue)
> 
> Can we please make this poll_queue?  or at least polled_queue?
> poller sounds odd..

Changed to nvme_rdma_poll_queue..

> 
>> -		set->nr_maps = 2 /* default + read */;
>> +		set->nr_maps = HCTX_MAX_TYPES;
>>   	}
>>   
>>   	ret = blk_mq_alloc_tag_set(set);
>> @@ -864,6 +881,10 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
>>   			ret = PTR_ERR(ctrl->ctrl.connect_q);
>>   			goto out_free_tag_set;
>>   		}
>> +
>> +		if (ctrl->ctrl.opts->nr_poll_queues)
>> +			blk_queue_flag_set(QUEUE_FLAG_POLL,
>> +				ctrl->ctrl.connect_q);
> 
> The block core is supposed to detect we can poll based on nr_maps > 2,
> and then set QUEUE_FLAG_POLL automatically.  Although I got the details
> wrong for PCI as well, I just sent a fix..

I'll lose that, but I didn't understand what you got wrong for PCI?
(I didn't understand the fix either)
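
For context, a paraphrased sketch of the check the block core applies
(based on blk_mq_init_allocated_queue() in mainline; the exact form in
the tree this series targets may differ), which is why the explicit
blk_queue_flag_set() in the driver can be dropped:

	/*
	 * Block core (paraphrased): enable polling on the request queue
	 * whenever the tag set carries a populated poll map, so drivers
	 * don't have to set QUEUE_FLAG_POLL themselves.
	 */
	if (set->nr_maps > HCTX_TYPE_POLL &&
	    set->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, q);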

>> +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
>> +{
>> +	struct nvme_rdma_queue *queue = hctx->driver_data;
>> +	struct ib_cq *cq = queue->ib_cq;
>> +
>> +	return ib_process_cq_direct(cq, -1);
> 
> I think we can skip the cq local variable here.

Lost..
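
For reference, a sketch of the poll handler with the local cq variable
dropped (what the next respin would presumably carry):

static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_rdma_queue *queue = hctx->driver_data;

	/* Reap completions directly on this queue's CQ, no budget limit */
	return ib_process_cq_direct(queue->ib_cq, -1);
}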

> Otherwise this looks really nice and simple, thanks for looking into it!
> 
> Do you have any performance number, especially with Jens' ringbuffer
> code?

Well, I can get 7K IOPS from my local VM on my laptop on top of
soft-roce :) (compared to 5.5K without polling, but that's not
something to conclude from)

I don't have any numbers to show right now...

As I said in the cover letter, we want a way to tell ib_process_cq_direct
not to count send completions (where we end our sqe); right now we are
probably screwing up the poll_success stat...

Patch

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b907ed43814f..80b3113b45fb 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -162,6 +162,13 @@  static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
 	return queue - queue->ctrl->queues;
 }
 
+static bool nvme_rdma_poller_queue(struct nvme_rdma_queue *queue)
+{
+	return nvme_rdma_queue_idx(queue) >
+		queue->ctrl->ctrl.opts->nr_io_queues +
+		queue->ctrl->ctrl.opts->nr_write_queues;
+}
+
 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
 {
 	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
@@ -440,6 +447,7 @@  static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 	const int send_wr_factor = 3;			/* MR, SEND, INV */
 	const int cq_factor = send_wr_factor + 1;	/* + RECV */
 	int comp_vector, idx = nvme_rdma_queue_idx(queue);
+	enum ib_poll_context poll_ctx;
 	int ret;
 
 	queue->device = nvme_rdma_find_get_device(queue->cm_id);
@@ -456,10 +464,16 @@  static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 	 */
 	comp_vector = idx == 0 ? idx : idx - 1;
 
+	/* Polling queues need direct cq polling context */
+	if (nvme_rdma_poller_queue(queue))
+		poll_ctx = IB_POLL_DIRECT;
+	else
+		poll_ctx = IB_POLL_SOFTIRQ;
+
 	/* +1 for ib_stop_cq */
 	queue->ib_cq = ib_alloc_cq(ibdev, queue,
 				cq_factor * queue->queue_size + 1,
-				comp_vector, IB_POLL_SOFTIRQ);
+				comp_vector, poll_ctx);
 	if (IS_ERR(queue->ib_cq)) {
 		ret = PTR_ERR(queue->ib_cq);
 		goto out_put_dev;
@@ -595,15 +609,17 @@  static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
 
 static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
 {
+	struct nvme_rdma_queue *queue = &ctrl->queues[idx];
+	bool poll = nvme_rdma_poller_queue(queue);
 	int ret;
 
 	if (idx)
-		ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, false);
+		ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
 	else
 		ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 
 	if (!ret)
-		set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags);
+		set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
 	else
 		dev_info(ctrl->ctrl.device,
 			"failed to connect queue: %d ret=%d\n", idx, ret);
@@ -646,6 +662,7 @@  static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
 				ibdev->num_comp_vectors);
 
 	nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
+	nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
 
 	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
 	if (ret)
@@ -716,7 +733,7 @@  static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->driver_data = ctrl;
 		set->nr_hw_queues = nctrl->queue_count - 1;
 		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = 2 /* default + read */;
+		set->nr_maps = HCTX_MAX_TYPES;
 	}
 
 	ret = blk_mq_alloc_tag_set(set);
@@ -864,6 +881,10 @@  static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 			ret = PTR_ERR(ctrl->ctrl.connect_q);
 			goto out_free_tag_set;
 		}
+
+		if (ctrl->ctrl.opts->nr_poll_queues)
+			blk_queue_flag_set(QUEUE_FLAG_POLL,
+				ctrl->ctrl.connect_q);
 	} else {
 		blk_mq_update_nr_hw_queues(&ctrl->tag_set,
 			ctrl->ctrl.queue_count - 1);
@@ -1742,6 +1763,14 @@  static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_STS_IOERR;
 }
 
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
+{
+	struct nvme_rdma_queue *queue = hctx->driver_data;
+	struct ib_cq *cq = queue->ib_cq;
+
+	return ib_process_cq_direct(cq, -1);
+}
+
 static void nvme_rdma_complete_rq(struct request *rq)
 {
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
@@ -1772,6 +1801,21 @@  static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
 			ctrl->device->dev, 0);
 	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
 			ctrl->device->dev, 0);
+
+	if (ctrl->ctrl.opts->nr_poll_queues) {
+		set->map[HCTX_TYPE_POLL].nr_queues =
+				ctrl->ctrl.opts->nr_poll_queues;
+		set->map[HCTX_TYPE_POLL].queue_offset =
+				ctrl->ctrl.opts->nr_io_queues;
+		if (ctrl->ctrl.opts->nr_write_queues)
+			set->map[HCTX_TYPE_POLL].queue_offset +=
+				ctrl->ctrl.opts->nr_write_queues;
+	} else {
+		set->map[HCTX_TYPE_POLL].nr_queues =
+				ctrl->ctrl.opts->nr_io_queues;
+		set->map[HCTX_TYPE_POLL].queue_offset = 0;
+	}
+	blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
 	return 0;
 }
 
@@ -1783,6 +1827,7 @@  static const struct blk_mq_ops nvme_rdma_mq_ops = {
 	.init_hctx	= nvme_rdma_init_hctx,
 	.timeout	= nvme_rdma_timeout,
 	.map_queues	= nvme_rdma_map_queues,
+	.poll		= nvme_rdma_poll,
 };
 
 static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
@@ -1927,7 +1972,8 @@  static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
 
-	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+				opts->nr_poll_queues + 1;
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
 	ctrl->ctrl.kato = opts->kato;
 
@@ -1979,7 +2025,7 @@  static struct nvmf_transport_ops nvme_rdma_transport = {
 	.required_opts	= NVMF_OPT_TRADDR,
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
-			  NVMF_OPT_NR_WRITE_QUEUES,
+			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
 	.create_ctrl	= nvme_rdma_create_ctrl,
 };
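
As a usage note (not part of this patch): once QUEUE_FLAG_POLL is set,
the polling queues are exercised by high-priority direct I/O. A minimal
userspace sketch, assuming a hypothetical /dev/nvme0n1 namespace exposed
over this transport:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	struct iovec iov;
	void *buf;
	ssize_t ret;
	/* Device path is an assumption for illustration only. */
	int fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	iov.iov_base = buf;
	iov.iov_len = 4096;

	/*
	 * RWF_HIPRI asks the block layer to poll for completion instead
	 * of sleeping on an interrupt; with this series that polling
	 * ends up in nvme_rdma_poll() -> ib_process_cq_direct().
	 */
	ret = preadv2(fd, &iov, 1, 0, RWF_HIPRI);
	printf("polled read returned %zd\n", ret);

	free(buf);
	close(fd);
	return ret == 4096 ? 0 : 1;
}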