diff mbox

[RFC,1/2] nvme-rdma: Support 8K inline

Message ID 1609e345042f2801f92e8831e8c83b688fad07eb.1525880285.git.swise@opengridcomputing.com (mailing list archive)
State RFC
Headers show

Commit Message

Steve Wise May 9, 2018, 2:31 p.m. UTC
Allow up to 2 pages of inline for NVMF WRITE operations.  This reduces
latency for 8K WRITEs by removing the need to issue a READ WR for IB,
or a REG_MR+READ WR chain for iWarp.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/nvme/host/rdma.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

Comments

Parav Pandit May 9, 2018, 4:55 p.m. UTC | #1
Hi Steve,

> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Steve Wise
> Sent: Wednesday, May 09, 2018 9:31 AM
> To: axboe@fb.com; hch@lst.de; keith.busch@intel.com; sagi@grimberg.me;
> linux-nvme@lists.infradead.org
> Cc: linux-rdma@vger.kernel.org
> Subject: [PATCH RFC 1/2] nvme-rdma: Support 8K inline
> 
> Allow up to 2 pages of inline for NVMF WRITE operations.  This reduces latency
> for 8K WRITEs by removing the need to issue a READ WR for IB, or a
> REG_MR+READ WR chain for iWarp.
> 
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> ---
>  drivers/nvme/host/rdma.c | 21 +++++++++++++++------
>  1 file changed, 15 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index
> 1eb4438..9b8af98 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -40,7 +40,7 @@
> 
>  #define NVME_RDMA_MAX_SEGMENTS		256
> 
> -#define NVME_RDMA_MAX_INLINE_SEGMENTS	1
> +#define NVME_RDMA_MAX_INLINE_SEGMENTS	2
> 
I wrote this patch back in Feb 2017 but didn't spend time on V2 to address comments from Sagi, Christoph.
http://lists.infradead.org/pipermail/linux-nvme/2017-February/008057.html
Thanks for taking this forward.
This is helpful for certain db workloads which have a typical data size of 8K too.

>  struct nvme_rdma_device {
>  	struct ib_device	*dev;
> @@ -1086,19 +1086,28 @@ static int nvme_rdma_set_sg_null(struct
> nvme_command *c)  }
> 
>  static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
> -		struct nvme_rdma_request *req, struct nvme_command *c)
> +		struct nvme_rdma_request *req, int count,
> +		struct nvme_command *c)
>  {
>  	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
> +	u32 len;
> 
>  	req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
>  	req->sge[1].length = sg_dma_len(req->sg_table.sgl);
>  	req->sge[1].lkey = queue->device->pd->local_dma_lkey;
> +	len = req->sge[1].length;
> +	if (count == 2) {
> +		req->sge[2].addr = sg_dma_address(req->sg_table.sgl+1);
> +		req->sge[2].length = sg_dma_len(req->sg_table.sgl+1);
> +		req->sge[2].lkey = queue->device->pd->local_dma_lkey;
> +		len += req->sge[2].length;
> +	}
> 
>  	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
> -	sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
> +	sg->length = cpu_to_le32(len);
>  	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) |
> NVME_SGL_FMT_OFFSET;
> 
> -	req->num_sge++;
> +	req->num_sge += count;
>  	return 0;
>  }
> 
> @@ -1191,13 +1200,13 @@ static int nvme_rdma_map_data(struct
> nvme_rdma_queue *queue,
>  		return -EIO;
>  	}
> 
> -	if (count == 1) {
> +	if (count <= 2) {
>  		if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue)
> &&
>  		    blk_rq_payload_bytes(rq) <=
>  				nvme_rdma_inline_data_size(queue))
>  			return nvme_rdma_map_sg_inline(queue, req, c);
> 
> -		if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
> +		if (count == 1 && dev->pd->flags &
> IB_PD_UNSAFE_GLOBAL_RKEY)
>  			return nvme_rdma_map_sg_single(queue, req, c);
>  	}
> 
> --
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise May 9, 2018, 7:28 p.m. UTC | #2
On 5/9/2018 11:55 AM, Parav Pandit wrote:
> Hi Steve,
>
>> -----Original Message-----
>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>> owner@vger.kernel.org] On Behalf Of Steve Wise
>> Sent: Wednesday, May 09, 2018 9:31 AM
>> To: axboe@fb.com; hch@lst.de; keith.busch@intel.com; sagi@grimberg.me;
>> linux-nvme@lists.infradead.org
>> Cc: linux-rdma@vger.kernel.org
>> Subject: [PATCH RFC 1/2] nvme-rdma: Support 8K inline
>>
>> Allow up to 2 pages of inline for NVMF WRITE operations.  This reduces latency
>> for 8K WRITEs by removing the need to issue a READ WR for IB, or a
>> REG_MR+READ WR chain for iWarp.
>>
>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>> ---
>>  drivers/nvme/host/rdma.c | 21 +++++++++++++++------
>>  1 file changed, 15 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index
>> 1eb4438..9b8af98 100644
>> --- a/drivers/nvme/host/rdma.c
>> +++ b/drivers/nvme/host/rdma.c
>> @@ -40,7 +40,7 @@
>>
>>  #define NVME_RDMA_MAX_SEGMENTS		256
>>
>> -#define NVME_RDMA_MAX_INLINE_SEGMENTS	1
>> +#define NVME_RDMA_MAX_INLINE_SEGMENTS	2
>>
> I wrote this patch back in Feb 2017 but didn't spend time on V2 to address comments from Sagi, Christoph.
> http://lists.infradead.org/pipermail/linux-nvme/2017-February/008057.html
> Thanks for taking this forward.
> This is helpful for certain db workloads which have a typical data size of 8K too.

Hey Parav,

I thought I'd remembered something similar previously. :)  Let me see if
I can address the previous comments, going forward.  They are:

- why just 1 or 2 pages vs more?
- dynamic allocation of nvme-rdma resources based on the target's
advertised ioccsz.

And I see you posted the nvmet-rdma patch here, which allows up to 16KB
inline:

http://lists.infradead.org/pipermail/linux-nvme/2017-February/008064.html

And I think the comments needing resolution are:

- make it a configfs option
- adjust the qp depth some if the inline depth is bigger to try and keep
the overall memory footprint reasonable
- avoid high-order allocations - maybe per-core SRQ could be helpful

So the question is, do we have agreement on the way forward?  Sagi and
Christoph, I appreciate any feedback on this.  I'd like to get this
feature merged.  

Thanks,

Steve.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig May 11, 2018, 6:48 a.m. UTC | #3
> -#define NVME_RDMA_MAX_INLINE_SEGMENTS	1
> +#define NVME_RDMA_MAX_INLINE_SEGMENTS	2

Given how little space we use for just the sge array maybe we
want to bump this to 4 once we need to deal with multiple entries?

>  static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
> -		struct nvme_rdma_request *req, struct nvme_command *c)
> +		struct nvme_rdma_request *req, int count,
> +		struct nvme_command *c)

Just gut feeling, but I'd pass the count argument last.

>  	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
> +	u32 len;
>  
>  	req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
>  	req->sge[1].length = sg_dma_len(req->sg_table.sgl);
>  	req->sge[1].lkey = queue->device->pd->local_dma_lkey;
> +	len = req->sge[1].length;
> +	if (count == 2) {
> +		req->sge[2].addr = sg_dma_address(req->sg_table.sgl+1);
> +		req->sge[2].length = sg_dma_len(req->sg_table.sgl+1);
> +		req->sge[2].lkey = queue->device->pd->local_dma_lkey;
> +		len += req->sge[2].length;
> +	}

I think this should be turned into a for loop, e.g.

	u32 len, i;

	for (i = 0; i < count; i++) {
		req->sge[1 + i].addr = sg_dma_address(&req->sg_table.sgl[i]);
		req->sge[1 + i].length = sg_dma_len(&req->sg_table.sgl[i]);
		req->sge[1 + i].lkey = queue->device->pd->local_dma_lkey;
		req->num_sge++;
		len += req->sge[i + 1].length;
	}
		
> -	if (count == 1) {
> +	if (count <= 2) {

This should be NVME_RDMA_MAX_INLINE_SEGMENTS.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise May 14, 2018, 6:33 p.m. UTC | #4
On 5/11/2018 1:48 AM, Christoph Hellwig wrote:
>> -#define NVME_RDMA_MAX_INLINE_SEGMENTS	1
>> +#define NVME_RDMA_MAX_INLINE_SEGMENTS	2
> Given how little space we use for just the sge array maybe we
> want to bump this to 4 once we need to deal with multiple entries?

Agreed.

>>  static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
>> -		struct nvme_rdma_request *req, struct nvme_command *c)
>> +		struct nvme_rdma_request *req, int count,
>> +		struct nvme_command *c)
> Just gut feeling, but I'd pass the count argument last.

Yea ok.

>>  	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
>> +	u32 len;
>>  
>>  	req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
>>  	req->sge[1].length = sg_dma_len(req->sg_table.sgl);
>>  	req->sge[1].lkey = queue->device->pd->local_dma_lkey;
>> +	len = req->sge[1].length;
>> +	if (count == 2) {
>> +		req->sge[2].addr = sg_dma_address(req->sg_table.sgl+1);
>> +		req->sge[2].length = sg_dma_len(req->sg_table.sgl+1);
>> +		req->sge[2].lkey = queue->device->pd->local_dma_lkey;
>> +		len += req->sge[2].length;
>> +	}
> I think this should be turned into a for loop, e.g.

Yes, and I think in the previous incarnation of this patch series, Sagi
suggested an iterator pointer vs sge[blah].

> 	u32 len, i;
>
> 	for (i = 0; i < count; i++) {
> 		req->sge[1 + i].addr = sg_dma_address(&req->sg_table.sgl[i]);
> 		req->sge[1 + i].length = sg_dma_len(&req->sg_table.sgl[i]);
> 		req->sge[1 + i].lkey = queue->device->pd->local_dma_lkey;
> 		req->num_sge++;
> 		len += req->sge[i + 1].length;
> 	}
> 		
>> -	if (count == 1) {
>> +	if (count <= 2) {
> This should be NVME_RDMA_MAX_INLINE_SEGMENTS.

Yup.

Thanks for reviewing!

Steve.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 1eb4438..9b8af98 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -40,7 +40,7 @@ 
 
 #define NVME_RDMA_MAX_SEGMENTS		256
 
-#define NVME_RDMA_MAX_INLINE_SEGMENTS	1
+#define NVME_RDMA_MAX_INLINE_SEGMENTS	2
 
 struct nvme_rdma_device {
 	struct ib_device	*dev;
@@ -1086,19 +1086,28 @@  static int nvme_rdma_set_sg_null(struct nvme_command *c)
 }
 
 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
-		struct nvme_rdma_request *req, struct nvme_command *c)
+		struct nvme_rdma_request *req, int count,
+		struct nvme_command *c)
 {
 	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+	u32 len;
 
 	req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
 	req->sge[1].length = sg_dma_len(req->sg_table.sgl);
 	req->sge[1].lkey = queue->device->pd->local_dma_lkey;
+	len = req->sge[1].length;
+	if (count == 2) {
+		req->sge[2].addr = sg_dma_address(req->sg_table.sgl+1);
+		req->sge[2].length = sg_dma_len(req->sg_table.sgl+1);
+		req->sge[2].lkey = queue->device->pd->local_dma_lkey;
+		len += req->sge[2].length;
+	}
 
 	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
-	sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
+	sg->length = cpu_to_le32(len);
 	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
 
-	req->num_sge++;
+	req->num_sge += count;
 	return 0;
 }
 
@@ -1191,13 +1200,13 @@  static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 		return -EIO;
 	}
 
-	if (count == 1) {
+	if (count <= 2) {
 		if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
 		    blk_rq_payload_bytes(rq) <=
 				nvme_rdma_inline_data_size(queue))
 			return nvme_rdma_map_sg_inline(queue, req, c);
 
-		if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
+		if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
 			return nvme_rdma_map_sg_single(queue, req, c);
 	}