[v4,8/8] libnvdimm: add DMA support for pmem blk-mq

Message ID: 150212399902.23722.5878783372576979990.stgit@djiang5-desk3.ch.intel.com

Commit Message

Dave Jiang Aug. 7, 2017, 4:39 p.m. UTC
Add DMA support for pmem blk reads. This provides a significant
reduction in CPU utilization for large reads while maintaining good
performance. DMA is triggered by a test against bio_multiple_segments(),
so small I/Os (4k or less) are still performed by the CPU in order to
reduce latency. By default the pmem driver now uses blk-mq with DMA.
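
Concretely, the dispatch in pmem_queue_rq() (abridged from the patch
below) is:

	cmd->chan = dma_find_channel(DMA_MEMCPY_SG);
	if (cmd->chan && bio_multiple_segments(req->bio))
		rc = pmem_handle_cmd_dma(cmd, op_is_write(req_op(req)));
	else
		rc = pmem_handle_cmd(cmd);	/* CPU memcpy path */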

The numbers below were measured against pmem simulated via DRAM using
memmap=NN!SS.  The DMA engine used is ioatdma on an Intel Skylake Xeon
platform.  Keep in mind that performance on actual persistent memory
will differ.
Fio 2.21 was used.

64k: 1 task queuedepth=1
CPU Read:  7631 MB/s  99.7% CPU    DMA Read: 2415 MB/s  54% CPU
CPU Write: 3552 MB/s  100% CPU     DMA Write: 2173 MB/s  54% CPU

64k: 16 tasks queuedepth=16
CPU Read: 36800 MB/s  1593% CPU    DMA Read:  29100 MB/s  607% CPU
CPU Write: 20900 MB/s 1589% CPU    DMA Write: 23400 MB/s  585% CPU

2M: 1 task queuedepth=1
CPU Read:  6013 MB/s  99.3% CPU    DMA Read:  7986 MB/s  59.3% CPU
CPU Write: 3579 MB/s  100% CPU     DMA Write: 5211 MB/s  58.3% CPU

2M: 16 tasks queuedepth=16
CPU Read:  18100 MB/s 1588% CPU    DMA Read:  21300 MB/s 180.9% CPU
CPU Write: 14100 MB/s 1594% CPU    DMA Write: 20400 MB/s 446.9% CPU

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 drivers/nvdimm/pmem.c |  214 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 204 insertions(+), 10 deletions(-)

Comments

Christoph Hellwig Aug. 11, 2017, 11:04 a.m. UTC | #1
On Mon, Aug 07, 2017 at 09:39:59AM -0700, Dave Jiang wrote:
> +static int queue_mode = PMEM_Q_MQ;

So this changes the default for everyone.  How about those systems
without a DMA engine?

>  module_param(queue_mode, int, 0444);
> -MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ)");
> +MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ & DMA)");
> +
> +static int queue_depth = 128;
> +module_param(queue_depth, int, 0444);
> +MODULE_PARM_DESC(queue_depth, "I/O Queue Depth for multi queue mode");

This changes the default from the previous patch; why not introduce
this parameter and its default there?  And an explanation of the magic
value would still be nice.

> +/* typically maps to number of DMA channels/devices per socket */
> +static int q_per_node = 8;
> +module_param(q_per_node, int, 0444);
> +MODULE_PARM_DESC(q_per_node, "Hardware queues per node\n");

How can a fixed constant "typically map" to a hardware-dependent
resource?

> +	if (res) {

unlikely?

> +		enum dmaengine_tx_result dma_err = res->result;
> +
> +		switch (dma_err) {

do you really need the local variable for a single check of it?

> +		case DMA_TRANS_READ_FAILED:
> +		case DMA_TRANS_WRITE_FAILED:
> +		case DMA_TRANS_ABORTED:
> +			dev_dbg(dev, "bio failed\n");
> +			rc = -ENXIO;
> +			break;
> +		case DMA_TRANS_NOERROR:
> +		default:
> +			break;
> +		}
> +	}
> +
> +	if (req->cmd_flags & REQ_FUA)
> +		nvdimm_flush(nd_region);
> +
> +	blk_mq_end_request(cmd->rq, rc);

blk_mq_end_request takes a blk_status_t.  Note that sparse would
have caught that, so please run your code through sparse.
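
(For reference, sparse can be run against just this file with

	make C=2 drivers/nvdimm/pmem.o

where C=2 forces the check even if the object is already built.)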

> +	if (cmd->sg_nents > 128) {

WARN_ON_ONCE as this should not happen.

> +	if (queue_mode == PMEM_Q_MQ) {
>  		pmem->tag_set.ops = &pmem_mq_ops;
> -		pmem->tag_set.nr_hw_queues = nr_online_nodes;
> -		pmem->tag_set.queue_depth = 64;
> +		pmem->tag_set.nr_hw_queues = nr_online_nodes * q_per_node;
> +		pmem->tag_set.queue_depth = queue_depth;

please use present nodes - we don't remap on cpu soft online/offline.

>  		pmem->tag_set.numa_node = dev_to_node(dev);
> -		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd);
> +		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd) +
> +			sizeof(struct scatterlist) * 128;

s/128/queue_depth/

>  		pmem->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>  		pmem->tag_set.driver_data = pmem;
>  
> @@ -466,7 +649,11 @@ static int pmem_attach_disk(struct device *dev,
>  	blk_queue_write_cache(pmem->q, wbc, fua);
>  	blk_queue_physical_block_size(pmem->q, PAGE_SIZE);
>  	blk_queue_logical_block_size(pmem->q, pmem_sector_size(ndns));
> -	blk_queue_max_hw_sectors(pmem->q, UINT_MAX);
> +	if (queue_mode == PMEM_Q_MQ) {
> +		blk_queue_max_hw_sectors(pmem->q, 0x200000);
> +		blk_queue_max_segments(pmem->q, 128);

Where do these magic numbers come from?

Dave Jiang Aug. 11, 2017, 6:01 p.m. UTC | #2
On 08/11/2017 04:04 AM, Christoph Hellwig wrote:
> On Mon, Aug 07, 2017 at 09:39:59AM -0700, Dave Jiang wrote:
>> +static int queue_mode = PMEM_Q_MQ;
> 
> So this changes the default for everyone.  How about those systems
> without a DMA engine?

The code requests a DMA engine during initialization to see if one is
available; if no DMA engines are found, it disables the DMA
implementation and reverts to the BIO path. With a separate driver, it
would just use blk-mq without DMA.
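
The probe-time check in the patch reduces to:

	chan = dma_find_channel(DMA_MEMCPY_SG);
	if (!chan) {
		queue_mode = PMEM_Q_BIO;
		dev_warn(dev, "Forced back to PMEM_Q_BIO, no DMA\n");
	}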

> 
>>  module_param(queue_mode, int, 0444);
>> -MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ)");
>> +MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ & DMA)");
>> +
>> +static int queue_depth = 128;
>> +module_param(queue_depth, int, 0444);
>> +MODULE_PARM_DESC(queue_depth, "I/O Queue Depth for multi queue mode");
> 
> This changes the default from the previous patch; why not introduce
> this parameter and its default there?  And an explanation of the magic
> value would still be nice.

The value is related to the number of ioatdma descriptors per channel.
The maximum number of descriptors per channel is 64k. A queue depth of
128 with a possible scatterlist size of 256 entries gives 32k, which
seemed a reasonable number given that the DMA channels may be shared
with other users as well. On other platforms this number will have to
be adjusted for that platform's DMA engine capability.
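
Spelled out (illustrative, using the ioatdma numbers above):

	descriptors per channel:        64k
	worst case sg entries/request:  256
	queue_depth * max sg = 128 * 256 = 32768 (32k) descriptors,

i.e. half of one channel's descriptors, leaving headroom in case the
channels are shared with other dmaengine users.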

> 
>> +/* typically maps to number of DMA channels/devices per socket */
>> +static int q_per_node = 8;
>> +module_param(q_per_node, int, 0444);
>> +MODULE_PARM_DESC(q_per_node, "Hardware queues per node\n");
> 
> How can a fixed constant "typically map" to a hardware-dependent
> resource?

My concern is that this is dependent on which platform the driver is
running on. ioatdma provides 8 channels per socket; other DMA engines
may be different, so the value needs to be adjustable.

> 
>> +	if (res) {
> 
> unlikely?

ok

> 
>> +		enum dmaengine_tx_result dma_err = res->result;
>> +
>> +		switch (dma_err) {
> 
> do you really need the local variable for a single check of it?

ok

> 
>> +		case DMA_TRANS_READ_FAILED:
>> +		case DMA_TRANS_WRITE_FAILED:
>> +		case DMA_TRANS_ABORTED:
>> +			dev_dbg(dev, "bio failed\n");
>> +			rc = -ENXIO;
>> +			break;
>> +		case DMA_TRANS_NOERROR:
>> +		default:
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (req->cmd_flags & REQ_FUA)
>> +		nvdimm_flush(nd_region);
>> +
>> +	blk_mq_end_request(cmd->rq, rc);
> 
> blk_mq_end_request takes a blk_status_t.  Note that sparse would
> have caught that, so please run your code through sparse.

ok

> 
>> +	if (cmd->sg_nents > 128) {
> 
> WARN_ON_ONCE as this should not happen.

ok

> 
>> +	if (queue_mode == PMEM_Q_MQ) {
>>  		pmem->tag_set.ops = &pmem_mq_ops;
>> -		pmem->tag_set.nr_hw_queues = nr_online_nodes;
>> -		pmem->tag_set.queue_depth = 64;
>> +		pmem->tag_set.nr_hw_queues = nr_online_nodes * q_per_node;
>> +		pmem->tag_set.queue_depth = queue_depth;
> 
> please use present nodes - we don't remap on cpu soft online/offline.

ok

> 
>>  		pmem->tag_set.numa_node = dev_to_node(dev);
>> -		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd);
>> +		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd) +
>> +			sizeof(struct scatterlist) * 128;
> 
> s/128/queue_depth/

Good catch. But it shouldn't be queue_depth, right? It would be
whatever we define max_segments to be.
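
Something like the following, where pmem_max_segments is a hypothetical
tunable, not part of this patch:

	pmem->tag_set.cmd_size = sizeof(struct pmem_cmd) +
			sizeof(struct scatterlist) * pmem_max_segments;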

> 
>>  		pmem->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>>  		pmem->tag_set.driver_data = pmem;
>>  
>> @@ -466,7 +649,11 @@ static int pmem_attach_disk(struct device *dev,
>>  	blk_queue_write_cache(pmem->q, wbc, fua);
>>  	blk_queue_physical_block_size(pmem->q, PAGE_SIZE);
>>  	blk_queue_logical_block_size(pmem->q, pmem_sector_size(ndns));
>> -	blk_queue_max_hw_sectors(pmem->q, UINT_MAX);
>> +	if (queue_mode == PMEM_Q_MQ) {
>> +		blk_queue_max_hw_sectors(pmem->q, 0x200000);
>> +		blk_queue_max_segments(pmem->q, 128);
> 
> Where do these magic numbers come from?
>

The 2M is a limitation of ioatdma. I really should plumb the dmaengine
subsystem to provide the xfer_cap.

I'm assuming max_segments limits the number of scatterlist entries to
that value? I should make that a tunable...
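
A rough sketch of that direction (both the max_sg_segs parameter and
the xfer_cap field on struct dma_device are hypothetical here, not
existing dmaengine API):

	static int max_sg_segs = 128;
	module_param(max_sg_segs, int, 0444);
	MODULE_PARM_DESC(max_sg_segs, "Max scatterlist entries per request");

	/* at queue setup, take limits from the DMA device instead of
	 * hard-coded constants (xfer_cap is hypothetical; >> 9 converts
	 * bytes to 512-byte sectors):
	 */
	blk_queue_max_hw_sectors(pmem->q, dma->xfer_cap >> 9);
	blk_queue_max_segments(pmem->q, max_sg_segs);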

Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 519b949..8eeb646 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -32,6 +32,8 @@ 
 #include <linux/dax.h>
 #include <linux/nd.h>
 #include <linux/blk-mq.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
 #include "pmem.h"
 #include "pfn.h"
 #include "nd.h"
@@ -41,12 +43,24 @@  enum {
 	PMEM_Q_MQ = 1,
 };
 
-static int queue_mode = PMEM_Q_BIO;
+static int queue_mode = PMEM_Q_MQ;
 module_param(queue_mode, int, 0444);
-MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ)");
+MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ & DMA)");
+
+static int queue_depth = 128;
+module_param(queue_depth, int, 0444);
+MODULE_PARM_DESC(queue_depth, "I/O Queue Depth for multi queue mode");
+
+/* typically maps to number of DMA channels/devices per socket */
+static int q_per_node = 8;
+module_param(q_per_node, int, 0444);
+MODULE_PARM_DESC(q_per_node, "Hardware queues per node\n");
 
 struct pmem_cmd {
 	struct request *rq;
+	struct dma_chan *chan;
+	int sg_nents;
+	struct scatterlist sg[];
 };
 
 static struct device *to_dev(struct pmem_device *pmem)
@@ -298,6 +312,159 @@  static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
+static void nd_pmem_dma_callback(void *data,
+		const struct dmaengine_result *res)
+{
+	struct pmem_cmd *cmd = data;
+	struct request *req = cmd->rq;
+	struct request_queue *q = req->q;
+	struct pmem_device *pmem = q->queuedata;
+	struct nd_region *nd_region = to_region(pmem);
+	struct device *dev = to_dev(pmem);
+	int rc = 0;
+
+	if (res) {
+		enum dmaengine_tx_result dma_err = res->result;
+
+		switch (dma_err) {
+		case DMA_TRANS_READ_FAILED:
+		case DMA_TRANS_WRITE_FAILED:
+		case DMA_TRANS_ABORTED:
+			dev_dbg(dev, "bio failed\n");
+			rc = -ENXIO;
+			break;
+		case DMA_TRANS_NOERROR:
+		default:
+			break;
+		}
+	}
+
+	if (req->cmd_flags & REQ_FUA)
+		nvdimm_flush(nd_region);
+
+	blk_mq_end_request(cmd->rq, rc);
+}
+
+static int pmem_handle_cmd_dma(struct pmem_cmd *cmd, bool is_write)
+{
+	struct request *req = cmd->rq;
+	struct request_queue *q = req->q;
+	struct pmem_device *pmem = q->queuedata;
+	struct device *dev = to_dev(pmem);
+	phys_addr_t pmem_off = blk_rq_pos(req) * 512 + pmem->data_offset;
+	void *pmem_addr = pmem->virt_addr + pmem_off;
+	struct nd_region *nd_region = to_region(pmem);
+	size_t len;
+	struct dma_device *dma = cmd->chan->device;
+	struct dmaengine_unmap_data *unmap;
+	dma_cookie_t cookie;
+	struct dma_async_tx_descriptor *txd;
+	struct page *page;
+	unsigned int off;
+	int rc;
+	enum dma_data_direction dir;
+	dma_addr_t dma_addr;
+
+	if (req->cmd_flags & REQ_FLUSH)
+		nvdimm_flush(nd_region);
+
+	unmap = dmaengine_get_unmap_data(dma->dev, 2, GFP_NOWAIT);
+	if (!unmap) {
+		dev_dbg(dev, "failed to get dma unmap data\n");
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * If reading from pmem, writing to scatterlist,
+	 * and if writing to pmem, reading from scatterlist.
+	 */
+	dir = is_write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	cmd->sg_nents = blk_rq_map_sg(req->q, req, cmd->sg);
+	if (cmd->sg_nents < 1) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	if (cmd->sg_nents > 128) {
+		rc = -ENOMEM;
+		dev_warn(dev, "Number of sg greater than allocated\n");
+		goto err;
+	}
+
+	rc = dma_map_sg(dma->dev, cmd->sg, cmd->sg_nents, dir);
+	if (rc < 1) {
+		rc = -ENXIO;
+		goto err;
+	}
+
+	len = blk_rq_payload_bytes(req);
+	page = virt_to_page(pmem_addr);
+	off = offset_in_page(pmem_addr);
+	dir = is_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+	dma_addr = dma_map_page(dma->dev, page, off, len, dir);
+	if (dma_mapping_error(dma->dev, unmap->addr[0])) {
+		dev_dbg(dma->dev, "src DMA mapping error\n");
+		rc = -ENXIO;
+		goto err_unmap_sg;
+	}
+
+	unmap->len = len;
+
+	if (is_write) {
+		unmap->addr[0] = dma_addr;
+		dma_unmap_data_set_virt(unmap, cmd->sg, 1);
+		unmap->to_cnt = 1;
+		unmap->from_cnt = 0;
+		dma_unmap_data_sg_from_nents(unmap, 2) = cmd->sg_nents;
+	} else {
+		dma_unmap_data_set_virt(unmap, cmd->sg, 0);
+		unmap->addr[1] = dma_addr;
+		unmap->from_cnt = 1;
+		unmap->to_cnt = 0;
+		dma_unmap_data_sg_to_nents(unmap, 2) = cmd->sg_nents;
+	}
+
+	txd = dma->device_prep_dma_memcpy_sg(cmd->chan,
+				cmd->sg, cmd->sg_nents, dma_addr,
+				!is_write, DMA_PREP_INTERRUPT);
+	if (!txd) {
+		dev_dbg(dma->dev, "dma prep failed\n");
+		rc = -ENXIO;
+		goto err_unmap_buffer;
+	}
+
+	txd->callback_result = nd_pmem_dma_callback;
+	txd->callback_param = cmd;
+	dma_set_unmap(txd, unmap);
+	cookie = dmaengine_submit(txd);
+	if (dma_submit_error(cookie)) {
+		dev_dbg(dma->dev, "dma submit error\n");
+		rc = -ENXIO;
+		goto err_set_unmap;
+	}
+
+	dmaengine_unmap_put(unmap);
+	dma_async_issue_pending(cmd->chan);
+
+	return 0;
+
+err_set_unmap:
+	dmaengine_unmap_put(unmap);
+err_unmap_buffer:
+	dma_unmap_page(dev, dma_addr, len, dir);
+err_unmap_sg:
+	if (dir == DMA_TO_DEVICE)
+		dir = DMA_FROM_DEVICE;
+	else
+		dir = DMA_TO_DEVICE;
+	dma_unmap_sg(dev, cmd->sg, cmd->sg_nents, dir);
+	dmaengine_unmap_put(unmap);
+err:
+	blk_mq_end_request(cmd->rq, rc);
+	return rc;
+}
+
 static int pmem_handle_cmd(struct pmem_cmd *cmd)
 {
 	struct request *req = cmd->rq;
@@ -331,12 +498,18 @@  static blk_status_t pmem_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
 	struct pmem_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	struct request *req;
+	int rc;
 
-	cmd->rq = bd->rq;
-
-	blk_mq_start_request(bd->rq);
+	req = cmd->rq = bd->rq;
+	cmd->chan = dma_find_channel(DMA_MEMCPY_SG);
+	blk_mq_start_request(req);
 
-	if (pmem_handle_cmd(cmd) < 0)
+	if (cmd->chan && bio_multiple_segments(req->bio))
+		rc = pmem_handle_cmd_dma(cmd, op_is_write(req_op(req)));
+	else
+		rc = pmem_handle_cmd(cmd);
+	if (rc < 0)
 		return BLK_STS_IOERR;
 	else
 		return BLK_STS_OK;
@@ -363,6 +536,7 @@  static int pmem_attach_disk(struct device *dev,
 	struct gendisk *disk;
 	void *addr;
 	int rc;
+	struct dma_chan *chan;
 
 	/* while nsio_rw_bytes is active, parse a pfn info block if present */
 	if (is_nd_pfn(dev)) {
@@ -396,11 +570,20 @@  static int pmem_attach_disk(struct device *dev,
 	}
 
 	if (queue_mode == PMEM_Q_MQ) {
+		chan = dma_find_channel(DMA_MEMCPY_SG);
+		if (!chan) {
+			queue_mode = PMEM_Q_BIO;
+			dev_warn(dev, "Forced back to PMEM_Q_BIO, no DMA\n");
+		}
+	}
+
+	if (queue_mode == PMEM_Q_MQ) {
 		pmem->tag_set.ops = &pmem_mq_ops;
-		pmem->tag_set.nr_hw_queues = nr_online_nodes;
-		pmem->tag_set.queue_depth = 64;
+		pmem->tag_set.nr_hw_queues = nr_online_nodes * q_per_node;
+		pmem->tag_set.queue_depth = queue_depth;
 		pmem->tag_set.numa_node = dev_to_node(dev);
-		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd);
+		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd) +
+			sizeof(struct scatterlist) * 128;
 		pmem->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 		pmem->tag_set.driver_data = pmem;
 
@@ -466,7 +649,11 @@  static int pmem_attach_disk(struct device *dev,
 	blk_queue_write_cache(pmem->q, wbc, fua);
 	blk_queue_physical_block_size(pmem->q, PAGE_SIZE);
 	blk_queue_logical_block_size(pmem->q, pmem_sector_size(ndns));
-	blk_queue_max_hw_sectors(pmem->q, UINT_MAX);
+	if (queue_mode == PMEM_Q_MQ) {
+		blk_queue_max_hw_sectors(pmem->q, 0x200000);
+		blk_queue_max_segments(pmem->q, 128);
+	} else
+		blk_queue_max_hw_sectors(pmem->q, UINT_MAX);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->q);
 	queue_flag_set_unlocked(QUEUE_FLAG_DAX, pmem->q);
 	pmem->q->queuedata = pmem;
@@ -628,15 +815,22 @@  static struct nd_device_driver nd_pmem_driver = {
 
 static int __init pmem_init(void)
 {
+	if (queue_mode == PMEM_Q_MQ)
+		dmaengine_get();
+
 	return nd_driver_register(&nd_pmem_driver);
 }
 module_init(pmem_init);
 
 static void pmem_exit(void)
 {
+	if (queue_mode == PMEM_Q_MQ)
+		dmaengine_put();
+
 	driver_unregister(&nd_pmem_driver.drv);
 }
 module_exit(pmem_exit);
 
+MODULE_SOFTDEP("pre: dmaengine");
 MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
 MODULE_LICENSE("GPL v2");