
[v8,13/13] nvmet: Optionally use PCI P2P memory

Message ID 20180927165420.5290-14-logang@deltatee.com (mailing list archive)
State Superseded
Series Copy Offload in NVMe Fabrics with P2P PCI Memory

Commit Message

Logan Gunthorpe Sept. 27, 2018, 4:54 p.m. UTC
We create a configfs attribute in each nvme-fabrics target port to
enable p2p memory use. When enabled, the port will use p2p memory
only if a p2p memory device can be found that is behind the same
switch hierarchy as the RDMA port and all the block devices in use.
If the user enables it and no such device is found, the system will
silently fall back to regular memory.

If appropriate, that port will allocate memory for the RDMA buffers
for queues from the p2pmem device, falling back to system memory should
anything fail.

Ideally, we'd want to use an NVMe CMB buffer as p2p memory. This would
save an extra PCI transfer, as the NVMe card could just take the data
out of its own memory. However, at this time, only a limited number
of cards with CMB buffers seem to be available.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
[hch: partial rewrite of the initial code]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/nvme/target/configfs.c    |  36 ++++++++
 drivers/nvme/target/core.c        | 138 +++++++++++++++++++++++++++++-
 drivers/nvme/target/io-cmd-bdev.c |   3 +
 drivers/nvme/target/nvmet.h       |  13 +++
 drivers/nvme/target/rdma.c        |   2 +
 5 files changed, 191 insertions(+), 1 deletion(-)

Comments

Keith Busch Sept. 27, 2018, 5:12 p.m. UTC | #1
On Thu, Sep 27, 2018 at 10:54:20AM -0600, Logan Gunthorpe wrote:
> We create a configfs attribute in each nvme-fabrics target port to
> enable p2p memory use. When enabled, the port will use p2p memory
> only if a p2p memory device can be found that is behind the same
> switch hierarchy as the RDMA port and all the block devices in use.
> If the user enables it and no such device is found, the system will
> silently fall back to regular memory.
> 
> If appropriate, that port will allocate memory for the RDMA buffers
> for queues from the p2pmem device, falling back to system memory should
> anything fail.
> 
> Ideally, we'd want to use an NVMe CMB buffer as p2p memory. This would
> save an extra PCI transfer, as the NVMe card could just take the data
> out of its own memory. However, at this time, only a limited number
> of cards with CMB buffers seem to be available.
> 
> Signed-off-by: Stephen Bates <sbates@raithlin.com>
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> [hch: partial rewrite of the initial code]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>

I don't have the necessary hardware to try this out, but I'm looking
forward to it in the future. Looks good.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Logan Gunthorpe Sept. 27, 2018, 5:29 p.m. UTC | #2
On 2018-09-27 11:12 AM, Keith Busch wrote:
> Reviewed-by: Keith Busch <keith.busch@intel.com>

Thanks for the reviews Keith!

Logan
Sagi Grimberg Oct. 1, 2018, 9:34 p.m. UTC | #3
On 09/27/2018 09:54 AM, Logan Gunthorpe wrote:
> We create a configfs attribute in each nvme-fabrics target port to
> enable p2p memory use. When enabled, the port will use p2p memory
> only if a p2p memory device can be found that is behind the same
> switch hierarchy as the RDMA port and all the block devices in use.
> If the user enables it and no such device is found, the system will
> silently fall back to regular memory.
> 
> If appropriate, that port will allocate memory for the RDMA buffers
> for queues from the p2pmem device, falling back to system memory should
> anything fail.
> 
> Ideally, we'd want to use an NVMe CMB buffer as p2p memory. This would
> save an extra PCI transfer, as the NVMe card could just take the data
> out of its own memory. However, at this time, only a limited number
> of cards with CMB buffers seem to be available.
> 
> Signed-off-by: Stephen Bates <sbates@raithlin.com>
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> [hch: partial rewrite of the initial code]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> ---
>   drivers/nvme/target/configfs.c    |  36 ++++++++
>   drivers/nvme/target/core.c        | 138 +++++++++++++++++++++++++++++-
>   drivers/nvme/target/io-cmd-bdev.c |   3 +
>   drivers/nvme/target/nvmet.h       |  13 +++
>   drivers/nvme/target/rdma.c        |   2 +
>   5 files changed, 191 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
> index b37a8e3e3f80..0dfb0e0c3d21 100644
> --- a/drivers/nvme/target/configfs.c
> +++ b/drivers/nvme/target/configfs.c
> @@ -17,6 +17,8 @@
>   #include <linux/slab.h>
>   #include <linux/stat.h>
>   #include <linux/ctype.h>
> +#include <linux/pci.h>
> +#include <linux/pci-p2pdma.h>
>   
>   #include "nvmet.h"
>   
> @@ -1094,6 +1096,37 @@ static void nvmet_port_release(struct config_item *item)
>   	kfree(port);
>   }
>   
> +#ifdef CONFIG_PCI_P2PDMA
> +static ssize_t nvmet_p2pmem_show(struct config_item *item, char *page)
> +{
> +	struct nvmet_port *port = to_nvmet_port(item);
> +
> +	return pci_p2pdma_enable_show(page, port->p2p_dev, port->use_p2pmem);
> +}
> +
> +static ssize_t nvmet_p2pmem_store(struct config_item *item,
> +		const char *page, size_t count)
> +{
> +	struct nvmet_port *port = to_nvmet_port(item);
> +	struct pci_dev *p2p_dev = NULL;
> +	bool use_p2pmem;
> +	int error;
> +
> +	error = pci_p2pdma_enable_store(page, &p2p_dev, &use_p2pmem);
> +	if (error)
> +		return error;
> +
> +	down_write(&nvmet_config_sem);
> +	port->use_p2pmem = use_p2pmem;
> +	pci_dev_put(port->p2p_dev);
> +	port->p2p_dev = p2p_dev;
> +	up_write(&nvmet_config_sem);
> +
> +	return count;
> +}
> +CONFIGFS_ATTR(nvmet_, p2pmem);
> +#endif /* CONFIG_PCI_P2PDMA */
> +
>   static struct configfs_attribute *nvmet_port_attrs[] = {
>   	&nvmet_attr_addr_adrfam,
>   	&nvmet_attr_addr_treq,
> @@ -1101,6 +1134,9 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
>   	&nvmet_attr_addr_trsvcid,
>   	&nvmet_attr_addr_trtype,
>   	&nvmet_attr_param_inline_data_size,
> +#ifdef CONFIG_PCI_P2PDMA
> +	&nvmet_attr_p2pmem,
> +#endif
>   	NULL,
>   };
>   
> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> index bddd1599b826..7ade16cb4ed3 100644
> --- a/drivers/nvme/target/core.c
> +++ b/drivers/nvme/target/core.c
> @@ -15,6 +15,7 @@
>   #include <linux/module.h>
>   #include <linux/random.h>
>   #include <linux/rculist.h>
> +#include <linux/pci-p2pdma.h>
>   
>   #include "nvmet.h"
>   
> @@ -365,9 +366,29 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
>   	nvmet_file_ns_disable(ns);
>   }
>   
> +static int nvmet_p2pdma_add_client(struct nvmet_ctrl *ctrl,
> +		struct nvmet_ns *ns)
> +{
> +	int ret;
> +
> +	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
> +		pr_err("peer-to-peer DMA is not supported by %s\n",
> +		       ns->device_path);
> +		return -EINVAL;
> +	}
> +
> +	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
> +	if (ret)
> +		pr_err("failed to add peer-to-peer DMA client %s: %d\n",
> +		       ns->device_path, ret);
> +
> +	return ret;
> +}
> +
>   int nvmet_ns_enable(struct nvmet_ns *ns)
>   {
>   	struct nvmet_subsys *subsys = ns->subsys;
> +	struct nvmet_ctrl *ctrl;
>   	int ret;
>   
>   	mutex_lock(&subsys->lock);
> @@ -389,6 +410,14 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
>   	if (ret)
>   		goto out_dev_put;
>   
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +		if (ctrl->p2p_dev) {
> +			ret = nvmet_p2pdma_add_client(ctrl, ns);
> +			if (ret)
> +				goto out_remove_clients;
> +		}
> +	}
> +
>   	if (ns->nsid > subsys->max_nsid)
>   		subsys->max_nsid = ns->nsid;
>   
> @@ -417,6 +446,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
>   out_unlock:
>   	mutex_unlock(&subsys->lock);
>   	return ret;
> +out_remove_clients:
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
> +		pci_p2pdma_remove_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
>   out_dev_put:
>   	nvmet_ns_dev_disable(ns);
>   	goto out_unlock;
> @@ -425,6 +457,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
>   void nvmet_ns_disable(struct nvmet_ns *ns)
>   {
>   	struct nvmet_subsys *subsys = ns->subsys;
> +	struct nvmet_ctrl *ctrl;
>   
>   	mutex_lock(&subsys->lock);
>   	if (!ns->enabled)
> @@ -450,6 +483,12 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
>   	percpu_ref_exit(&ns->ref);
>   
>   	mutex_lock(&subsys->lock);
> +
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +		pci_p2pdma_remove_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
> +		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);

Hi Logan, what is this event here?

> +	}
> +
>   	subsys->nr_namespaces--;
>   	nvmet_ns_changed(subsys, ns->nsid);
>   	nvmet_ns_dev_disable(ns);
> @@ -727,6 +766,23 @@ EXPORT_SYMBOL_GPL(nvmet_req_execute);
>   
>   int nvmet_req_alloc_sgl(struct nvmet_req *req, struct nvmet_sq *sq)
>   {
> +	struct pci_dev *p2p_dev = NULL;
> +
> +	if (IS_ENABLED(CONFIG_PCI_P2PDMA)) {
> +		if (sq->ctrl)
> +			p2p_dev = sq->ctrl->p2p_dev;
> +
> +		req->p2p_dev = NULL;
> +		if (sq->qid && p2p_dev) {
> +			req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
> +						       req->transfer_len);
> +			if (req->sg) {
> +				req->p2p_dev = p2p_dev;
> +				return 0;
> +			}

Would be useful to comment that we fall back to normal sgl allocation.

> +		}
> +	}
> +
>   	req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
>   	if (!req->sg)
>   		return -ENOMEM;
> @@ -737,7 +793,11 @@ EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
>   
>   void nvmet_req_free_sgl(struct nvmet_req *req)
>   {
> -	sgl_free(req->sg);
> +	if (req->p2p_dev)
> +		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
> +	else
> +		sgl_free(req->sg);
> +
>   	req->sg = NULL;
>   	req->sg_cnt = 0;
>   }
> @@ -939,6 +999,79 @@ bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
>   		return __nvmet_host_allowed(subsys, hostnqn);
>   }
>   
> +/*
> + * If allow_p2pmem is set, we will try to use P2P memory for the SGL lists for
> + * I/O commands. This requires the PCI p2p device to be compatible with the
> + * backing device for every namespace on this controller.
> + */
> +static void nvmet_setup_p2pmem(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
> +{
> +	struct nvmet_ns *ns;
> +	int ret;
> +
> +	if (!req->port->use_p2pmem || !req->p2p_client)
> +		return;

Nit, IMO would be better to check at the call-site, but not a hard
must...

I still do not fully understand why p2p_dev has to be ctrl-wide and not
per namespace. Sorry to keep bringing this up (again). But if people are
OK with it then I guess I can stop asking about this...


> +
> +	mutex_lock(&ctrl->subsys->lock);
> +
> +	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, req->p2p_client);
> +	if (ret) {
> +		pr_err("failed adding peer-to-peer DMA client %s: %d\n",
> +		       dev_name(req->p2p_client), ret);
> +		goto free_devices;
> +	}
> +
> +	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
> +		ret = nvmet_p2pdma_add_client(ctrl, ns);
> +		if (ret)
> +			goto free_devices;

I think that at some point we said that this looks like it should fall
back to host memory for those namespaces... by the time we allocate the
sgl, we have already assigned a namespace to the request (nvmet_req_init).

Aside from my questions the patch looks good.
Logan Gunthorpe Oct. 1, 2018, 9:55 p.m. UTC | #4
On 2018-10-01 3:34 p.m., Sagi Grimberg wrote:
>> +
>> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
>> +		pci_p2pdma_remove_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
>> +		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
> 
> Hi Logan, what is this event here?

Oops, that must have been from a bad rebase.... Will Fix.

>> +		if (sq->qid && p2p_dev) {
>> +			req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
>> +						       req->transfer_len);
>> +			if (req->sg) {
>> +				req->p2p_dev = p2p_dev;
>> +				return 0;
>> +			}
> 
> Would be useful to comment that we fall back to normal sgl allocation.

Ok.

>> +/*
>> + * If allow_p2pmem is set, we will try to use P2P memory for the SGL lists for
>> + * I/O commands. This requires the PCI p2p device to be compatible with the
>> + * backing device for every namespace on this controller.
>> + */
>> +static void nvmet_setup_p2pmem(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
>> +{
>> +	struct nvmet_ns *ns;
>> +	int ret;
>> +
>> +	if (!req->port->use_p2pmem || !req->p2p_client)
>> +		return;
> 
> Nit, IMO would be better to check at the call-site, but not a hard
> must...

I'd rather keep the logic for whether to enable p2pmem in its own
function. nvmet_alloc_ctrl() is already very long and complicated.

> I still do not fully understand why p2p_dev has to be ctrl-wide and not
> per namespace. Sorry to keep bringing this up (again). But if people are
> OK with it then I guess I can stop asking about this...

Because you never answered my question back in March[1] (which I think
you've answered below)....

> I think that at some point we said that this looks like it should fall
> back to host memory for those namespaces... by the time we allocate the
> sgl, we have already assigned a namespace to the request (nvmet_req_init).

I did not realize the namespace would be available at this time. I guess
I can give this a try, but it's going to be a fairly big change from
what's presented here... Though, I agree it'll probably be an improvement.

Logan

[1]
https://lore.kernel.org/lkml/7163af93-2f37-a8b6-986a-3cb2e62bee29@deltatee.com/T/#u
Sagi Grimberg Oct. 1, 2018, 10:23 p.m. UTC | #5
>>> +/*
>>> + * If allow_p2pmem is set, we will try to use P2P memory for the SGL lists for
>>> + * I/O commands. This requires the PCI p2p device to be compatible with the
>>> + * backing device for every namespace on this controller.
>>> + */
>>> +static void nvmet_setup_p2pmem(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
>>> +{
>>> +	struct nvmet_ns *ns;
>>> +	int ret;
>>> +
>>> +	if (!req->port->use_p2pmem || !req->p2p_client)
>>> +		return;
>>
>> Nit, IMO would be better to check at the call-site, but not a hard
>> must...
> 
> I'd rather keep the logic for whether to enable p2pmem in its own
> function. nvmet_alloc_ctrl() is already very long and complicated.

Fair enough..

>> I still do not fully understand why p2p_dev has to be ctrl-wide and not
>> per namespace. Sorry to keep bringing this up (again). But if people are
>> OK with it then I guess I can stop asking about this...
> 
> Because you never answered my question back in March[1] (which I think
> you've answered below)....

I'm sorry... I lost track of this...

>> I think that at some point we said that this looks like it should fall
>> back to host memory for those namespaces... by the time we allocate the
>> sgl, we have already assigned a namespace to the request (nvmet_req_init).
> 
> I did not realize the namespace would be available at this time. I guess
> I can give this a try, but it's going to be a fairly big change from
> what's presented here... Though, I agree it'll probably be an improvement.

Thanks, if it turns out to create too much churn, we could defer
that to a later stage, but we can at least document it.
Logan Gunthorpe Oct. 1, 2018, 11:43 p.m. UTC | #6
On 01/10/18 04:23 PM, Sagi Grimberg wrote:
>> I did not realize the namespace would be available at this time. I guess
>> I can give this a try, but it's going to be a fairly big change from
>> what's presented here... Though, I agree it'll probably be an
>> improvement.
> 
> Thanks, if it turns out to create too much churn, we could defer
> that to a later stage, but we can at least document it.

Yeah, it's going to create a bunch of churn, but it's probably worth
doing before merging because I think it will remove a bunch of
complexity (i.e. the need for the whole p2p client infrastructure,
because we then only need to worry about one namespace at a time
instead of needing to find a p2p device that works with all namespaces
at once).
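
Roughly, I'd expect the allocation path to collapse into something like
the sketch below. This is untested, and the per-namespace p2p_dev pointer
is hypothetical at this point -- it would have to be assigned when the
namespace is enabled -- but it shows why the client-list machinery can
go away:

/*
 * Rough sketch only: assumes each nvmet_ns carries its own p2p_dev
 * pointer (hypothetical field, set at namespace enable time), so no
 * ctrl-wide client list is needed.  Falls back to regular host memory
 * if no compatible P2P device is available or the P2P allocation fails.
 */
int nvmet_req_alloc_sgl(struct nvmet_req *req, struct nvmet_sq *sq)
{
	struct pci_dev *p2p_dev = NULL;

	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && sq->qid && req->ns)
		p2p_dev = req->ns->p2p_dev;	/* hypothetical field */

	req->p2p_dev = NULL;
	if (p2p_dev) {
		req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
					       req->transfer_len);
		if (req->sg) {
			req->p2p_dev = p2p_dev;
			return 0;
		}
	}

	/* fall back to regular host memory */
	req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
	if (!req->sg)
		return -ENOMEM;

	return 0;
}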

I'll try to get a v9 with this change published in the next day or two.

Logan

Patch

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index b37a8e3e3f80..0dfb0e0c3d21 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -17,6 +17,8 @@ 
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/ctype.h>
+#include <linux/pci.h>
+#include <linux/pci-p2pdma.h>
 
 #include "nvmet.h"
 
@@ -1094,6 +1096,37 @@  static void nvmet_port_release(struct config_item *item)
 	kfree(port);
 }
 
+#ifdef CONFIG_PCI_P2PDMA
+static ssize_t nvmet_p2pmem_show(struct config_item *item, char *page)
+{
+	struct nvmet_port *port = to_nvmet_port(item);
+
+	return pci_p2pdma_enable_show(page, port->p2p_dev, port->use_p2pmem);
+}
+
+static ssize_t nvmet_p2pmem_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_port *port = to_nvmet_port(item);
+	struct pci_dev *p2p_dev = NULL;
+	bool use_p2pmem;
+	int error;
+
+	error = pci_p2pdma_enable_store(page, &p2p_dev, &use_p2pmem);
+	if (error)
+		return error;
+
+	down_write(&nvmet_config_sem);
+	port->use_p2pmem = use_p2pmem;
+	pci_dev_put(port->p2p_dev);
+	port->p2p_dev = p2p_dev;
+	up_write(&nvmet_config_sem);
+
+	return count;
+}
+CONFIGFS_ATTR(nvmet_, p2pmem);
+#endif /* CONFIG_PCI_P2PDMA */
+
 static struct configfs_attribute *nvmet_port_attrs[] = {
 	&nvmet_attr_addr_adrfam,
 	&nvmet_attr_addr_treq,
@@ -1101,6 +1134,9 @@  static struct configfs_attribute *nvmet_port_attrs[] = {
 	&nvmet_attr_addr_trsvcid,
 	&nvmet_attr_addr_trtype,
 	&nvmet_attr_param_inline_data_size,
+#ifdef CONFIG_PCI_P2PDMA
+	&nvmet_attr_p2pmem,
+#endif
 	NULL,
 };
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index bddd1599b826..7ade16cb4ed3 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -15,6 +15,7 @@ 
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/rculist.h>
+#include <linux/pci-p2pdma.h>
 
 #include "nvmet.h"
 
@@ -365,9 +366,29 @@  static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
 	nvmet_file_ns_disable(ns);
 }
 
+static int nvmet_p2pdma_add_client(struct nvmet_ctrl *ctrl,
+		struct nvmet_ns *ns)
+{
+	int ret;
+
+	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
+		pr_err("peer-to-peer DMA is not supported by %s\n",
+		       ns->device_path);
+		return -EINVAL;
+	}
+
+	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
+	if (ret)
+		pr_err("failed to add peer-to-peer DMA client %s: %d\n",
+		       ns->device_path, ret);
+
+	return ret;
+}
+
 int nvmet_ns_enable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
+	struct nvmet_ctrl *ctrl;
 	int ret;
 
 	mutex_lock(&subsys->lock);
@@ -389,6 +410,14 @@  int nvmet_ns_enable(struct nvmet_ns *ns)
 	if (ret)
 		goto out_dev_put;
 
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (ctrl->p2p_dev) {
+			ret = nvmet_p2pdma_add_client(ctrl, ns);
+			if (ret)
+				goto out_remove_clients;
+		}
+	}
+
 	if (ns->nsid > subsys->max_nsid)
 		subsys->max_nsid = ns->nsid;
 
@@ -417,6 +446,9 @@  int nvmet_ns_enable(struct nvmet_ns *ns)
 out_unlock:
 	mutex_unlock(&subsys->lock);
 	return ret;
+out_remove_clients:
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+		pci_p2pdma_remove_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
 out_dev_put:
 	nvmet_ns_dev_disable(ns);
 	goto out_unlock;
@@ -425,6 +457,7 @@  int nvmet_ns_enable(struct nvmet_ns *ns)
 void nvmet_ns_disable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
+	struct nvmet_ctrl *ctrl;
 
 	mutex_lock(&subsys->lock);
 	if (!ns->enabled)
@@ -450,6 +483,12 @@  void nvmet_ns_disable(struct nvmet_ns *ns)
 	percpu_ref_exit(&ns->ref);
 
 	mutex_lock(&subsys->lock);
+
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		pci_p2pdma_remove_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
+		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
+	}
+
 	subsys->nr_namespaces--;
 	nvmet_ns_changed(subsys, ns->nsid);
 	nvmet_ns_dev_disable(ns);
@@ -727,6 +766,23 @@  EXPORT_SYMBOL_GPL(nvmet_req_execute);
 
 int nvmet_req_alloc_sgl(struct nvmet_req *req, struct nvmet_sq *sq)
 {
+	struct pci_dev *p2p_dev = NULL;
+
+	if (IS_ENABLED(CONFIG_PCI_P2PDMA)) {
+		if (sq->ctrl)
+			p2p_dev = sq->ctrl->p2p_dev;
+
+		req->p2p_dev = NULL;
+		if (sq->qid && p2p_dev) {
+			req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
+						       req->transfer_len);
+			if (req->sg) {
+				req->p2p_dev = p2p_dev;
+				return 0;
+			}
+		}
+	}
+
 	req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
 	if (!req->sg)
 		return -ENOMEM;
@@ -737,7 +793,11 @@  EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
 
 void nvmet_req_free_sgl(struct nvmet_req *req)
 {
-	sgl_free(req->sg);
+	if (req->p2p_dev)
+		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
+	else
+		sgl_free(req->sg);
+
 	req->sg = NULL;
 	req->sg_cnt = 0;
 }
@@ -939,6 +999,79 @@  bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
 		return __nvmet_host_allowed(subsys, hostnqn);
 }
 
+/*
+ * If allow_p2pmem is set, we will try to use P2P memory for the SGL lists for
+ * I/O commands. This requires the PCI p2p device to be compatible with the
+ * backing device for every namespace on this controller.
+ */
+static void nvmet_setup_p2pmem(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
+{
+	struct nvmet_ns *ns;
+	int ret;
+
+	if (!req->port->use_p2pmem || !req->p2p_client)
+		return;
+
+	mutex_lock(&ctrl->subsys->lock);
+
+	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, req->p2p_client);
+	if (ret) {
+		pr_err("failed adding peer-to-peer DMA client %s: %d\n",
+		       dev_name(req->p2p_client), ret);
+		goto free_devices;
+	}
+
+	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+		ret = nvmet_p2pdma_add_client(ctrl, ns);
+		if (ret)
+			goto free_devices;
+	}
+
+	if (req->port->p2p_dev) {
+		/* A specific P2P device was selected in configfs */
+		if (!pci_p2pdma_assign_provider(req->port->p2p_dev,
+						&ctrl->p2p_clients)) {
+			pr_info("peer-to-peer memory on %s is not supported\n",
+				pci_name(req->port->p2p_dev));
+			goto free_devices;
+		}
+		ctrl->p2p_dev = pci_dev_get(req->port->p2p_dev);
+	} else {
+		/*
+		 * No P2P device was provided in configfs, therefore find one
+		 * automatically.
+		 */
+		ctrl->p2p_dev = pci_p2pmem_find(&ctrl->p2p_clients);
+		if (!ctrl->p2p_dev) {
+			pr_info("no supported peer-to-peer memory devices found\n");
+			goto free_devices;
+		}
+	}
+
+	mutex_unlock(&ctrl->subsys->lock);
+
+	pr_info("using peer-to-peer memory on %s\n", pci_name(ctrl->p2p_dev));
+	return;
+
+free_devices:
+	pci_p2pdma_client_list_free(&ctrl->p2p_clients);
+	mutex_unlock(&ctrl->subsys->lock);
+}
+
+static void nvmet_release_p2pmem(struct nvmet_ctrl *ctrl)
+{
+	if (!ctrl->p2p_dev)
+		return;
+
+	mutex_lock(&ctrl->subsys->lock);
+
+	pci_p2pdma_client_list_free(&ctrl->p2p_clients);
+	pci_dev_put(ctrl->p2p_dev);
+	ctrl->p2p_dev = NULL;
+
+	mutex_unlock(&ctrl->subsys->lock);
+}
+
 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
 {
@@ -980,6 +1113,7 @@  u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 
 	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
 	INIT_LIST_HEAD(&ctrl->async_events);
+	INIT_LIST_HEAD(&ctrl->p2p_clients);
 
 	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
 	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
@@ -1041,6 +1175,7 @@  u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 		ctrl->kato = DIV_ROUND_UP(kato, 1000);
 	}
 	nvmet_start_keep_alive_timer(ctrl);
+	nvmet_setup_p2pmem(ctrl, req);
 
 	mutex_lock(&subsys->lock);
 	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
@@ -1079,6 +1214,7 @@  static void nvmet_ctrl_free(struct kref *ref)
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fatal_err_work);
 
+	nvmet_release_p2pmem(ctrl);
 	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
 
 	kfree(ctrl->sqs);
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 7bc9f6240432..5660dd7ca755 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -78,6 +78,9 @@  static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 		op = REQ_OP_READ;
 	}
 
+	if (is_pci_p2pdma_page(sg_page(req->sg)))
+		op_flags |= REQ_NOMERGE;
+
 	sector = le64_to_cpu(req->cmd->rw.slba);
 	sector <<= (req->ns->blksize_shift - 9);
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 7d6cb61021e4..297861064dd8 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -84,6 +84,11 @@  static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
 	return container_of(to_config_group(item), struct nvmet_ns, group);
 }
 
+static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
+{
+	return disk_to_dev(ns->bdev->bd_disk);
+}
+
 struct nvmet_cq {
 	u16			qid;
 	u16			size;
@@ -134,6 +139,8 @@  struct nvmet_port {
 	void				*priv;
 	bool				enabled;
 	int				inline_data_size;
+	bool				use_p2pmem;
+	struct pci_dev			*p2p_dev;
 };
 
 static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
@@ -182,6 +189,9 @@  struct nvmet_ctrl {
 	__le32			*changed_ns_list;
 	u32			nr_changed_ns;
 
+	struct pci_dev		*p2p_dev;
+	struct list_head	p2p_clients;
+
 	char			subsysnqn[NVMF_NQN_FIELD_LEN];
 	char			hostnqn[NVMF_NQN_FIELD_LEN];
 };
@@ -294,6 +304,9 @@  struct nvmet_req {
 
 	void (*execute)(struct nvmet_req *req);
 	const struct nvmet_fabrics_ops *ops;
+
+	struct pci_dev *p2p_dev;
+	struct device *p2p_client;
 };
 
 extern struct workqueue_struct *buffered_io_wq;
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index b0d0cedc74bb..e5f00449ac68 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -749,6 +749,8 @@  static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
 		cmd->send_sge.addr, cmd->send_sge.length,
 		DMA_TO_DEVICE);
 
+	cmd->req.p2p_client = &queue->dev->device->dev;
+
 	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
 			&queue->nvme_sq, &nvmet_rdma_ops))
 		return;