diff mbox series

[V10,6/8] PCI/P2PDMA: Add a 10-Bit Tag check in P2PDMA

Message ID 20211009104938.48225-7-liudongdong3@huawei.com (mailing list archive)
State Not Applicable
Headers show
Series PCI: Enable 10-Bit tag support for PCIe devices | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Dongdong Liu Oct. 9, 2021, 10:49 a.m. UTC
Add a 10-Bit Tag check in the P2PDMA code to ensure that a device with
10-Bit Tag Requester doesn't interact with a device that does not
support 10-Bit Tag Completer. Before that happens, the kernel should
emit a warning.

"echo 0 > /sys/bus/pci/devices/.../10bit_tag" to disable 10-Bit Tag
Requester for PF device.

"echo 0 > /sys/bus/pci/devices/.../sriov_vf_10bit_tag_ctl" to disable
10-Bit Tag Requester for VF device.

Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/pci/p2pdma.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

Comments

Bjorn Helgaas Oct. 27, 2021, 9:20 p.m. UTC | #1
On Sat, Oct 09, 2021 at 06:49:36PM +0800, Dongdong Liu wrote:
> Add a 10-Bit Tag check in the P2PDMA code to ensure that a device with
> 10-Bit Tag Requester doesn't interact with a device that does not
> support 10-Bit Tag Completer. 

Shouldn't this also take into account Extended Tags (8 bits)?  I think
the only tag size guaranteed to be supported is 5 bits.

> Before that happens, the kernel should emit a warning.

The warning is nice, but the critical thing is that the P2PDMA mapping
should fail so we don't attempt DMA in this situation.  I guess that's
sort of what you're saying with "ensure that a device ... doesn't
interact with a device ..."

> "echo 0 > /sys/bus/pci/devices/.../10bit_tag" to disable 10-Bit Tag
> Requester for PF device.
> 
> "echo 0 > /sys/bus/pci/devices/.../sriov_vf_10bit_tag_ctl" to disable
> 10-Bit Tag Requester for VF device.
> 
> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
> Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
> ---
>  drivers/pci/p2pdma.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 48 insertions(+)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 50cdde3e9a8b..804e390f4c22 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -19,6 +19,7 @@
>  #include <linux/random.h>
>  #include <linux/seq_buf.h>
>  #include <linux/xarray.h>
> +#include "pci.h"
>  
>  enum pci_p2pdma_map_type {
>  	PCI_P2PDMA_MAP_UNKNOWN = 0,
> @@ -410,6 +411,50 @@ static unsigned long map_types_idx(struct pci_dev *client)
>  		(client->bus->number << 8) | client->devfn;
>  }
>  
> +static bool pci_10bit_tags_unsupported(struct pci_dev *a,
> +				       struct pci_dev *b,
> +				       bool verbose)
> +{
> +	bool req;
> +	bool comp;
> +	u16 ctl;
> +	const char *str = "10bit_tag";
> +
> +	if (a->is_virtfn) {
> +#ifdef CONFIG_PCI_IOV
> +		req = !!(a->physfn->sriov->ctrl &
> +			 PCI_SRIOV_CTRL_VF_10BIT_TAG_REQ_EN);
> +#endif
> +	} else {
> +		pcie_capability_read_word(a, PCI_EXP_DEVCTL2, &ctl);
> +		req = !!(ctl & PCI_EXP_DEVCTL2_10BIT_TAG_REQ_EN);
> +	}
> +
> +	comp = !!(b->devcap2 & PCI_EXP_DEVCAP2_10BIT_TAG_COMP);
> +	/* 10-bit tags not enabled on requester */
> +	if (!req)
> +		return false;
> +
> +	 /* Completer can handle anything */
> +	if (comp)
> +		return false;
> +
> +	if (!verbose)
> +		return true;
> +
> +	pci_warn(a, "cannot be used for peer-to-peer DMA as 10-Bit Tag Requester enable is set for this device, but peer device (%s) does not support the 10-Bit Tag Completer\n",
> +		 pci_name(b));
> +
> +	if (a->is_virtfn)
> +		str = "sriov_vf_10bit_tag_ctl";
> +
> +	pci_warn(a, "to disable 10-Bit Tag Requester for this device, echo 0 > /sys/bus/pci/devices/%s/%s\n",
> +		 pci_name(a), str);
> +
> +	return true;
> +}
> +
>  /*
>   * Calculate the P2PDMA mapping type and distance between two PCI devices.
>   *
> @@ -532,6 +577,9 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
>  		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
>  	}
>  done:
> +	if (pci_10bit_tags_unsupported(client, provider, verbose))
> +		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
> +
>  	rcu_read_lock();
>  	p2pdma = rcu_dereference(provider->p2pdma);
>  	if (p2pdma)
> -- 
> 2.22.0
>
Bjorn Helgaas Oct. 27, 2021, 11:11 p.m. UTC | #2
On Sat, Oct 09, 2021 at 06:49:36PM +0800, Dongdong Liu wrote:
> Add a 10-Bit Tag check in the P2PDMA code to ensure that a device with
> 10-Bit Tag Requester doesn't interact with a device that does not
> support 10-Bit Tag Completer. Before that happens, the kernel should
> emit a warning.

> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
> Reviewed-by: Logan Gunthorpe <logang@deltatee.com>

> @@ -532,6 +577,9 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
>  		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
>  	}
>  done:
> +	if (pci_10bit_tags_unsupported(client, provider, verbose))
> +		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;

I need to be convinced that this check is in the right spot to catch
all potential P2PDMA situations.  The pci_p2pmem_find() and
pci_p2pdma_distance() interfaces eventually call
calc_map_type_and_dist().  But those interfaces don't actually produce
DMA bus addresses, and I'm not convinced that all P2PDMA users use
them.

nvme *does* use them, but infiniband (rdma_rw_map_sg()) does not, and
it calls pci_p2pdma_map_sg().

amdgpu_dma_buf_attach() calls pci_p2pdma_distance_many() but I don't
know where it sets up P2PDMA transactions.

cxgb4 and qed mention "peer2peer", but I don't know whether they are
related; they don't seem to use any pci_p2p.* interfaces.

>  	rcu_read_lock();
>  	p2pdma = rcu_dereference(provider->p2pdma);
>  	if (p2pdma)
> -- 
> 2.22.0
>
Logan Gunthorpe Oct. 27, 2021, 11:41 p.m. UTC | #3
On 2021-10-27 5:11 p.m., Bjorn Helgaas wrote:
>> @@ -532,6 +577,9 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
>>  		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
>>  	}
>>  done:
>> +	if (pci_10bit_tags_unsupported(client, provider, verbose))
>> +		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
> 
> I need to be convinced that this check is in the right spot to catch
> all potential P2PDMA situations.  The pci_p2pmem_find() and
> pci_p2pdma_distance() interfaces eventually call
> calc_map_type_and_dist().  But those interfaces don't actually produce
> DMA bus addresses, and I'm not convinced that all P2PDMA users use
> them.
> 
> nvme *does* use them, but infiniband (rdma_rw_map_sg()) does not, and
> it calls pci_p2pdma_map_sg().

The rules of the current code is that calc_map_type_and_dist() must be
called before pci_p2pdma_map_sg(). The calc function caches the mapping
type in an xarray. If it was not called ahead of time,
pci_p2pdma_map_type() will return PCI_P2PDMA_MAP_NOT_SUPPORTED, and the
WARN_ON_ONCE will be hit in
pci_p2pdma_map_sg_attrs().

Both NVMe and RDMA (only used in the nvme fabrics code) do the correct
thing here and we can be sure calc_map_type_and_dist() is called before
any pages are mapped.

The patch set I'm currently working on will ensure that
calc_map_type_and_dist() is called before anyone maps a PCI P2PDMA page
with dma_map_sg*().

> amdgpu_dma_buf_attach() calls pci_p2pdma_distance_many() but I don't
> know where it sets up P2PDMA transactions.

The amdgpu driver hacked this in before proper support was done, but at
least it's using pci_p2pdma_distance_many() presumably before trying any
transfer. Though it's likely broken as it doesn't take into account the
mapping type and thus I think it always assumes traffic goes through the
host bridge (seeing it doesn't use pci_p2pdma_map_sg()).

> cxgb4 and qed mention "peer2peer", but I don't know whether they are
> related; they don't seem to use any pci_p2p.* interfaces.

I'm really not sure what these drivers are doing at all. However, I
think this is unrelated based on this old patch description[1]:

  Open MPI, Intel MPI and other applications don't support the iWARP
  requirement that the client side send the first RDMA message. This
  class of application connection setup is called peer-2-peer. Typically
  once the connection is setup, _both_ sides want to send data.

  This patch enables supporting peer-2-peer over the chelsio rnic by
  enforcing this iWARP requirement in the driver itself as part of RDMA
  connection setup.

Logan

[1] http://lkml.iu.edu/hypermail/linux/kernel/0804.3/1416.html
Bjorn Helgaas Oct. 28, 2021, 1:39 a.m. UTC | #4
On Wed, Oct 27, 2021 at 05:41:07PM -0600, Logan Gunthorpe wrote:
> On 2021-10-27 5:11 p.m., Bjorn Helgaas wrote:
> >> @@ -532,6 +577,9 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
> >>  		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
> >>  	}
> >>  done:
> >> +	if (pci_10bit_tags_unsupported(client, provider, verbose))
> >> +		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
> > 
> > I need to be convinced that this check is in the right spot to catch
> > all potential P2PDMA situations.  The pci_p2pmem_find() and
> > pci_p2pdma_distance() interfaces eventually call
> > calc_map_type_and_dist().  But those interfaces don't actually produce
> > DMA bus addresses, and I'm not convinced that all P2PDMA users use
> > them.
> > 
> > nvme *does* use them, but infiniband (rdma_rw_map_sg()) does not, and
> > it calls pci_p2pdma_map_sg().
> 
> The rules of the current code is that calc_map_type_and_dist() must be
> called before pci_p2pdma_map_sg(). The calc function caches the mapping
> type in an xarray. If it was not called ahead of time,
> pci_p2pdma_map_type() will return PCI_P2PDMA_MAP_NOT_SUPPORTED, and the
> WARN_ON_ONCE will be hit in
> pci_p2pdma_map_sg_attrs().

Seems like it requires fairly deep analysis to prove all this.  Is
this something we don't want to put directly in the map path because
it's a hot path, or it just doesn't fit there in the model, or ...?

> Both NVMe and RDMA (only used in the nvme fabrics code) do the correct
> thing here and we can be sure calc_map_type_and_dist() is called before
> any pages are mapped.
> 
> The patch set I'm currently working on will ensure that
> calc_map_type_and_dist() is called before anyone maps a PCI P2PDMA page
> with dma_map_sg*().
> 
> > amdgpu_dma_buf_attach() calls pci_p2pdma_distance_many() but I don't
> > know where it sets up P2PDMA transactions.
> 
> The amdgpu driver hacked this in before proper support was done, but at
> least it's using pci_p2pdma_distance_many() presumably before trying any
> transfer. Though it's likely broken as it doesn't take into account the
> mapping type and thus I think it always assumes traffic goes through the
> host bridge (seeing it doesn't use pci_p2pdma_map_sg()).

What does it mean to go through the host bridge?  Obviously DMA to
system memory would go through the host bridge, but this seems
different.  Is this a "between PCI hierarchies" case like to a device
below a different root port?  I don't know what the tag rules are for
that.

> > cxgb4 and qed mention "peer2peer", but I don't know whether they are
> > related; they don't seem to use any pci_p2p.* interfaces.
> 
> I'm really not sure what these drivers are doing at all. However, I
> think this is unrelated based on this old patch description[1]:
> 
>   Open MPI, Intel MPI and other applications don't support the iWARP
>   requirement that the client side send the first RDMA message. This
>   class of application connection setup is called peer-2-peer. Typically
>   once the connection is setup, _both_ sides want to send data.
> 
>   This patch enables supporting peer-2-peer over the chelsio rnic by
>   enforcing this iWARP requirement in the driver itself as part of RDMA
>   connection setup.

Thanks!

> Logan
> 
> [1] http://lkml.iu.edu/hypermail/linux/kernel/0804.3/1416.html
Dongdong Liu Oct. 28, 2021, 7:56 a.m. UTC | #5
On 2021/10/28 5:20, Bjorn Helgaas wrote:
> On Sat, Oct 09, 2021 at 06:49:36PM +0800, Dongdong Liu wrote:
>> Add a 10-Bit Tag check in the P2PDMA code to ensure that a device with
>> 10-Bit Tag Requester doesn't interact with a device that does not
>> support 10-Bit Tag Completer.
>
> Shouldn't this also take into account Extended Tags (8 bits)?  I think
> the only tag size guaranteed to be supported is 5 bits.
As all PCIe completers are required to support 8-bit tags, seems no need
to take into account Extended Tags.
>
>> Before that happens, the kernel should emit a warning.
>
> The warning is nice, but the critical thing is that the P2PDMA mapping
> should fail so we don't attempt DMA in this situation.  I guess that's
> sort of what you're saying with "ensure that a device ... doesn't
> interact with a device ..."
Yes, that is.
>
>> "echo 0 > /sys/bus/pci/devices/.../10bit_tag" to disable 10-Bit Tag
>> Requester for PF device.
>>
>> "echo 0 > /sys/bus/pci/devices/.../sriov_vf_10bit_tag_ctl" to disable
>> 10-Bit Tag Requester for VF device.
>>
>> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
>> Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
>> ---
>>  drivers/pci/p2pdma.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 48 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 50cdde3e9a8b..804e390f4c22 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -19,6 +19,7 @@
>>  #include <linux/random.h>
>>  #include <linux/seq_buf.h>
>>  #include <linux/xarray.h>
>> +#include "pci.h"
>>
>>  enum pci_p2pdma_map_type {
>>  	PCI_P2PDMA_MAP_UNKNOWN = 0,
>> @@ -410,6 +411,50 @@ static unsigned long map_types_idx(struct pci_dev *client)
>>  		(client->bus->number << 8) | client->devfn;
>>  }
>>
>> +static bool pci_10bit_tags_unsupported(struct pci_dev *a,
>> +				       struct pci_dev *b,
>> +				       bool verbose)
>> +{
>> +	bool req;
>> +	bool comp;
>> +	u16 ctl;
>> +	const char *str = "10bit_tag";
>> +
>> +	if (a->is_virtfn) {
>> +#ifdef CONFIG_PCI_IOV
>> +		req = !!(a->physfn->sriov->ctrl &
>> +			 PCI_SRIOV_CTRL_VF_10BIT_TAG_REQ_EN);
>> +#endif
>> +	} else {
>> +		pcie_capability_read_word(a, PCI_EXP_DEVCTL2, &ctl);
>> +		req = !!(ctl & PCI_EXP_DEVCTL2_10BIT_TAG_REQ_EN);
>> +	}
>> +
>> +	comp = !!(b->devcap2 & PCI_EXP_DEVCAP2_10BIT_TAG_COMP);
>> +	/* 10-bit tags not enabled on requester */
>> +	if (!req)
>> +		return false;
>> +
>> +	 /* Completer can handle anything */
>> +	if (comp)
>> +		return false;
>> +
>> +	if (!verbose)
>> +		return true;
>> +
>> +	pci_warn(a, "cannot be used for peer-to-peer DMA as 10-Bit Tag Requester enable is set for this device, but peer device (%s) does not support the 10-Bit Tag Completer\n",
>> +		 pci_name(b));
>> +
>> +	if (a->is_virtfn)
>> +		str = "sriov_vf_10bit_tag_ctl";
>> +
>> +	pci_warn(a, "to disable 10-Bit Tag Requester for this device, echo 0 > /sys/bus/pci/devices/%s/%s\n",
>> +		 pci_name(a), str);
>> +
>> +	return true;
>> +}
>> +
>>  /*
>>   * Calculate the P2PDMA mapping type and distance between two PCI devices.
>>   *
>> @@ -532,6 +577,9 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
>>  		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
>>  	}
>>  done:
>> +	if (pci_10bit_tags_unsupported(client, provider, verbose))
>> +		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
>> +
>>  	rcu_read_lock();
>>  	p2pdma = rcu_dereference(provider->p2pdma);
>>  	if (p2pdma)
>> --
>> 2.22.0
>>
> .
>
Logan Gunthorpe Oct. 28, 2021, 3:56 p.m. UTC | #6
On 2021-10-27 7:39 p.m., Bjorn Helgaas wrote:
> On Wed, Oct 27, 2021 at 05:41:07PM -0600, Logan Gunthorpe wrote:
>> On 2021-10-27 5:11 p.m., Bjorn Helgaas wrote:
>>>> @@ -532,6 +577,9 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
>>>>  		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
>>>>  	}
>>>>  done:
>>>> +	if (pci_10bit_tags_unsupported(client, provider, verbose))
>>>> +		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
>>>
>>> I need to be convinced that this check is in the right spot to catch
>>> all potential P2PDMA situations.  The pci_p2pmem_find() and
>>> pci_p2pdma_distance() interfaces eventually call
>>> calc_map_type_and_dist().  But those interfaces don't actually produce
>>> DMA bus addresses, and I'm not convinced that all P2PDMA users use
>>> them.
>>>
>>> nvme *does* use them, but infiniband (rdma_rw_map_sg()) does not, and
>>> it calls pci_p2pdma_map_sg().
>>
>> The rules of the current code is that calc_map_type_and_dist() must be
>> called before pci_p2pdma_map_sg(). The calc function caches the mapping
>> type in an xarray. If it was not called ahead of time,
>> pci_p2pdma_map_type() will return PCI_P2PDMA_MAP_NOT_SUPPORTED, and the
>> WARN_ON_ONCE will be hit in
>> pci_p2pdma_map_sg_attrs().
> 
> Seems like it requires fairly deep analysis to prove all this.  Is
> this something we don't want to put directly in the map path because
> it's a hot path, or it just doesn't fit there in the model, or ...?

Yes, that's pretty much what my next patch set does. It just took a
while to get there (adding the xarray, etc).

>> Both NVMe and RDMA (only used in the nvme fabrics code) do the correct
>> thing here and we can be sure calc_map_type_and_dist() is called before
>> any pages are mapped.
>>
>> The patch set I'm currently working on will ensure that
>> calc_map_type_and_dist() is called before anyone maps a PCI P2PDMA page
>> with dma_map_sg*().
>>
>>> amdgpu_dma_buf_attach() calls pci_p2pdma_distance_many() but I don't
>>> know where it sets up P2PDMA transactions.
>>
>> The amdgpu driver hacked this in before proper support was done, but at
>> least it's using pci_p2pdma_distance_many() presumably before trying any
>> transfer. Though it's likely broken as it doesn't take into account the
>> mapping type and thus I think it always assumes traffic goes through the
>> host bridge (seeing it doesn't use pci_p2pdma_map_sg()).
> 
> What does it mean to go through the host bridge?  Obviously DMA to
> system memory would go through the host bridge, but this seems
> different.  Is this a "between PCI hierarchies" case like to a device
> below a different root port?  I don't know what the tag rules are for
> that.

It means both devices are connected to the host bridge without a switch.
So TLPs are routed through the route complex and thus would be affected
by the IOMMU. I also don't know how the tag rules apply here. But the
code in this patch will ensure that no two devices with different tag
sizes will ever use p2pdma in any case.

Logan
diff mbox series

Patch

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 50cdde3e9a8b..804e390f4c22 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -19,6 +19,7 @@ 
 #include <linux/random.h>
 #include <linux/seq_buf.h>
 #include <linux/xarray.h>
+#include "pci.h"
 
 enum pci_p2pdma_map_type {
 	PCI_P2PDMA_MAP_UNKNOWN = 0,
@@ -410,6 +411,50 @@  static unsigned long map_types_idx(struct pci_dev *client)
 		(client->bus->number << 8) | client->devfn;
 }
 
+static bool pci_10bit_tags_unsupported(struct pci_dev *a,
+				       struct pci_dev *b,
+				       bool verbose)
+{
+	bool req;
+	bool comp;
+	u16 ctl;
+	const char *str = "10bit_tag";
+
+	if (a->is_virtfn) {
+#ifdef CONFIG_PCI_IOV
+		req = !!(a->physfn->sriov->ctrl &
+			 PCI_SRIOV_CTRL_VF_10BIT_TAG_REQ_EN);
+#endif
+	} else {
+		pcie_capability_read_word(a, PCI_EXP_DEVCTL2, &ctl);
+		req = !!(ctl & PCI_EXP_DEVCTL2_10BIT_TAG_REQ_EN);
+	}
+
+	comp = !!(b->devcap2 & PCI_EXP_DEVCAP2_10BIT_TAG_COMP);
+
+	/* 10-bit tags not enabled on requester */
+	if (!req)
+		return false;
+
+	 /* Completer can handle anything */
+	if (comp)
+		return false;
+
+	if (!verbose)
+		return true;
+
+	pci_warn(a, "cannot be used for peer-to-peer DMA as 10-Bit Tag Requester enable is set for this device, but peer device (%s) does not support the 10-Bit Tag Completer\n",
+		 pci_name(b));
+
+	if (a->is_virtfn)
+		str = "sriov_vf_10bit_tag_ctl";
+
+	pci_warn(a, "to disable 10-Bit Tag Requester for this device, echo 0 > /sys/bus/pci/devices/%s/%s\n",
+		 pci_name(a), str);
+
+	return true;
+}
+
 /*
  * Calculate the P2PDMA mapping type and distance between two PCI devices.
  *
@@ -532,6 +577,9 @@  calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
 		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
 	}
 done:
+	if (pci_10bit_tags_unsupported(client, provider, verbose))
+		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
+
 	rcu_read_lock();
 	p2pdma = rcu_dereference(provider->p2pdma);
 	if (p2pdma)