diff mbox

[v2,04/10] PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches

Message ID 20180228234006.21093-5-logang@deltatee.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Logan Gunthorpe Feb. 28, 2018, 11:40 p.m. UTC
For peer-to-peer transactions to work the downstream ports in each
switch must not have the ACS flags set. At this time there is no way
to dynamically change the flags and update the corresponding IOMMU
groups so this is done at enumeration time before the the groups are
assigned.

This effectively means that if CONFIG_PCI_P2PDMA is selected then
all devices behind any switch will be in the same IOMMU group.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/pci/Kconfig        |  4 ++++
 drivers/pci/p2pdma.c       | 44 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.c          |  4 ++++
 include/linux/pci-p2pdma.h |  5 +++++
 4 files changed, 57 insertions(+)

Comments

Bjorn Helgaas March 1, 2018, 6:02 p.m. UTC | #1
On Wed, Feb 28, 2018 at 04:40:00PM -0700, Logan Gunthorpe wrote:
> For peer-to-peer transactions to work the downstream ports in each
> switch must not have the ACS flags set. At this time there is no way
> to dynamically change the flags and update the corresponding IOMMU
> groups so this is done at enumeration time before the the groups are
> assigned.

s/the the/the/

> This effectively means that if CONFIG_PCI_P2PDMA is selected then
> all devices behind any switch will be in the same IOMMU group.
> 
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> ---
>  drivers/pci/Kconfig        |  4 ++++
>  drivers/pci/p2pdma.c       | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  drivers/pci/pci.c          |  4 ++++
>  include/linux/pci-p2pdma.h |  5 +++++
>  4 files changed, 57 insertions(+)
> 
> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
> index 840831418cbd..a430672f0ad4 100644
> --- a/drivers/pci/Kconfig
> +++ b/drivers/pci/Kconfig
> @@ -138,6 +138,10 @@ config PCI_P2PDMA
>  	  it's hard to tell which support it with good performance, so
>  	  at this time you will need a PCIe switch.
>  
> +	  Enabling this option will also disable ACS on all ports behind
> +	  any PCIe switch. This effictively puts all devices behind any
> +	  switch into the same IOMMU group.

s/effictively/effectively/

Does this really mean "all devices behind the same Root Port"?

What does this mean in terms of device security?  I assume it means,
at least, that individual devices can't be assigned to separate VMs.

I don't mind admitting that this patch makes me pretty nervous, and I
don't have a clear idea of what the implications of this are, or how
to communicate those to end users.  "The same IOMMU group" is a pretty
abstract idea.

>  	  If unsure, say N.
>  
>  config PCI_LABEL
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 4e1c81f64b29..61af07acd21a 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -255,6 +255,50 @@ static struct pci_dev *get_upstream_bridge_port(struct pci_dev *pdev)
>  	return up2;
>  }
>  
> +/*
> + * pci_p2pdma_disable_acs - disable ACS flags for ports in PCI
> + *	bridges/switches
> + * @pdev: device to disable ACS flags for
> + *
> + * The ACS flags for P2P Request Redirect and P2P Completion Redirect need
> + * to be disabled on any downstream port in any switch in order for
> + * the TLPs to not be forwarded up to the RC which is not what we want
> + * for P2P.
> + *
> + * This function is called when the devices are first enumerated and
> + * will result in all devices behind any switch to be in the same IOMMU
> + * group. At this time there is no way to "hotplug" IOMMU groups so we rely
> + * on this largish hammer. If you need the devices to be in separate groups
> + * don't enable CONFIG_PCI_P2PDMA.
> + *
> + * Returns 1 if the ACS bits for this device were cleared, otherwise 0.
> + */
> +int pci_p2pdma_disable_acs(struct pci_dev *pdev)
> +{
> +	struct pci_dev *up;
> +	int pos;
> +	u16 ctrl;
> +
> +	up = get_upstream_bridge_port(pdev);
> +	if (!up)
> +		return 0;
> +	pci_dev_put(up);
> +
> +	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS);
> +	if (!pos)
> +		return 0;
> +
> +	dev_info(&pdev->dev, "disabling ACS flags for peer-to-peer DMA\n");
> +
> +	pci_read_config_word(pdev, pos + PCI_ACS_CTRL, &ctrl);
> +
> +	ctrl &= ~(PCI_ACS_RR | PCI_ACS_CR);
> +
> +	pci_write_config_word(pdev, pos + PCI_ACS_CTRL, ctrl);
> +
> +	return 1;
> +}
> +
>  static bool __upstream_bridges_match(struct pci_dev *upstream,
>  				     struct pci_dev *client)
>  {
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index f6a4dd10d9b0..95ad3cf288c8 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -16,6 +16,7 @@
>  #include <linux/of.h>
>  #include <linux/of_pci.h>
>  #include <linux/pci.h>
> +#include <linux/pci-p2pdma.h>
>  #include <linux/pm.h>
>  #include <linux/slab.h>
>  #include <linux/module.h>
> @@ -2826,6 +2827,9 @@ static void pci_std_enable_acs(struct pci_dev *dev)
>   */
>  void pci_enable_acs(struct pci_dev *dev)
>  {
> +	if (pci_p2pdma_disable_acs(dev))
> +		return;

This doesn't read naturally to me.  I do see that when
CONFIG_PCI_P2PDMA is not set, pci_p2pdma_disable_acs() does nothing
and returns 0, so we'll go ahead and try to enable ACS as before.

But I think it would be clearer to have an #ifdef CONFIG_PCI_P2PDMA
right here so it's more obvious that we only disable ACS when it's
selected.

>  	if (!pci_acs_enable)
>  		return;
>  
> diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
> index 126eca697ab3..f537f521f60c 100644
> --- a/include/linux/pci-p2pdma.h
> +++ b/include/linux/pci-p2pdma.h
> @@ -22,6 +22,7 @@ struct block_device;
>  struct scatterlist;
>  
>  #ifdef CONFIG_PCI_P2PDMA
> +int pci_p2pdma_disable_acs(struct pci_dev *pdev);
>  int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>  		u64 offset);
>  int pci_p2pdma_add_client(struct list_head *head, struct device *dev);
> @@ -41,6 +42,10 @@ int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
>  void pci_p2pdma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
>  			 enum dma_data_direction dir);
>  #else /* CONFIG_PCI_P2PDMA */
> +static inline int pci_p2pdma_disable_acs(struct pci_dev *pdev)
> +{
> +	return 0;
> +}
>  static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
>  		size_t size, u64 offset)
>  {
> -- 
> 2.11.0
>
Stephen Bates March 1, 2018, 6:54 p.m. UTC | #2
Thanks for the detailed review Bjorn!

>>  
>> +	  Enabling this option will also disable ACS on all ports behind
>> +	  any PCIe switch. This effictively puts all devices behind any
>> +	  switch into the same IOMMU group.

>
>  Does this really mean "all devices behind the same Root Port"?

Not necessarily. You might have a cascade of switches (i.e switches below a switch) to achieve a very large fan-out (in an NVMe SSD array for example) and we will only disable ACS on the ports below the relevant switch.

> What does this mean in terms of device security?  I assume it means,
> at least, that individual devices can't be assigned to separate VMs.

This was discussed during v1 [1]. Disabling ACS on all downstream ports of the switch means that all the EPs below it have to part of the same IOMMU grouping. However it was also agreed that as long as the ACS disable occurred at boot time (which is does in v2) then the virtualization layer will be aware of it and will perform the IOMMU group formation correctly.
    
> I don't mind admitting that this patch makes me pretty nervous, and I
> don't have a clear idea of what the implications of this are, or how
> to communicate those to end users.  "The same IOMMU group" is a pretty
> abstract idea.
    
Alex gave a good overview of the implications in [1].

Stephen 

[1] https://marc.info/?l=linux-pci&m=151512320031739&w=2
Logan Gunthorpe March 1, 2018, 7:13 p.m. UTC | #3
On 01/03/18 11:02 AM, Bjorn Helgaas wrote:
>>   void pci_enable_acs(struct pci_dev *dev)
>>   {
>> +	if (pci_p2pdma_disable_acs(dev))
>> +		return;
> 
> This doesn't read naturally to me.  I do see that when
> CONFIG_PCI_P2PDMA is not set, pci_p2pdma_disable_acs() does nothing
> and returns 0, so we'll go ahead and try to enable ACS as before.
> 
> But I think it would be clearer to have an #ifdef CONFIG_PCI_P2PDMA
> right here so it's more obvious that we only disable ACS when it's
> selected.

I could do this... however, I wrote it this way because I've read Linus 
dislikes using #ifdef's inside function bodies and I personally agree 
with that sentiment.

Logan
Alex Williamson March 1, 2018, 9:21 p.m. UTC | #4
On Thu, 1 Mar 2018 18:54:01 +0000
"Stephen  Bates" <sbates@raithlin.com> wrote:

> Thanks for the detailed review Bjorn!
> 
> >>  
> >> +	  Enabling this option will also disable ACS on all ports behind
> >> +	  any PCIe switch. This effictively puts all devices behind any
> >> +	  switch into the same IOMMU group.  
> 
> >
> >  Does this really mean "all devices behind the same Root Port"?  
> 
> Not necessarily. You might have a cascade of switches (i.e switches below a switch) to achieve a very large fan-out (in an NVMe SSD array for example) and we will only disable ACS on the ports below the relevant switch.
> 
> > What does this mean in terms of device security?  I assume it means,
> > at least, that individual devices can't be assigned to separate VMs.  
> 
> This was discussed during v1 [1]. Disabling ACS on all downstream ports of the switch means that all the EPs below it have to part of the same IOMMU grouping. However it was also agreed that as long as the ACS disable occurred at boot time (which is does in v2) then the virtualization layer will be aware of it and will perform the IOMMU group formation correctly.

This is still a pretty terrible solution though, your kernel provider
needs to decide whether they favor device assignment or p2p, because we
can't do both, unless there's a patch I haven't seen yet that allows
boot time rather than compile time configuration.  There are absolutely
supported device assignment cases of switches proving isolation between
devices allowing the downstream EPs to be used independently.  I think
this is a non-starter for distribution support without boot time or
dynamic configuration.  I could imagine dynamic configuration through
sysfs that might trigger a soft remove and rescan of the affected
devices in order to rebuild the IOMMU group.  The hard part might be
determining which points to allow that to guarantee correctness.  For
instance, upstream switch ports don't actually support ACS, but they'd
otherwise be an obvious concentration point to trigger a
reconfiguration.  Thanks,

Alex
Logan Gunthorpe March 1, 2018, 9:26 p.m. UTC | #5
On 01/03/18 02:21 PM, Alex Williamson wrote:
> This is still a pretty terrible solution though, your kernel provider
> needs to decide whether they favor device assignment or p2p, because we
> can't do both, unless there's a patch I haven't seen yet that allows
> boot time rather than compile time configuration.  There are absolutely
> supported device assignment cases of switches proving isolation between
> devices allowing the downstream EPs to be used independently.  I think
> this is a non-starter for distribution support without boot time or
> dynamic configuration.  I could imagine dynamic configuration through
> sysfs that might trigger a soft remove and rescan of the affected
> devices in order to rebuild the IOMMU group.  The hard part might be
> determining which points to allow that to guarantee correctness.  For
> instance, upstream switch ports don't actually support ACS, but they'd
> otherwise be an obvious concentration point to trigger a
> reconfiguration.  Thanks,

At this point, I don't expect this to be enabled in any distribution. 
The use case is for custom hardware intended to do P2P which means they 
can have a custom kernel with P2P enabled. The default for 
CONFIG_PCI_P2P is 'no' and I expect it to stay that way for a long time.

Another boot option which has to be set for this to work isn't something 
we'd like to see.

Logan
Stephen Bates March 1, 2018, 9:32 p.m. UTC | #6
> your kernel provider needs to decide whether they favor device assignment or p2p

Thanks Alex! The hardware requirements for P2P (switch, high performance EPs) are such that we really only expect CONFIG_P2P_DMA to be enabled in specific instances and in those instances the users have made a decision to favor P2P over IOMMU isolation. Or they have setup their PCIe topology in a way that gives them IOMMU isolation where they want it and P2P where they want it.

Stephen
Jerome Glisse March 1, 2018, 9:35 p.m. UTC | #7
On Thu, Mar 01, 2018 at 09:32:20PM +0000, Stephen  Bates wrote:
> > your kernel provider needs to decide whether they favor device assignment or p2p
> 
> Thanks Alex! The hardware requirements for P2P (switch, high performance EPs) are such that we really only expect CONFIG_P2P_DMA to be enabled in specific instances and in those instances the users have made a decision to favor P2P over IOMMU isolation. Or they have setup their PCIe topology in a way that gives them IOMMU isolation where they want it and P2P where they want it.
> 
> 

Note that they are usecase for P2P where IOMMU isolation matter and
the traffic through root complex isn't see as an issue. For instance
for GPU the idea is that you want to allow the RDMA device to directly
read or write from GPU memory to avoid having to migrate memory to
system memory. This isn't so much for performance than for ease of
use.

Cheers,
Jérôme
Logan Gunthorpe March 1, 2018, 9:37 p.m. UTC | #8
On 01/03/18 02:35 PM, Jerome Glisse wrote:
> Note that they are usecase for P2P where IOMMU isolation matter and
> the traffic through root complex isn't see as an issue. 

Well, we can worry about that once we have a solution to the problem of 
knowing whether a root complex supports P2P at all. I'm not sure how 
people are going to solve that one.

For the time being, this work is for cards behind a switch only.

Logan
Bjorn Helgaas March 1, 2018, 11:15 p.m. UTC | #9
On Thu, Mar 01, 2018 at 06:54:01PM +0000, Stephen  Bates wrote:
> Thanks for the detailed review Bjorn!
> 
> >> +	  Enabling this option will also disable ACS on all ports behind
> >> +	  any PCIe switch. This effictively puts all devices behind any
> >> +	  switch into the same IOMMU group.
> 
> >  Does this really mean "all devices behind the same Root Port"?
> 
> Not necessarily. You might have a cascade of switches (i.e switches
> below a switch) to achieve a very large fan-out (in an NVMe SSD
> array for example) and we will only disable ACS on the ports below
> the relevant switch.

The question is what the relevant switch is.  We call pci_enable_acs()
on every PCI device, including Root Ports.  It looks like this relies
on get_upstream_bridge_port() to filter out some things.  I don't
think get_upstream_bridge_port() is doing the right thing, so we need
to sort that out first, I guess.

> > What does this mean in terms of device security?  I assume it means,
> > at least, that individual devices can't be assigned to separate VMs.
> 
> This was discussed during v1 [1]. Disabling ACS on all downstream
> ports of the switch means that all the EPs below it have to part of
> the same IOMMU grouping. However it was also agreed that as long as
> the ACS disable occurred at boot time (which is does in v2) then the
> virtualization layer will be aware of it and will perform the IOMMU
> group formation correctly.
>     
> > I don't mind admitting that this patch makes me pretty nervous, and I
> > don't have a clear idea of what the implications of this are, or how
> > to communicate those to end users.  "The same IOMMU group" is a pretty
> > abstract idea.
>     
> Alex gave a good overview of the implications in [1].
> 
> [1] https://marc.info/?l=linux-pci&m=151512320031739&w=2

This might be a good start, but whatever the implications are, they
need to be distilled and made clear directly in the changelog and the
Kconfig help text.

Bjorn
Logan Gunthorpe March 1, 2018, 11:59 p.m. UTC | #10
On 01/03/18 04:15 PM, Bjorn Helgaas wrote:
> The question is what the relevant switch is.  We call pci_enable_acs()
> on every PCI device, including Root Ports.  It looks like this relies
> on get_upstream_bridge_port() to filter out some things.  I don't
> think get_upstream_bridge_port() is doing the right thing, so we need
> to sort that out first, I guess.

Yes, well if a port has an upstream bridge port and that port has an 
upstream bridge port we are almost certainly looking at a port in a 
switch. If you have a better suggestion to only disable ACS bits on 
downstream switch ports I'd be happy to consider it.


> This might be a good start, but whatever the implications are, they
> need to be distilled and made clear directly in the changelog and the
> Kconfig help text.

Well, I tried. I'll expand on it for v3.

Logan
Bjorn Helgaas March 5, 2018, 10:28 p.m. UTC | #11
On Thu, Mar 01, 2018 at 12:13:10PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 01/03/18 11:02 AM, Bjorn Helgaas wrote:
> > >   void pci_enable_acs(struct pci_dev *dev)
> > >   {
> > > +	if (pci_p2pdma_disable_acs(dev))
> > > +		return;
> > 
> > This doesn't read naturally to me.  I do see that when
> > CONFIG_PCI_P2PDMA is not set, pci_p2pdma_disable_acs() does nothing
> > and returns 0, so we'll go ahead and try to enable ACS as before.
> > 
> > But I think it would be clearer to have an #ifdef CONFIG_PCI_P2PDMA
> > right here so it's more obvious that we only disable ACS when it's
> > selected.
> 
> I could do this... however, I wrote it this way because I've read Linus
> dislikes using #ifdef's inside function bodies and I personally agree with
> that sentiment.

I try to avoid #ifdefs too, but in this case the plain reading of the
code makes no sense (we're in a function called "enable_acs()", and
the first thing we do is call a function to "disable_acs()".

Disabling ACS is scary for all the security reasons mentioned
elsewhere, so a reader who knows what ACS does and cares about
virtualization and security *has* to go look up the definition of
pci_p2pdma_disable_acs().

If you put the #ifdef right here, then it's easier to read because we
can see that "oh, this is a special and uncommon case that I can
probably ignore".

Bjorn
Logan Gunthorpe March 5, 2018, 11:01 p.m. UTC | #12
On 05/03/18 03:28 PM, Bjorn Helgaas wrote:
> If you put the #ifdef right here, then it's easier to read because we
> can see that "oh, this is a special and uncommon case that I can
> probably ignore".

Makes sense. I'll do that.

Thanks,

Logan
diff mbox

Patch

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 840831418cbd..a430672f0ad4 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -138,6 +138,10 @@  config PCI_P2PDMA
 	  it's hard to tell which support it with good performance, so
 	  at this time you will need a PCIe switch.
 
+	  Enabling this option will also disable ACS on all ports behind
+	  any PCIe switch. This effictively puts all devices behind any
+	  switch into the same IOMMU group.
+
 	  If unsure, say N.
 
 config PCI_LABEL
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4e1c81f64b29..61af07acd21a 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -255,6 +255,50 @@  static struct pci_dev *get_upstream_bridge_port(struct pci_dev *pdev)
 	return up2;
 }
 
+/*
+ * pci_p2pdma_disable_acs - disable ACS flags for ports in PCI
+ *	bridges/switches
+ * @pdev: device to disable ACS flags for
+ *
+ * The ACS flags for P2P Request Redirect and P2P Completion Redirect need
+ * to be disabled on any downstream port in any switch in order for
+ * the TLPs to not be forwarded up to the RC which is not what we want
+ * for P2P.
+ *
+ * This function is called when the devices are first enumerated and
+ * will result in all devices behind any switch to be in the same IOMMU
+ * group. At this time there is no way to "hotplug" IOMMU groups so we rely
+ * on this largish hammer. If you need the devices to be in separate groups
+ * don't enable CONFIG_PCI_P2PDMA.
+ *
+ * Returns 1 if the ACS bits for this device were cleared, otherwise 0.
+ */
+int pci_p2pdma_disable_acs(struct pci_dev *pdev)
+{
+	struct pci_dev *up;
+	int pos;
+	u16 ctrl;
+
+	up = get_upstream_bridge_port(pdev);
+	if (!up)
+		return 0;
+	pci_dev_put(up);
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS);
+	if (!pos)
+		return 0;
+
+	dev_info(&pdev->dev, "disabling ACS flags for peer-to-peer DMA\n");
+
+	pci_read_config_word(pdev, pos + PCI_ACS_CTRL, &ctrl);
+
+	ctrl &= ~(PCI_ACS_RR | PCI_ACS_CR);
+
+	pci_write_config_word(pdev, pos + PCI_ACS_CTRL, ctrl);
+
+	return 1;
+}
+
 static bool __upstream_bridges_match(struct pci_dev *upstream,
 				     struct pci_dev *client)
 {
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index f6a4dd10d9b0..95ad3cf288c8 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -16,6 +16,7 @@ 
 #include <linux/of.h>
 #include <linux/of_pci.h>
 #include <linux/pci.h>
+#include <linux/pci-p2pdma.h>
 #include <linux/pm.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -2826,6 +2827,9 @@  static void pci_std_enable_acs(struct pci_dev *dev)
  */
 void pci_enable_acs(struct pci_dev *dev)
 {
+	if (pci_p2pdma_disable_acs(dev))
+		return;
+
 	if (!pci_acs_enable)
 		return;
 
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 126eca697ab3..f537f521f60c 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -22,6 +22,7 @@  struct block_device;
 struct scatterlist;
 
 #ifdef CONFIG_PCI_P2PDMA
+int pci_p2pdma_disable_acs(struct pci_dev *pdev);
 int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		u64 offset);
 int pci_p2pdma_add_client(struct list_head *head, struct device *dev);
@@ -41,6 +42,10 @@  int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 void pci_p2pdma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 			 enum dma_data_direction dir);
 #else /* CONFIG_PCI_P2PDMA */
+static inline int pci_p2pdma_disable_acs(struct pci_dev *pdev)
+{
+	return 0;
+}
 static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
 		size_t size, u64 offset)
 {