diff mbox

[v2,2/2] PCI: Don't disable PF's memory decoding when enabling SRIOV

Message ID 1475192870-7763-2-git-send-email-gwshan@linux.vnet.ibm.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Gavin Shan Sept. 29, 2016, 11:47 p.m. UTC
pci_update_resource() might be called to update (shift) IOV BARs
in PPC PowerNV specific pcibios_sriov_enable() when enabling PF's
SRIOV capability. At that point, the PF have been functional if
the SRIOV is enabled through sysfs entry "sriov_numvfs". The PF's
memory decoding (0x2 in PCI_COMMAND) shouldn't be disabled when
updating its IOV BARs with pci_update_resource(). Otherwise, we
receives EEH error caused by MMIO access to PF's memory BARs during
the window when PF's memory decoding is disabled.

   sriov_numvfs_store
   pdev->driver->sriov_configure
   mlx5_core_sriov_configure
   pci_enable_sriov
   sriov_enable
   pcibios_sriov_enable
   pnv_pci_sriov_enable
   pnv_pci_vf_resource_shift
   pci_update_resource

This doesn't change the PF's memory decoding in the path by introducing
additional parameter (@mmio_force_on) to pci_update_resource() in
the above path.

Reported-by: Carol Soto <clsoto@us.ibm.com>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Tested-by: Carol Soto <clsoto@us.ibm.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 drivers/pci/iov.c                         | 2 +-
 drivers/pci/pci.c                         | 2 +-
 drivers/pci/setup-res.c                   | 9 +++++----
 include/linux/pci.h                       | 2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)

Comments

Bjorn Helgaas Oct. 21, 2016, 4:50 p.m. UTC | #1
Hi Gavin,

On Fri, Sep 30, 2016 at 09:47:50AM +1000, Gavin Shan wrote:
> pci_update_resource() might be called to update (shift) IOV BARs
> in PPC PowerNV specific pcibios_sriov_enable() when enabling PF's
> SRIOV capability. At that point, the PF have been functional if
> the SRIOV is enabled through sysfs entry "sriov_numvfs". The PF's
> memory decoding (0x2 in PCI_COMMAND) shouldn't be disabled when
> updating its IOV BARs with pci_update_resource(). Otherwise, we
> receives EEH error caused by MMIO access to PF's memory BARs during
> the window when PF's memory decoding is disabled.

The fact that you get EEH errors is irrelevant.  We can't write code
simply to avoid errors -- we have to write code to make the system
work correctly.

I do not want to add a "mmio_force_on" parameter to
pci_update_resource().  That puts the burden on the caller to
understand this subtle issue.  If the caller passes mmio_force_on=1
when it shouldn't, things will almost always work, but once in a blue
moon a half-updated BAR will conflict with some other device in the
system, and we'll have an unreproducible, undebuggable crash.

What you do need is an explanation of why it's safe to non-atomically
update a VF BARx in the SR-IOV capability.  I think this probably
involves the VF MSE bit, and you probably have to either disable VFs
completely or clear the MSE bit while you're updating the VF BARx.  We
should be able to do this inside pci_update_resource() without
changing the interface.

>    sriov_numvfs_store
>    pdev->driver->sriov_configure
>    mlx5_core_sriov_configure
>    pci_enable_sriov
>    sriov_enable
>    pcibios_sriov_enable
>    pnv_pci_sriov_enable
>    pnv_pci_vf_resource_shift
>    pci_update_resource
> 
> This doesn't change the PF's memory decoding in the path by introducing
> additional parameter (@mmio_force_on) to pci_update_resource() in
> the above path.

> 
> Reported-by: Carol Soto <clsoto@us.ibm.com>
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> Tested-by: Carol Soto <clsoto@us.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
>  drivers/pci/iov.c                         | 2 +-
>  drivers/pci/pci.c                         | 2 +-
>  drivers/pci/setup-res.c                   | 9 +++++----
>  include/linux/pci.h                       | 2 +-
>  5 files changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 38a5c65..f4ccc5b 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1006,7 +1006,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>  		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
>  			 i, &res2, res, (offset > 0) ? "En" : "Dis",
>  			 num_vfs, offset);
> -		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
> +		pci_update_resource(dev, i + PCI_IOV_RESOURCES, true);
>  	}
>  	return 0;
>  }
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index f1343f0..db31966 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -511,7 +511,7 @@ static void sriov_restore_state(struct pci_dev *dev)
>  		return;
>  
>  	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
> -		pci_update_resource(dev, i);
> +		pci_update_resource(dev, i, false);
>  
>  	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
>  	pci_iov_set_numvfs(dev, iov->num_VFs);
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index aab9d51..87a33c0 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -545,7 +545,7 @@ static void pci_restore_bars(struct pci_dev *dev)
>  		return;
>  
>  	for (i = 0; i < PCI_BRIDGE_RESOURCES; i++)
> -		pci_update_resource(dev, i);
> +		pci_update_resource(dev, i, false);
>  }
>  
>  static const struct pci_platform_pm_ops *pci_platform_pm;
> diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
> index 66c4d8f..e8a50ff 100644
> --- a/drivers/pci/setup-res.c
> +++ b/drivers/pci/setup-res.c
> @@ -26,7 +26,7 @@
>  #include "pci.h"
>  
>  
> -void pci_update_resource(struct pci_dev *dev, int resno)
> +void pci_update_resource(struct pci_dev *dev, int resno, bool mmio_force_on)
>  {
>  	struct pci_bus_region region;
>  	bool disable;
> @@ -81,7 +81,8 @@ void pci_update_resource(struct pci_dev *dev, int resno)
>  	 * disable decoding so that a half-updated BAR won't conflict
>  	 * with another device.
>  	 */
> -	disable = (res->flags & IORESOURCE_MEM_64) && !dev->mmio_always_on;
> +	disable = (res->flags & IORESOURCE_MEM_64) &&
> +		  !mmio_force_on && !dev->mmio_always_on;
>  	if (disable) {
>  		pci_read_config_word(dev, PCI_COMMAND, &cmd);
>  		pci_write_config_word(dev, PCI_COMMAND,
> @@ -310,7 +311,7 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
>  	res->flags &= ~IORESOURCE_STARTALIGN;
>  	dev_info(&dev->dev, "BAR %d: assigned %pR\n", resno, res);
>  	if (resno < PCI_BRIDGE_RESOURCES)
> -		pci_update_resource(dev, resno);
> +		pci_update_resource(dev, resno, false);
>  
>  	return 0;
>  }
> @@ -350,7 +351,7 @@ int pci_reassign_resource(struct pci_dev *dev, int resno, resource_size_t addsiz
>  	dev_info(&dev->dev, "BAR %d: reassigned %pR (expanded by %#llx)\n",
>  		 resno, res, (unsigned long long) addsize);
>  	if (resno < PCI_BRIDGE_RESOURCES)
> -		pci_update_resource(dev, resno);
> +		pci_update_resource(dev, resno, false);
>  
>  	return 0;
>  }
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 0ab8359..99231d1 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1039,7 +1039,7 @@ int pci_try_reset_bus(struct pci_bus *bus);
>  void pci_reset_secondary_bus(struct pci_dev *dev);
>  void pcibios_reset_secondary_bus(struct pci_dev *dev);
>  void pci_reset_bridge_secondary_bus(struct pci_dev *dev);
> -void pci_update_resource(struct pci_dev *dev, int resno);
> +void pci_update_resource(struct pci_dev *dev, int resno, bool mmio_force_on);
>  int __must_check pci_assign_resource(struct pci_dev *dev, int i);
>  int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
>  int pci_select_bars(struct pci_dev *dev, unsigned long flags);
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan Oct. 23, 2016, 11:28 p.m. UTC | #2
On Fri, Oct 21, 2016 at 11:50:34AM -0500, Bjorn Helgaas wrote:
>Hi Gavin,
>
>On Fri, Sep 30, 2016 at 09:47:50AM +1000, Gavin Shan wrote:
>> pci_update_resource() might be called to update (shift) IOV BARs
>> in PPC PowerNV specific pcibios_sriov_enable() when enabling PF's
>> SRIOV capability. At that point, the PF have been functional if
>> the SRIOV is enabled through sysfs entry "sriov_numvfs". The PF's
>> memory decoding (0x2 in PCI_COMMAND) shouldn't be disabled when
>> updating its IOV BARs with pci_update_resource(). Otherwise, we
>> receives EEH error caused by MMIO access to PF's memory BARs during
>> the window when PF's memory decoding is disabled.
>
>The fact that you get EEH errors is irrelevant.  We can't write code
>simply to avoid errors -- we have to write code to make the system
>work correctly.
>
>I do not want to add a "mmio_force_on" parameter to
>pci_update_resource().  That puts the burden on the caller to
>understand this subtle issue.  If the caller passes mmio_force_on=1
>when it shouldn't, things will almost always work, but once in a blue
>moon a half-updated BAR will conflict with some other device in the
>system, and we'll have an unreproducible, undebuggable crash.
>

Bjorn, thanks for your comments. Yes, the EEH error was caused by MMIO
access to PF's normal BARs, not VF BARs. Yeah, I also had the conern
that adding parameter to pci_update_resource() increases the visible
complexity to the caller of the function.

>What you do need is an explanation of why it's safe to non-atomically
>update a VF BARx in the SR-IOV capability.  I think this probably
>involves the VF MSE bit, and you probably have to either disable VFs
>completely or clear the MSE bit while you're updating the VF BARx.  We
>should be able to do this inside pci_update_resource() without
>changing the interface.
>

Yes, It's what PATCH[1/2] does: (PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE)
are set after VF BARs are updated with pci_update_resource() in this PPC
specific scenario. There are other two situations where the IOV BARs are
updated: PCI resource resizing and allocation during system booting or hot
adding PF. The VF shouldn't be enabled in both cases when updating IOV BARs.

I think you suggest to add check in pci_update_resource(): Don't disable
PF's memory decoding when updating VF BARs. I will send updated revision
if it's what you want.

	/*
	 * The PF might be functional when updating its IOV BARs. So PF's
	 * memory decoding shouldn't be disabled when updating its IOV BARs.
	 */
	disable = (res->flags & IORESOURCE_MEM_64) && !dev->mmio_always_on;
#ifdef CONFIG_PCI_IOV
	disable &= !(resno >= PCI_IOV_RESOURCES &&
		     resno <= PCI_IOV_RESOURCE_END);
#endif
	if (disable) {
		pci_read_config_word(dev, PCI_COMMAND, &cmd);
		pci_write_config_word(dev, PCI_COMMAND,
				      cmd & ~PCI_COMMAND_MEMORY);
	}

Thanks,
Gavin

>>    sriov_numvfs_store
>>    pdev->driver->sriov_configure
>>    mlx5_core_sriov_configure
>>    pci_enable_sriov
>>    sriov_enable
>>    pcibios_sriov_enable
>>    pnv_pci_sriov_enable
>>    pnv_pci_vf_resource_shift
>>    pci_update_resource
>> 
>> This doesn't change the PF's memory decoding in the path by introducing
>> additional parameter (@mmio_force_on) to pci_update_resource() in
>> the above path.
>
>> 
>> Reported-by: Carol Soto <clsoto@us.ibm.com>
>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> Tested-by: Carol Soto <clsoto@us.ibm.com>
>> ---
>>  arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
>>  drivers/pci/iov.c                         | 2 +-
>>  drivers/pci/pci.c                         | 2 +-
>>  drivers/pci/setup-res.c                   | 9 +++++----
>>  include/linux/pci.h                       | 2 +-
>>  5 files changed, 9 insertions(+), 8 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 38a5c65..f4ccc5b 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1006,7 +1006,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>  		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
>>  			 i, &res2, res, (offset > 0) ? "En" : "Dis",
>>  			 num_vfs, offset);
>> -		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
>> +		pci_update_resource(dev, i + PCI_IOV_RESOURCES, true);
>>  	}
>>  	return 0;
>>  }
>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>> index f1343f0..db31966 100644
>> --- a/drivers/pci/iov.c
>> +++ b/drivers/pci/iov.c
>> @@ -511,7 +511,7 @@ static void sriov_restore_state(struct pci_dev *dev)
>>  		return;
>>  
>>  	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
>> -		pci_update_resource(dev, i);
>> +		pci_update_resource(dev, i, false);
>>  
>>  	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
>>  	pci_iov_set_numvfs(dev, iov->num_VFs);
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index aab9d51..87a33c0 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -545,7 +545,7 @@ static void pci_restore_bars(struct pci_dev *dev)
>>  		return;
>>  
>>  	for (i = 0; i < PCI_BRIDGE_RESOURCES; i++)
>> -		pci_update_resource(dev, i);
>> +		pci_update_resource(dev, i, false);
>>  }
>>  
>>  static const struct pci_platform_pm_ops *pci_platform_pm;
>> diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
>> index 66c4d8f..e8a50ff 100644
>> --- a/drivers/pci/setup-res.c
>> +++ b/drivers/pci/setup-res.c
>> @@ -26,7 +26,7 @@
>>  #include "pci.h"
>>  
>>  
>> -void pci_update_resource(struct pci_dev *dev, int resno)
>> +void pci_update_resource(struct pci_dev *dev, int resno, bool mmio_force_on)
>>  {
>>  	struct pci_bus_region region;
>>  	bool disable;
>> @@ -81,7 +81,8 @@ void pci_update_resource(struct pci_dev *dev, int resno)
>>  	 * disable decoding so that a half-updated BAR won't conflict
>>  	 * with another device.
>>  	 */
>> -	disable = (res->flags & IORESOURCE_MEM_64) && !dev->mmio_always_on;
>> +	disable = (res->flags & IORESOURCE_MEM_64) &&
>> +		  !mmio_force_on && !dev->mmio_always_on;
>>  	if (disable) {
>>  		pci_read_config_word(dev, PCI_COMMAND, &cmd);
>>  		pci_write_config_word(dev, PCI_COMMAND,
>> @@ -310,7 +311,7 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
>>  	res->flags &= ~IORESOURCE_STARTALIGN;
>>  	dev_info(&dev->dev, "BAR %d: assigned %pR\n", resno, res);
>>  	if (resno < PCI_BRIDGE_RESOURCES)
>> -		pci_update_resource(dev, resno);
>> +		pci_update_resource(dev, resno, false);
>>  
>>  	return 0;
>>  }
>> @@ -350,7 +351,7 @@ int pci_reassign_resource(struct pci_dev *dev, int resno, resource_size_t addsiz
>>  	dev_info(&dev->dev, "BAR %d: reassigned %pR (expanded by %#llx)\n",
>>  		 resno, res, (unsigned long long) addsize);
>>  	if (resno < PCI_BRIDGE_RESOURCES)
>> -		pci_update_resource(dev, resno);
>> +		pci_update_resource(dev, resno, false);
>>  
>>  	return 0;
>>  }
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 0ab8359..99231d1 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -1039,7 +1039,7 @@ int pci_try_reset_bus(struct pci_bus *bus);
>>  void pci_reset_secondary_bus(struct pci_dev *dev);
>>  void pcibios_reset_secondary_bus(struct pci_dev *dev);
>>  void pci_reset_bridge_secondary_bus(struct pci_dev *dev);
>> -void pci_update_resource(struct pci_dev *dev, int resno);
>> +void pci_update_resource(struct pci_dev *dev, int resno, bool mmio_force_on);
>>  int __must_check pci_assign_resource(struct pci_dev *dev, int i);
>>  int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
>>  int pci_select_bars(struct pci_dev *dev, unsigned long flags);
>> -- 
>> 2.1.0
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas Oct. 24, 2016, 2:03 p.m. UTC | #3
On Mon, Oct 24, 2016 at 10:28:02AM +1100, Gavin Shan wrote:
> On Fri, Oct 21, 2016 at 11:50:34AM -0500, Bjorn Helgaas wrote:
> >Hi Gavin,
> >
> >On Fri, Sep 30, 2016 at 09:47:50AM +1000, Gavin Shan wrote:
> >> pci_update_resource() might be called to update (shift) IOV BARs
> >> in PPC PowerNV specific pcibios_sriov_enable() when enabling PF's
> >> SRIOV capability. At that point, the PF have been functional if
> >> the SRIOV is enabled through sysfs entry "sriov_numvfs". The PF's
> >> memory decoding (0x2 in PCI_COMMAND) shouldn't be disabled when
> >> updating its IOV BARs with pci_update_resource(). Otherwise, we
> >> receives EEH error caused by MMIO access to PF's memory BARs during
> >> the window when PF's memory decoding is disabled.
> >
> >The fact that you get EEH errors is irrelevant.  We can't write code
> >simply to avoid errors -- we have to write code to make the system
> >work correctly.
> >
> >I do not want to add a "mmio_force_on" parameter to
> >pci_update_resource().  That puts the burden on the caller to
> >understand this subtle issue.  If the caller passes mmio_force_on=1
> >when it shouldn't, things will almost always work, but once in a blue
> >moon a half-updated BAR will conflict with some other device in the
> >system, and we'll have an unreproducible, undebuggable crash.
> >
> 
> Bjorn, thanks for your comments. Yes, the EEH error was caused by MMIO
> access to PF's normal BARs, not VF BARs. Yeah, I also had the conern
> that adding parameter to pci_update_resource() increases the visible
> complexity to the caller of the function.
> 
> >What you do need is an explanation of why it's safe to non-atomically
> >update a VF BARx in the SR-IOV capability.  I think this probably
> >involves the VF MSE bit, and you probably have to either disable VFs
> >completely or clear the MSE bit while you're updating the VF BARx.  We
> >should be able to do this inside pci_update_resource() without
> >changing the interface.
> >
> 
> Yes, It's what PATCH[1/2] does: (PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE)
> are set after VF BARs are updated with pci_update_resource() in this PPC
> specific scenario. There are other two situations where the IOV BARs are
> updated: PCI resource resizing and allocation during system booting or hot
> adding PF. The VF shouldn't be enabled in both cases when updating IOV BARs.
> 
> I think you suggest to add check in pci_update_resource(): Don't disable
> PF's memory decoding when updating VF BARs. I will send updated revision
> if it's what you want.
> 
> 	/*
> 	 * The PF might be functional when updating its IOV BARs. So PF's
> 	 * memory decoding shouldn't be disabled when updating its IOV BARs.
> 	 */
> 	disable = (res->flags & IORESOURCE_MEM_64) && !dev->mmio_always_on;
> #ifdef CONFIG_PCI_IOV
> 	disable &= !(resno >= PCI_IOV_RESOURCES &&
> 		     resno <= PCI_IOV_RESOURCE_END);
> #endif
> 	if (disable) {
> 		pci_read_config_word(dev, PCI_COMMAND, &cmd);
> 		pci_write_config_word(dev, PCI_COMMAND,
> 				      cmd & ~PCI_COMMAND_MEMORY);
> 	}

Not exactly.  A 64-bit BAR cannot be updated atomically.  The whole
point of this exercise is to make sure that when we update such a BAR,
whether it is a normal PCI BAR or an SR-IOV BAR, the transient state
does not conflict with anything else in the system.

For example, consider two devices that do not conflict:

  device A BAR 0: 0x00000000_20000000
  device B BAR 0: 0x00000000_40000000

We want to update A's BAR 0 to 0x00000001_40000000.  We can't do the
update atomically, so we have this sequence:

  before update:            device A BAR 0: 0x00000000_20000000
  after writing lower half: device A BAR 0: 0x00000000_40000000
  after writing upper half: device A BAR 0: 0x00000001_40000000

If the driver for device B accesses B between the writes, both A and B
may respond, which is a fatal error.

Probably the *best* thing would be to make pci_update_resource()
return an error if it's asked to update a BAR that is enabled, but I
haven't looked at all the callers to see whether that's practical.

The current strategy in pci_update_resource() is to clear
PCI_COMMAND_MEMORY when updating a 64-bit memory BAR.  This only
applies to the regular PCI BARs 0-5.

Extending that strategy to SR-IOV would mean clearing
PCI_SRIOV_CTRL_MSE when updating a 64-bit VF BAR.  Obviously you
wouldn't clear PCI_COMMAND_MEMORY in this case because
PCI_COMMAND_MEMORY doesn't affect the VF BARs.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan Oct. 25, 2016, 1:47 a.m. UTC | #4
On Mon, Oct 24, 2016 at 09:03:16AM -0500, Bjorn Helgaas wrote:
>On Mon, Oct 24, 2016 at 10:28:02AM +1100, Gavin Shan wrote:
>> On Fri, Oct 21, 2016 at 11:50:34AM -0500, Bjorn Helgaas wrote:
>> >Hi Gavin,
>> >
>> >On Fri, Sep 30, 2016 at 09:47:50AM +1000, Gavin Shan wrote:
>> >> pci_update_resource() might be called to update (shift) IOV BARs
>> >> in PPC PowerNV specific pcibios_sriov_enable() when enabling PF's
>> >> SRIOV capability. At that point, the PF have been functional if
>> >> the SRIOV is enabled through sysfs entry "sriov_numvfs". The PF's
>> >> memory decoding (0x2 in PCI_COMMAND) shouldn't be disabled when
>> >> updating its IOV BARs with pci_update_resource(). Otherwise, we
>> >> receives EEH error caused by MMIO access to PF's memory BARs during
>> >> the window when PF's memory decoding is disabled.
>> >
>> >The fact that you get EEH errors is irrelevant.  We can't write code
>> >simply to avoid errors -- we have to write code to make the system
>> >work correctly.
>> >
>> >I do not want to add a "mmio_force_on" parameter to
>> >pci_update_resource().  That puts the burden on the caller to
>> >understand this subtle issue.  If the caller passes mmio_force_on=1
>> >when it shouldn't, things will almost always work, but once in a blue
>> >moon a half-updated BAR will conflict with some other device in the
>> >system, and we'll have an unreproducible, undebuggable crash.
>> >
>> 
>> Bjorn, thanks for your comments. Yes, the EEH error was caused by MMIO
>> access to PF's normal BARs, not VF BARs. Yeah, I also had the conern
>> that adding parameter to pci_update_resource() increases the visible
>> complexity to the caller of the function.
>> 
>> >What you do need is an explanation of why it's safe to non-atomically
>> >update a VF BARx in the SR-IOV capability.  I think this probably
>> >involves the VF MSE bit, and you probably have to either disable VFs
>> >completely or clear the MSE bit while you're updating the VF BARx.  We
>> >should be able to do this inside pci_update_resource() without
>> >changing the interface.
>> >
>> 
>> Yes, It's what PATCH[1/2] does: (PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE)
>> are set after VF BARs are updated with pci_update_resource() in this PPC
>> specific scenario. There are other two situations where the IOV BARs are
>> updated: PCI resource resizing and allocation during system booting or hot
>> adding PF. The VF shouldn't be enabled in both cases when updating IOV BARs.
>> 
>> I think you suggest to add check in pci_update_resource(): Don't disable
>> PF's memory decoding when updating VF BARs. I will send updated revision
>> if it's what you want.
>> 
>> 	/*
>> 	 * The PF might be functional when updating its IOV BARs. So PF's
>> 	 * memory decoding shouldn't be disabled when updating its IOV BARs.
>> 	 */
>> 	disable = (res->flags & IORESOURCE_MEM_64) && !dev->mmio_always_on;
>> #ifdef CONFIG_PCI_IOV
>> 	disable &= !(resno >= PCI_IOV_RESOURCES &&
>> 		     resno <= PCI_IOV_RESOURCE_END);
>> #endif
>> 	if (disable) {
>> 		pci_read_config_word(dev, PCI_COMMAND, &cmd);
>> 		pci_write_config_word(dev, PCI_COMMAND,
>> 				      cmd & ~PCI_COMMAND_MEMORY);
>> 	}
>
>Not exactly.  A 64-bit BAR cannot be updated atomically.  The whole
>point of this exercise is to make sure that when we update such a BAR,
>whether it is a normal PCI BAR or an SR-IOV BAR, the transient state
>does not conflict with anything else in the system.
>
>For example, consider two devices that do not conflict:
>
>  device A BAR 0: 0x00000000_20000000
>  device B BAR 0: 0x00000000_40000000
>
>We want to update A's BAR 0 to 0x00000001_40000000.  We can't do the
>update atomically, so we have this sequence:
>
>  before update:            device A BAR 0: 0x00000000_20000000
>  after writing lower half: device A BAR 0: 0x00000000_40000000
>  after writing upper half: device A BAR 0: 0x00000001_40000000
>
>If the driver for device B accesses B between the writes, both A and B
>may respond, which is a fatal error.
>

Thanks for the detailed explanation. Apart from pdev->mmio_always_on,
the normal BARs are updated with PCI_COMMAND_MEMORY cleared. With
PATCH[1/2], The IOV BARs are updated with PCI_SRIOV_CTRL_MSE and
PCI_SRIOV_CTRL_VFE cleared in the problematic path the patch tried
to address initially. However, I prefer your suggestion at end of
the reply.

>Probably the *best* thing would be to make pci_update_resource()
>return an error if it's asked to update a BAR that is enabled, but I
>haven't looked at all the callers to see whether that's practical.
>

It arguably enforces users to tackle the limitation: the memory
decoding (PCI_COMMAND_MEMORY or PCI_SRIOV_CTRL_VFE) should be
disabled before updating the BARs with pci_update_resource().
It means user cannot call APIs in relatively relaxed order
as before. For example, pci_enable_device() followed by
pci_update_resource(), which is allowed previously, won't
work.

We can hide the limitation inside pci_update_resource() because
nobody accesses the device's memory space that is being updated
by pci_update_resource(). 

>The current strategy in pci_update_resource() is to clear
>PCI_COMMAND_MEMORY when updating a 64-bit memory BAR.  This only
>applies to the regular PCI BARs 0-5.
>
>Extending that strategy to SR-IOV would mean clearing
>PCI_SRIOV_CTRL_MSE when updating a 64-bit VF BAR.  Obviously you
>wouldn't clear PCI_COMMAND_MEMORY in this case because
>PCI_COMMAND_MEMORY doesn't affect the VF BARs.
>

Yeah, it would be the solution to have. If you agree, I will post
updatd version according to this: Clearing PCI_SRIOV_CTRL_MSE when
updating IOV BARs. The bit won't be touched if pdev->mmio_always_on
is true.

Thanks,
Gavin

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas Oct. 25, 2016, 3:51 a.m. UTC | #5
On Tue, Oct 25, 2016 at 12:47:28PM +1100, Gavin Shan wrote:
> On Mon, Oct 24, 2016 at 09:03:16AM -0500, Bjorn Helgaas wrote:
> >On Mon, Oct 24, 2016 at 10:28:02AM +1100, Gavin Shan wrote:
> >> On Fri, Oct 21, 2016 at 11:50:34AM -0500, Bjorn Helgaas wrote:
> >> >Hi Gavin,
> >> >
> >> >On Fri, Sep 30, 2016 at 09:47:50AM +1000, Gavin Shan wrote:
> >> >> pci_update_resource() might be called to update (shift) IOV BARs
> >> >> in PPC PowerNV specific pcibios_sriov_enable() when enabling PF's
> >> >> SRIOV capability. At that point, the PF have been functional if
> >> >> the SRIOV is enabled through sysfs entry "sriov_numvfs". The PF's
> >> >> memory decoding (0x2 in PCI_COMMAND) shouldn't be disabled when
> >> >> updating its IOV BARs with pci_update_resource(). Otherwise, we
> >> >> receives EEH error caused by MMIO access to PF's memory BARs during
> >> >> the window when PF's memory decoding is disabled.
> >> >
> >> >The fact that you get EEH errors is irrelevant.  We can't write code
> >> >simply to avoid errors -- we have to write code to make the system
> >> >work correctly.
> >> >
> >> >I do not want to add a "mmio_force_on" parameter to
> >> >pci_update_resource().  That puts the burden on the caller to
> >> >understand this subtle issue.  If the caller passes mmio_force_on=1
> >> >when it shouldn't, things will almost always work, but once in a blue
> >> >moon a half-updated BAR will conflict with some other device in the
> >> >system, and we'll have an unreproducible, undebuggable crash.
> >> >
> >> 
> >> Bjorn, thanks for your comments. Yes, the EEH error was caused by MMIO
> >> access to PF's normal BARs, not VF BARs. Yeah, I also had the conern
> >> that adding parameter to pci_update_resource() increases the visible
> >> complexity to the caller of the function.
> >> 
> >> >What you do need is an explanation of why it's safe to non-atomically
> >> >update a VF BARx in the SR-IOV capability.  I think this probably
> >> >involves the VF MSE bit, and you probably have to either disable VFs
> >> >completely or clear the MSE bit while you're updating the VF BARx.  We
> >> >should be able to do this inside pci_update_resource() without
> >> >changing the interface.
> >> >
> >> 
> >> Yes, It's what PATCH[1/2] does: (PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE)
> >> are set after VF BARs are updated with pci_update_resource() in this PPC
> >> specific scenario. There are other two situations where the IOV BARs are
> >> updated: PCI resource resizing and allocation during system booting or hot
> >> adding PF. The VF shouldn't be enabled in both cases when updating IOV BARs.
> >> 
> >> I think you suggest to add check in pci_update_resource(): Don't disable
> >> PF's memory decoding when updating VF BARs. I will send updated revision
> >> if it's what you want.
> >> 
> >> 	/*
> >> 	 * The PF might be functional when updating its IOV BARs. So PF's
> >> 	 * memory decoding shouldn't be disabled when updating its IOV BARs.
> >> 	 */
> >> 	disable = (res->flags & IORESOURCE_MEM_64) && !dev->mmio_always_on;
> >> #ifdef CONFIG_PCI_IOV
> >> 	disable &= !(resno >= PCI_IOV_RESOURCES &&
> >> 		     resno <= PCI_IOV_RESOURCE_END);
> >> #endif
> >> 	if (disable) {
> >> 		pci_read_config_word(dev, PCI_COMMAND, &cmd);
> >> 		pci_write_config_word(dev, PCI_COMMAND,
> >> 				      cmd & ~PCI_COMMAND_MEMORY);
> >> 	}
> >
> >Not exactly.  A 64-bit BAR cannot be updated atomically.  The whole
> >point of this exercise is to make sure that when we update such a BAR,
> >whether it is a normal PCI BAR or an SR-IOV BAR, the transient state
> >does not conflict with anything else in the system.
> >
> >For example, consider two devices that do not conflict:
> >
> >  device A BAR 0: 0x00000000_20000000
> >  device B BAR 0: 0x00000000_40000000
> >
> >We want to update A's BAR 0 to 0x00000001_40000000.  We can't do the
> >update atomically, so we have this sequence:
> >
> >  before update:            device A BAR 0: 0x00000000_20000000
> >  after writing lower half: device A BAR 0: 0x00000000_40000000
> >  after writing upper half: device A BAR 0: 0x00000001_40000000
> >
> >If the driver for device B accesses B between the writes, both A and B
> >may respond, which is a fatal error.
> >
> 
> Thanks for the detailed explanation. Apart from pdev->mmio_always_on,
> the normal BARs are updated with PCI_COMMAND_MEMORY cleared. With
> PATCH[1/2], The IOV BARs are updated with PCI_SRIOV_CTRL_MSE and
> PCI_SRIOV_CTRL_VFE cleared in the problematic path the patch tried
> to address initially. However, I prefer your suggestion at end of
> the reply.
> 
> >Probably the *best* thing would be to make pci_update_resource()
> >return an error if it's asked to update a BAR that is enabled, but I
> >haven't looked at all the callers to see whether that's practical.
> >
> 
> It arguably enforces users to tackle the limitation: the memory
> decoding (PCI_COMMAND_MEMORY or PCI_SRIOV_CTRL_VFE) should be
> disabled before updating the BARs with pci_update_resource().
> It means user cannot call APIs in relatively relaxed order
> as before. For example, pci_enable_device() followed by
> pci_update_resource(), which is allowed previously, won't
> work.

That specific case (pci_enable_device() followed by
pci_update_resource()) should *not* work.  pci_enable_device() is
normally called by a driver's .probe() method, and after we call a
.probe() method, the PCI core shouldn't touch the device at all
because there's no means of mutual exclusion between the driver and
the PCI core.

I think pci_update_resource() should only be called in situations
where the caller already knows that nobody is using the device.  For
regular PCI BARs, that doesn't necessarily mean PCI_COMMAND_MEMORY is
turned off, because firmware leaves PCI_COMMAND_MEMORY enabled for
many devices, even though nobody is using them.

Anyway, I think that's a project for another day.  That's too much to
tackle for the limited problem you're trying to solve.

> We can hide the limitation inside pci_update_resource() because
> nobody accesses the device's memory space that is being updated
> by pci_update_resource(). 
> 
> >The current strategy in pci_update_resource() is to clear
> >PCI_COMMAND_MEMORY when updating a 64-bit memory BAR.  This only
> >applies to the regular PCI BARs 0-5.
> >
> >Extending that strategy to SR-IOV would mean clearing
> >PCI_SRIOV_CTRL_MSE when updating a 64-bit VF BAR.  Obviously you
> >wouldn't clear PCI_COMMAND_MEMORY in this case because
> >PCI_COMMAND_MEMORY doesn't affect the VF BARs.
> >
> 
> Yeah, it would be the solution to have. If you agree, I will post
> updatd version according to this: Clearing PCI_SRIOV_CTRL_MSE when
> updating IOV BARs. The bit won't be touched if pdev->mmio_always_on
> is true.

I think you should ignore pdev->mmio_always_on for IOV BARs.
mmio_always_on is basically a workaround for devices that either don't
follow the spec or where we didn't completely understand the problem.
I don't think there's any reason to set mmio_always_on for SR-IOV
devices.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan Oct. 26, 2016, 1:02 a.m. UTC | #6
On Mon, Oct 24, 2016 at 10:51:13PM -0500, Bjorn Helgaas wrote:
>On Tue, Oct 25, 2016 at 12:47:28PM +1100, Gavin Shan wrote:
>> On Mon, Oct 24, 2016 at 09:03:16AM -0500, Bjorn Helgaas wrote:
>> >On Mon, Oct 24, 2016 at 10:28:02AM +1100, Gavin Shan wrote:
>> >> On Fri, Oct 21, 2016 at 11:50:34AM -0500, Bjorn Helgaas wrote:
>> >> >On Fri, Sep 30, 2016 at 09:47:50AM +1000, Gavin Shan wrote:

.../...

>
>That specific case (pci_enable_device() followed by
>pci_update_resource()) should *not* work.  pci_enable_device() is
>normally called by a driver's .probe() method, and after we call a
>.probe() method, the PCI core shouldn't touch the device at all
>because there's no means of mutual exclusion between the driver and
>the PCI core.
>
>I think pci_update_resource() should only be called in situations
>where the caller already knows that nobody is using the device.  For
>regular PCI BARs, that doesn't necessarily mean PCI_COMMAND_MEMORY is
>turned off, because firmware leaves PCI_COMMAND_MEMORY enabled for
>many devices, even though nobody is using them.
>
>Anyway, I think that's a project for another day.  That's too much to
>tackle for the limited problem you're trying to solve.
>

Bjorn, it's all about discussion. Please take your time and reply when
you have bandwidth.

Well, some drivers break the order and expects the relaxed order to work.
One example is drivers/char/agp/efficeon-agp.c::agp_efficeon_probe(). I
didn't check all usage cases.

I think it's hard for user, who calls pci_update_resource(), to know
that nobody is using the devcie (limited to memory BARs as we concern).
The memory write is usually non-posted transaction and it can be on
the way to target device when pci_update_resource() is called. So which
one tranaction will be completed first (disabling memory decoding and
memory write). I guess it can happen even with the mutual exclusion,
especially on SMP system. Yes, the situation is worse without the
synchronization.

.../...

>> 
>> Yeah, it would be the solution to have. If you agree, I will post
>> updatd version according to this: Clearing PCI_SRIOV_CTRL_MSE when
>> updating IOV BARs. The bit won't be touched if pdev->mmio_always_on
>> is true.
>
>I think you should ignore pdev->mmio_always_on for IOV BARs.
>mmio_always_on is basically a workaround for devices that either don't
>follow the spec or where we didn't completely understand the problem.
>I don't think there's any reason to set mmio_always_on for SR-IOV
>devices.
>

Agree, thanks for the comments again. I will post updated version
shortly.

Thanks,
Gavin

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 38a5c65..f4ccc5b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1006,7 +1006,7 @@  static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
 			 i, &res2, res, (offset > 0) ? "En" : "Dis",
 			 num_vfs, offset);
-		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+		pci_update_resource(dev, i + PCI_IOV_RESOURCES, true);
 	}
 	return 0;
 }
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index f1343f0..db31966 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -511,7 +511,7 @@  static void sriov_restore_state(struct pci_dev *dev)
 		return;
 
 	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
-		pci_update_resource(dev, i);
+		pci_update_resource(dev, i, false);
 
 	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
 	pci_iov_set_numvfs(dev, iov->num_VFs);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index aab9d51..87a33c0 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -545,7 +545,7 @@  static void pci_restore_bars(struct pci_dev *dev)
 		return;
 
 	for (i = 0; i < PCI_BRIDGE_RESOURCES; i++)
-		pci_update_resource(dev, i);
+		pci_update_resource(dev, i, false);
 }
 
 static const struct pci_platform_pm_ops *pci_platform_pm;
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 66c4d8f..e8a50ff 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -26,7 +26,7 @@ 
 #include "pci.h"
 
 
-void pci_update_resource(struct pci_dev *dev, int resno)
+void pci_update_resource(struct pci_dev *dev, int resno, bool mmio_force_on)
 {
 	struct pci_bus_region region;
 	bool disable;
@@ -81,7 +81,8 @@  void pci_update_resource(struct pci_dev *dev, int resno)
 	 * disable decoding so that a half-updated BAR won't conflict
 	 * with another device.
 	 */
-	disable = (res->flags & IORESOURCE_MEM_64) && !dev->mmio_always_on;
+	disable = (res->flags & IORESOURCE_MEM_64) &&
+		  !mmio_force_on && !dev->mmio_always_on;
 	if (disable) {
 		pci_read_config_word(dev, PCI_COMMAND, &cmd);
 		pci_write_config_word(dev, PCI_COMMAND,
@@ -310,7 +311,7 @@  int pci_assign_resource(struct pci_dev *dev, int resno)
 	res->flags &= ~IORESOURCE_STARTALIGN;
 	dev_info(&dev->dev, "BAR %d: assigned %pR\n", resno, res);
 	if (resno < PCI_BRIDGE_RESOURCES)
-		pci_update_resource(dev, resno);
+		pci_update_resource(dev, resno, false);
 
 	return 0;
 }
@@ -350,7 +351,7 @@  int pci_reassign_resource(struct pci_dev *dev, int resno, resource_size_t addsiz
 	dev_info(&dev->dev, "BAR %d: reassigned %pR (expanded by %#llx)\n",
 		 resno, res, (unsigned long long) addsize);
 	if (resno < PCI_BRIDGE_RESOURCES)
-		pci_update_resource(dev, resno);
+		pci_update_resource(dev, resno, false);
 
 	return 0;
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 0ab8359..99231d1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1039,7 +1039,7 @@  int pci_try_reset_bus(struct pci_bus *bus);
 void pci_reset_secondary_bus(struct pci_dev *dev);
 void pcibios_reset_secondary_bus(struct pci_dev *dev);
 void pci_reset_bridge_secondary_bus(struct pci_dev *dev);
-void pci_update_resource(struct pci_dev *dev, int resno);
+void pci_update_resource(struct pci_dev *dev, int resno, bool mmio_force_on);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);