diff mbox series

[v5,4/7] PCI/ATS: Add PRI support for PCIe VF devices

Message ID 827d051ef8c8bbfa815908ce927e607870780cb6.1564702313.git.sathyanarayanan.kuppuswamy@linux.intel.com (mailing list archive)
State Superseded, archived
Headers show
Series Fix PF/VF dependency issue | expand

Commit Message

Kuppuswamy Sathyanarayanan Aug. 2, 2019, 12:06 a.m. UTC
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>

When IOMMU tries to enable Page Request Interface (PRI) for VF device
in iommu_enable_dev_iotlb(), it always fails because PRI support for
PCIe VF device is currently broken. Current implementation expects
the given PCIe device (PF & VF) to implement PRI capability before
enabling the PRI support. But this assumption is incorrect. As per PCIe
spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
PRI of the PF and not implement it. Hence we need to create exception
for handling the PRI support for PCIe VF device.

Also, since PRI is a shared resource between PF/VF, following rules
should apply.

1. Use proper locking before accessing/modifying PF resources in VF
   PRI enable/disable call.
2. Use reference count logic to track the usage of PRI resource.
3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Suggested-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
---
 drivers/pci/ats.c   | 143 ++++++++++++++++++++++++++++++++++----------
 include/linux/pci.h |   2 +
 2 files changed, 112 insertions(+), 33 deletions(-)

Comments

Bjorn Helgaas Aug. 12, 2019, 8:04 p.m. UTC | #1
On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
> When IOMMU tries to enable Page Request Interface (PRI) for VF device
> in iommu_enable_dev_iotlb(), it always fails because PRI support for
> PCIe VF device is currently broken. Current implementation expects
> the given PCIe device (PF & VF) to implement PRI capability before
> enabling the PRI support. But this assumption is incorrect. As per PCIe
> spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> PRI of the PF and not implement it. Hence we need to create exception
> for handling the PRI support for PCIe VF device.
> 
> Also, since PRI is a shared resource between PF/VF, following rules
> should apply.
> 
> 1. Use proper locking before accessing/modifying PF resources in VF
>    PRI enable/disable call.
> 2. Use reference count logic to track the usage of PRI resource.
> 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
> 
> Cc: Ashok Raj <ashok.raj@intel.com>
> Cc: Keith Busch <keith.busch@intel.com>
> Suggested-by: Ashok Raj <ashok.raj@intel.com>
> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> ---
>  drivers/pci/ats.c   | 143 ++++++++++++++++++++++++++++++++++----------
>  include/linux/pci.h |   2 +
>  2 files changed, 112 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
> index 1f4be27a071d..079dc5444444 100644
> --- a/drivers/pci/ats.c
> +++ b/drivers/pci/ats.c
> @@ -189,6 +189,8 @@ void pci_pri_init(struct pci_dev *pdev)
>  	if (pdev->is_virtfn)
>  		return;
>  
> +	mutex_init(&pdev->pri_lock);
> +
>  	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
>  	if (!pos)
>  		return;
> @@ -221,29 +223,57 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
>  {
>  	u16 control, status;
>  	u32 max_requests;
> +	int ret = 0;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
> -	if (WARN_ON(pdev->pri_enabled))
> -		return -EBUSY;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return -EINVAL;
> +	if (WARN_ON(pdev->pri_enabled)) {
> +		ret = -EBUSY;
> +		goto pri_unlock;
> +	}
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
> -	if (!(status & PCI_PRI_STATUS_STOPPED))
> -		return -EBUSY;
> +	if (!pf->pri_cap) {
> +		ret = -EINVAL;
> +		goto pri_unlock;
> +	}
> +
> +	if (pdev->is_virtfn && pf->pri_enabled)
> +		goto update_status;
> +
> +	/*
> +	 * Before updating PRI registers, make sure there is no
> +	 * outstanding PRI requests.
> +	 */
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
> +	if (!(status & PCI_PRI_STATUS_STOPPED)) {
> +		ret = -EBUSY;
> +		goto pri_unlock;
> +	}
>  
> -	pci_read_config_dword(pdev, pdev->pri_cap + PCI_PRI_MAX_REQ,
> -			      &max_requests);
> +	pci_read_config_dword(pf, pf->pri_cap + PCI_PRI_MAX_REQ, &max_requests);
>  	reqs = min(max_requests, reqs);
> -	pdev->pri_reqs_alloc = reqs;
> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> +	pf->pri_reqs_alloc = reqs;
> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>  
>  	control = PCI_PRI_CTRL_ENABLE;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>  
> -	pdev->pri_enabled = 1;
> +	/*
> +	 * If PRI is not already enabled in PF, increment the PF
> +	 * pri_ref_cnt to track the usage of PRI interface.
> +	 */
> +	if (pdev->is_virtfn && !pf->pri_enabled) {
> +		atomic_inc(&pf->pri_ref_cnt);
> +		pf->pri_enabled = 1;
> +	}
>  
> -	return 0;
> +update_status:
> +	atomic_inc(&pf->pri_ref_cnt);
> +	pdev->pri_enabled = 1;
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(pci_enable_pri);
>  
> @@ -256,18 +286,30 @@ EXPORT_SYMBOL_GPL(pci_enable_pri);
>  void pci_disable_pri(struct pci_dev *pdev)
>  {
>  	u16 control;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
> -	if (WARN_ON(!pdev->pri_enabled))
> -		return;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return;
> +	if (WARN_ON(!pdev->pri_enabled) || !pf->pri_cap)
> +		goto pri_unlock;
> +
> +	atomic_dec(&pf->pri_ref_cnt);
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, &control);
> +	/*
> +	 * If pri_ref_cnt is not zero, then don't modify hardware
> +	 * registers.
> +	 */
> +	if (atomic_read(&pf->pri_ref_cnt))
> +		goto done;
> +
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
>  	control &= ~PCI_PRI_CTRL_ENABLE;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>  
> +done:
>  	pdev->pri_enabled = 0;
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
>  }
>  EXPORT_SYMBOL_GPL(pci_disable_pri);
>  
> @@ -277,17 +319,31 @@ EXPORT_SYMBOL_GPL(pci_disable_pri);
>   */
>  void pci_restore_pri_state(struct pci_dev *pdev)
>  {
> -	u16 control = PCI_PRI_CTRL_ENABLE;
> -	u32 reqs = pdev->pri_reqs_alloc;
> +	u16 control;
> +	u32 reqs;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
>  	if (!pdev->pri_enabled)
>  		return;
>  
> -	if (!pdev->pri_cap)
> +	if (!pf->pri_cap)
>  		return;
>  
> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	mutex_lock(&pf->pri_lock);
> +
> +	/* If PRI is already enabled by other VF's or PF, return */
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
> +	if (control & PCI_PRI_CTRL_ENABLE)
> +		goto pri_unlock;
> +
> +	reqs = pf->pri_reqs_alloc;
> +	control = PCI_PRI_CTRL_ENABLE;
> +
> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);

Why use "control" here instead of just PCI_PRI_CTRL_ENABLE?

> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
>  }
>  EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>  
> @@ -300,18 +356,32 @@ EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>   */
>  int pci_reset_pri(struct pci_dev *pdev)
>  {
> +	struct pci_dev *pf = pci_physfn(pdev);
>  	u16 control;
> +	int ret = 0;
>  
> -	if (WARN_ON(pdev->pri_enabled))
> -		return -EBUSY;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return -EINVAL;
> +	if (WARN_ON(pdev->pri_enabled)) {
> +		ret = -EBUSY;
> +		goto done;
> +	}
> +
> +	if (!pf->pri_cap) {
> +		ret = -EINVAL;
> +		goto done;
> +	}
> +
> +	/* If PRI is already enabled by other VF's or PF, return 0 */
> +	if (pf->pri_enabled)
> +		goto done;
>  
>  	control = PCI_PRI_CTRL_RESET;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>  
> -	return 0;
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);

Also here (you didn't add this one, but "control" is completely
pointless in this function).

> +done:
> +	mutex_unlock(&pf->pri_lock);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(pci_reset_pri);
>  #endif /* CONFIG_PCI_PRI */
> @@ -475,11 +545,18 @@ EXPORT_SYMBOL_GPL(pci_pasid_features);
>  int pci_prg_resp_pasid_required(struct pci_dev *pdev)
>  {
>  	u16 status;
> +	struct pci_dev *pf = pci_physfn(pdev);
> +
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> +	if (!pf->pri_cap) {
> +		mutex_unlock(&pf->pri_lock);
>  		return 0;
> +	}
> +
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
> +	mutex_unlock(&pf->pri_lock);
>  
>  	if (status & PCI_PRI_STATUS_PASID)
>  		return 1;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 27224c0db849..3c9c4c82be27 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -455,8 +455,10 @@ struct pci_dev {
>  	atomic_t	ats_ref_cnt;	/* Number of VFs with ATS enabled */
>  #endif
>  #ifdef CONFIG_PCI_PRI
> +	struct mutex	pri_lock;	/* PRI enable lock */
>  	u16		pri_cap;	/* PRI Capability offset */
>  	u32		pri_reqs_alloc; /* Number of PRI requests allocated */
> +	atomic_t	pri_ref_cnt;	/* Number of PF/VF PRI users */
>  #endif
>  #ifdef CONFIG_PCI_PASID
>  	u16		pasid_cap;	/* PASID Capability offset */
> -- 
> 2.21.0
>
Kuppuswamy Sathyanarayanan Aug. 12, 2019, 9:40 p.m. UTC | #2
Hi,

On 8/12/19 1:04 PM, Bjorn Helgaas wrote:
> On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
>> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>
>> When IOMMU tries to enable Page Request Interface (PRI) for VF device
>> in iommu_enable_dev_iotlb(), it always fails because PRI support for
>> PCIe VF device is currently broken. Current implementation expects
>> the given PCIe device (PF & VF) to implement PRI capability before
>> enabling the PRI support. But this assumption is incorrect. As per PCIe
>> spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
>> PRI of the PF and not implement it. Hence we need to create exception
>> for handling the PRI support for PCIe VF device.
>>
>> Also, since PRI is a shared resource between PF/VF, following rules
>> should apply.
>>
>> 1. Use proper locking before accessing/modifying PF resources in VF
>>     PRI enable/disable call.
>> 2. Use reference count logic to track the usage of PRI resource.
>> 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
>>
>> Cc: Ashok Raj <ashok.raj@intel.com>
>> Cc: Keith Busch <keith.busch@intel.com>
>> Suggested-by: Ashok Raj <ashok.raj@intel.com>
>> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>> ---
>>   drivers/pci/ats.c   | 143 ++++++++++++++++++++++++++++++++++----------
>>   include/linux/pci.h |   2 +
>>   2 files changed, 112 insertions(+), 33 deletions(-)
>>
>> diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
>> index 1f4be27a071d..079dc5444444 100644
>> --- a/drivers/pci/ats.c
>> +++ b/drivers/pci/ats.c
>> @@ -189,6 +189,8 @@ void pci_pri_init(struct pci_dev *pdev)
>>   	if (pdev->is_virtfn)
>>   		return;
>>   
>> +	mutex_init(&pdev->pri_lock);
>> +
>>   	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
>>   	if (!pos)
>>   		return;
>> @@ -221,29 +223,57 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
>>   {
>>   	u16 control, status;
>>   	u32 max_requests;
>> +	int ret = 0;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   
>> -	if (WARN_ON(pdev->pri_enabled))
>> -		return -EBUSY;
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> -		return -EINVAL;
>> +	if (WARN_ON(pdev->pri_enabled)) {
>> +		ret = -EBUSY;
>> +		goto pri_unlock;
>> +	}
>>   
>> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
>> -	if (!(status & PCI_PRI_STATUS_STOPPED))
>> -		return -EBUSY;
>> +	if (!pf->pri_cap) {
>> +		ret = -EINVAL;
>> +		goto pri_unlock;
>> +	}
>> +
>> +	if (pdev->is_virtfn && pf->pri_enabled)
>> +		goto update_status;
>> +
>> +	/*
>> +	 * Before updating PRI registers, make sure there is no
>> +	 * outstanding PRI requests.
>> +	 */
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
>> +	if (!(status & PCI_PRI_STATUS_STOPPED)) {
>> +		ret = -EBUSY;
>> +		goto pri_unlock;
>> +	}
>>   
>> -	pci_read_config_dword(pdev, pdev->pri_cap + PCI_PRI_MAX_REQ,
>> -			      &max_requests);
>> +	pci_read_config_dword(pf, pf->pri_cap + PCI_PRI_MAX_REQ, &max_requests);
>>   	reqs = min(max_requests, reqs);
>> -	pdev->pri_reqs_alloc = reqs;
>> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>> +	pf->pri_reqs_alloc = reqs;
>> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>>   
>>   	control = PCI_PRI_CTRL_ENABLE;
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>>   
>> -	pdev->pri_enabled = 1;
>> +	/*
>> +	 * If PRI is not already enabled in PF, increment the PF
>> +	 * pri_ref_cnt to track the usage of PRI interface.
>> +	 */
>> +	if (pdev->is_virtfn && !pf->pri_enabled) {
>> +		atomic_inc(&pf->pri_ref_cnt);
>> +		pf->pri_enabled = 1;
>> +	}
>>   
>> -	return 0;
>> +update_status:
>> +	atomic_inc(&pf->pri_ref_cnt);
>> +	pdev->pri_enabled = 1;
>> +pri_unlock:
>> +	mutex_unlock(&pf->pri_lock);
>> +	return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(pci_enable_pri);
>>   
>> @@ -256,18 +286,30 @@ EXPORT_SYMBOL_GPL(pci_enable_pri);
>>   void pci_disable_pri(struct pci_dev *pdev)
>>   {
>>   	u16 control;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   
>> -	if (WARN_ON(!pdev->pri_enabled))
>> -		return;
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> -		return;
>> +	if (WARN_ON(!pdev->pri_enabled) || !pf->pri_cap)
>> +		goto pri_unlock;
>> +
>> +	atomic_dec(&pf->pri_ref_cnt);
>>   
>> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, &control);
>> +	/*
>> +	 * If pri_ref_cnt is not zero, then don't modify hardware
>> +	 * registers.
>> +	 */
>> +	if (atomic_read(&pf->pri_ref_cnt))
>> +		goto done;
>> +
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
>>   	control &= ~PCI_PRI_CTRL_ENABLE;
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>>   
>> +done:
>>   	pdev->pri_enabled = 0;
>> +pri_unlock:
>> +	mutex_unlock(&pf->pri_lock);
>>   }
>>   EXPORT_SYMBOL_GPL(pci_disable_pri);
>>   
>> @@ -277,17 +319,31 @@ EXPORT_SYMBOL_GPL(pci_disable_pri);
>>    */
>>   void pci_restore_pri_state(struct pci_dev *pdev)
>>   {
>> -	u16 control = PCI_PRI_CTRL_ENABLE;
>> -	u32 reqs = pdev->pri_reqs_alloc;
>> +	u16 control;
>> +	u32 reqs;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   
>>   	if (!pdev->pri_enabled)
>>   		return;
>>   
>> -	if (!pdev->pri_cap)
>> +	if (!pf->pri_cap)
>>   		return;
>>   
>> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>> +	mutex_lock(&pf->pri_lock);
>> +
>> +	/* If PRI is already enabled by other VF's or PF, return */
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
>> +	if (control & PCI_PRI_CTRL_ENABLE)
>> +		goto pri_unlock;
>> +
>> +	reqs = pf->pri_reqs_alloc;
>> +	control = PCI_PRI_CTRL_ENABLE;
>> +
>> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
> Why use "control" here instead of just PCI_PRI_CTRL_ENABLE?
It can be done. Even in original code, using control did not serve any 
purpose. I just left the implementation as original code.
>
>> +pri_unlock:
>> +	mutex_unlock(&pf->pri_lock);
>>   }
>>   EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>>   
>> @@ -300,18 +356,32 @@ EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>>    */
>>   int pci_reset_pri(struct pci_dev *pdev)
>>   {
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   	u16 control;
>> +	int ret = 0;
>>   
>> -	if (WARN_ON(pdev->pri_enabled))
>> -		return -EBUSY;
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> -		return -EINVAL;
>> +	if (WARN_ON(pdev->pri_enabled)) {
>> +		ret = -EBUSY;
>> +		goto done;
>> +	}
>> +
>> +	if (!pf->pri_cap) {
>> +		ret = -EINVAL;
>> +		goto done;
>> +	}
>> +
>> +	/* If PRI is already enabled by other VF's or PF, return 0 */
>> +	if (pf->pri_enabled)
>> +		goto done;
>>   
>>   	control = PCI_PRI_CTRL_RESET;
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>>   
>> -	return 0;
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
> Also here (you didn't add this one, but "control" is completely
> pointless in this function).
>
>> +done:
>> +	mutex_unlock(&pf->pri_lock);
>> +	return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(pci_reset_pri);
>>   #endif /* CONFIG_PCI_PRI */
>> @@ -475,11 +545,18 @@ EXPORT_SYMBOL_GPL(pci_pasid_features);
>>   int pci_prg_resp_pasid_required(struct pci_dev *pdev)
>>   {
>>   	u16 status;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>> +
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> +	if (!pf->pri_cap) {
>> +		mutex_unlock(&pf->pri_lock);
>>   		return 0;
>> +	}
>> +
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
>>   
>> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
>> +	mutex_unlock(&pf->pri_lock);
>>   
>>   	if (status & PCI_PRI_STATUS_PASID)
>>   		return 1;
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 27224c0db849..3c9c4c82be27 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -455,8 +455,10 @@ struct pci_dev {
>>   	atomic_t	ats_ref_cnt;	/* Number of VFs with ATS enabled */
>>   #endif
>>   #ifdef CONFIG_PCI_PRI
>> +	struct mutex	pri_lock;	/* PRI enable lock */
>>   	u16		pri_cap;	/* PRI Capability offset */
>>   	u32		pri_reqs_alloc; /* Number of PRI requests allocated */
>> +	atomic_t	pri_ref_cnt;	/* Number of PF/VF PRI users */
>>   #endif
>>   #ifdef CONFIG_PCI_PASID
>>   	u16		pasid_cap;	/* PASID Capability offset */
>> -- 
>> 2.21.0
>>
Bjorn Helgaas Aug. 13, 2019, 4:16 a.m. UTC | #3
On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
> When IOMMU tries to enable Page Request Interface (PRI) for VF device
> in iommu_enable_dev_iotlb(), it always fails because PRI support for
> PCIe VF device is currently broken. Current implementation expects
> the given PCIe device (PF & VF) to implement PRI capability before
> enabling the PRI support. But this assumption is incorrect. As per PCIe
> spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> PRI of the PF and not implement it. Hence we need to create exception
> for handling the PRI support for PCIe VF device.
> 
> Also, since PRI is a shared resource between PF/VF, following rules
> should apply.
> 
> 1. Use proper locking before accessing/modifying PF resources in VF
>    PRI enable/disable call.
> 2. Use reference count logic to track the usage of PRI resource.
> 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
> 
> Cc: Ashok Raj <ashok.raj@intel.com>
> Cc: Keith Busch <keith.busch@intel.com>
> Suggested-by: Ashok Raj <ashok.raj@intel.com>
> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> ---
>  drivers/pci/ats.c   | 143 ++++++++++++++++++++++++++++++++++----------
>  include/linux/pci.h |   2 +
>  2 files changed, 112 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
> index 1f4be27a071d..079dc5444444 100644
> --- a/drivers/pci/ats.c
> +++ b/drivers/pci/ats.c
> @@ -189,6 +189,8 @@ void pci_pri_init(struct pci_dev *pdev)
>  	if (pdev->is_virtfn)
>  		return;
>  
> +	mutex_init(&pdev->pri_lock);
> +
>  	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
>  	if (!pos)
>  		return;
> @@ -221,29 +223,57 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
>  {
>  	u16 control, status;
>  	u32 max_requests;
> +	int ret = 0;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
> -	if (WARN_ON(pdev->pri_enabled))
> -		return -EBUSY;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return -EINVAL;
> +	if (WARN_ON(pdev->pri_enabled)) {
> +		ret = -EBUSY;
> +		goto pri_unlock;
> +	}
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
> -	if (!(status & PCI_PRI_STATUS_STOPPED))
> -		return -EBUSY;
> +	if (!pf->pri_cap) {
> +		ret = -EINVAL;
> +		goto pri_unlock;
> +	}
> +
> +	if (pdev->is_virtfn && pf->pri_enabled)
> +		goto update_status;
> +
> +	/*
> +	 * Before updating PRI registers, make sure there is no
> +	 * outstanding PRI requests.
> +	 */
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
> +	if (!(status & PCI_PRI_STATUS_STOPPED)) {
> +		ret = -EBUSY;
> +		goto pri_unlock;
> +	}
>  
> -	pci_read_config_dword(pdev, pdev->pri_cap + PCI_PRI_MAX_REQ,
> -			      &max_requests);
> +	pci_read_config_dword(pf, pf->pri_cap + PCI_PRI_MAX_REQ, &max_requests);
>  	reqs = min(max_requests, reqs);
> -	pdev->pri_reqs_alloc = reqs;
> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> +	pf->pri_reqs_alloc = reqs;
> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>  
>  	control = PCI_PRI_CTRL_ENABLE;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>  
> -	pdev->pri_enabled = 1;
> +	/*
> +	 * If PRI is not already enabled in PF, increment the PF
> +	 * pri_ref_cnt to track the usage of PRI interface.
> +	 */
> +	if (pdev->is_virtfn && !pf->pri_enabled) {
> +		atomic_inc(&pf->pri_ref_cnt);
> +		pf->pri_enabled = 1;
> +	}
>  
> -	return 0;
> +update_status:
> +	atomic_inc(&pf->pri_ref_cnt);
> +	pdev->pri_enabled = 1;
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(pci_enable_pri);
>  
> @@ -256,18 +286,30 @@ EXPORT_SYMBOL_GPL(pci_enable_pri);
>  void pci_disable_pri(struct pci_dev *pdev)
>  {
>  	u16 control;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
> -	if (WARN_ON(!pdev->pri_enabled))
> -		return;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return;
> +	if (WARN_ON(!pdev->pri_enabled) || !pf->pri_cap)
> +		goto pri_unlock;
> +
> +	atomic_dec(&pf->pri_ref_cnt);
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, &control);
> +	/*
> +	 * If pri_ref_cnt is not zero, then don't modify hardware
> +	 * registers.
> +	 */
> +	if (atomic_read(&pf->pri_ref_cnt))
> +		goto done;
> +
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
>  	control &= ~PCI_PRI_CTRL_ENABLE;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>  
> +done:
>  	pdev->pri_enabled = 0;
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
>  }
>  EXPORT_SYMBOL_GPL(pci_disable_pri);
>  
> @@ -277,17 +319,31 @@ EXPORT_SYMBOL_GPL(pci_disable_pri);
>   */
>  void pci_restore_pri_state(struct pci_dev *pdev)
>  {
> -	u16 control = PCI_PRI_CTRL_ENABLE;
> -	u32 reqs = pdev->pri_reqs_alloc;
> +	u16 control;
> +	u32 reqs;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
>  	if (!pdev->pri_enabled)
>  		return;
>  
> -	if (!pdev->pri_cap)
> +	if (!pf->pri_cap)
>  		return;
>  
> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	mutex_lock(&pf->pri_lock);
> +
> +	/* If PRI is already enabled by other VF's or PF, return */
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
> +	if (control & PCI_PRI_CTRL_ENABLE)
> +		goto pri_unlock;
> +
> +	reqs = pf->pri_reqs_alloc;
> +	control = PCI_PRI_CTRL_ENABLE;
> +
> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
> +
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
>  }
>  EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>  
> @@ -300,18 +356,32 @@ EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>   */
>  int pci_reset_pri(struct pci_dev *pdev)
>  {
> +	struct pci_dev *pf = pci_physfn(pdev);
>  	u16 control;
> +	int ret = 0;
>  
> -	if (WARN_ON(pdev->pri_enabled))
> -		return -EBUSY;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return -EINVAL;
> +	if (WARN_ON(pdev->pri_enabled)) {
> +		ret = -EBUSY;
> +		goto done;
> +	}
> +
> +	if (!pf->pri_cap) {
> +		ret = -EINVAL;
> +		goto done;
> +	}
> +
> +	/* If PRI is already enabled by other VF's or PF, return 0 */
> +	if (pf->pri_enabled)
> +		goto done;
>  
>  	control = PCI_PRI_CTRL_RESET;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>  
> -	return 0;
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
> +done:
> +	mutex_unlock(&pf->pri_lock);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(pci_reset_pri);
>  #endif /* CONFIG_PCI_PRI */
> @@ -475,11 +545,18 @@ EXPORT_SYMBOL_GPL(pci_pasid_features);
>  int pci_prg_resp_pasid_required(struct pci_dev *pdev)
>  {
>  	u16 status;
> +	struct pci_dev *pf = pci_physfn(pdev);
> +
> +	mutex_lock(&pf->pri_lock);

I don't believe this lock is necessary.  What is it protecting?

We're only doing a single read of PCI_PRI_STATUS here, and with the
lock, we'll read it either before or after the critical sections in
pci_enable_pri(), pci_disable_pri(), pci_restore_pri_state(), and
pci_reset_pri().

And without the lock, we'll *also* read PCI_PRI_STATUS either before
or after anything in those functions that affects it.

> -	if (!pdev->pri_cap)
> +	if (!pf->pri_cap) {
> +		mutex_unlock(&pf->pri_lock);
>  		return 0;
> +	}
> +
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
> +	mutex_unlock(&pf->pri_lock);
>  
>  	if (status & PCI_PRI_STATUS_PASID)
>  		return 1;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 27224c0db849..3c9c4c82be27 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -455,8 +455,10 @@ struct pci_dev {
>  	atomic_t	ats_ref_cnt;	/* Number of VFs with ATS enabled */
>  #endif
>  #ifdef CONFIG_PCI_PRI
> +	struct mutex	pri_lock;	/* PRI enable lock */
>  	u16		pri_cap;	/* PRI Capability offset */
>  	u32		pri_reqs_alloc; /* Number of PRI requests allocated */
> +	atomic_t	pri_ref_cnt;	/* Number of PF/VF PRI users */
>  #endif
>  #ifdef CONFIG_PCI_PASID
>  	u16		pasid_cap;	/* PASID Capability offset */
> -- 
> 2.21.0
>
Bjorn Helgaas Aug. 15, 2019, 10:20 p.m. UTC | #4
[+cc Joerg, David, iommu list: because IOMMU drivers are the only
callers of pci_enable_pri() and pci_enable_pasid()]

On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
> When IOMMU tries to enable Page Request Interface (PRI) for VF device
> in iommu_enable_dev_iotlb(), it always fails because PRI support for
> PCIe VF device is currently broken. Current implementation expects
> the given PCIe device (PF & VF) to implement PRI capability before
> enabling the PRI support. But this assumption is incorrect. As per PCIe
> spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> PRI of the PF and not implement it. Hence we need to create exception
> for handling the PRI support for PCIe VF device.
> 
> Also, since PRI is a shared resource between PF/VF, following rules
> should apply.
> 
> 1. Use proper locking before accessing/modifying PF resources in VF
>    PRI enable/disable call.
> 2. Use reference count logic to track the usage of PRI resource.
> 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.

Wait, why do we need this at all?  I agree the spec says VFs may not
implement PRI or PASID capabilities and that VFs use the PRI and
PASID of the PF.

But why do we need to support pci_enable_pri() and pci_enable_pasid()
for VFs?  There's nothing interesting we can *do* in the VF, and
passing it off to the PF adds all this locking mess.  For VFs, can we
just make them do nothing or return -EINVAL?  What functionality would
we be missing if we did that?

(Obviously returning -EINVAL would require tweaks in the callers to
either avoid the call for VFs or handle the -EINVAL gracefully.)

> Cc: Ashok Raj <ashok.raj@intel.com>
> Cc: Keith Busch <keith.busch@intel.com>
> Suggested-by: Ashok Raj <ashok.raj@intel.com>
> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> ---
>  drivers/pci/ats.c   | 143 ++++++++++++++++++++++++++++++++++----------
>  include/linux/pci.h |   2 +
>  2 files changed, 112 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
> index 1f4be27a071d..079dc5444444 100644
> --- a/drivers/pci/ats.c
> +++ b/drivers/pci/ats.c
> @@ -189,6 +189,8 @@ void pci_pri_init(struct pci_dev *pdev)
>  	if (pdev->is_virtfn)
>  		return;
>  
> +	mutex_init(&pdev->pri_lock);
> +
>  	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
>  	if (!pos)
>  		return;
> @@ -221,29 +223,57 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
>  {
>  	u16 control, status;
>  	u32 max_requests;
> +	int ret = 0;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
> -	if (WARN_ON(pdev->pri_enabled))
> -		return -EBUSY;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return -EINVAL;
> +	if (WARN_ON(pdev->pri_enabled)) {
> +		ret = -EBUSY;
> +		goto pri_unlock;
> +	}
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
> -	if (!(status & PCI_PRI_STATUS_STOPPED))
> -		return -EBUSY;
> +	if (!pf->pri_cap) {
> +		ret = -EINVAL;
> +		goto pri_unlock;
> +	}
> +
> +	if (pdev->is_virtfn && pf->pri_enabled)
> +		goto update_status;
> +
> +	/*
> +	 * Before updating PRI registers, make sure there is no
> +	 * outstanding PRI requests.
> +	 */
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
> +	if (!(status & PCI_PRI_STATUS_STOPPED)) {
> +		ret = -EBUSY;
> +		goto pri_unlock;
> +	}
>  
> -	pci_read_config_dword(pdev, pdev->pri_cap + PCI_PRI_MAX_REQ,
> -			      &max_requests);
> +	pci_read_config_dword(pf, pf->pri_cap + PCI_PRI_MAX_REQ, &max_requests);
>  	reqs = min(max_requests, reqs);
> -	pdev->pri_reqs_alloc = reqs;
> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> +	pf->pri_reqs_alloc = reqs;
> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>  
>  	control = PCI_PRI_CTRL_ENABLE;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>  
> -	pdev->pri_enabled = 1;
> +	/*
> +	 * If PRI is not already enabled in PF, increment the PF
> +	 * pri_ref_cnt to track the usage of PRI interface.
> +	 */
> +	if (pdev->is_virtfn && !pf->pri_enabled) {
> +		atomic_inc(&pf->pri_ref_cnt);
> +		pf->pri_enabled = 1;
> +	}
>  
> -	return 0;
> +update_status:
> +	atomic_inc(&pf->pri_ref_cnt);
> +	pdev->pri_enabled = 1;
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(pci_enable_pri);
>  
> @@ -256,18 +286,30 @@ EXPORT_SYMBOL_GPL(pci_enable_pri);
>  void pci_disable_pri(struct pci_dev *pdev)
>  {
>  	u16 control;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
> -	if (WARN_ON(!pdev->pri_enabled))
> -		return;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return;
> +	if (WARN_ON(!pdev->pri_enabled) || !pf->pri_cap)
> +		goto pri_unlock;
> +
> +	atomic_dec(&pf->pri_ref_cnt);
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, &control);
> +	/*
> +	 * If pri_ref_cnt is not zero, then don't modify hardware
> +	 * registers.
> +	 */
> +	if (atomic_read(&pf->pri_ref_cnt))
> +		goto done;
> +
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
>  	control &= ~PCI_PRI_CTRL_ENABLE;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>  
> +done:
>  	pdev->pri_enabled = 0;
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
>  }
>  EXPORT_SYMBOL_GPL(pci_disable_pri);
>  
> @@ -277,17 +319,31 @@ EXPORT_SYMBOL_GPL(pci_disable_pri);
>   */
>  void pci_restore_pri_state(struct pci_dev *pdev)
>  {
> -	u16 control = PCI_PRI_CTRL_ENABLE;
> -	u32 reqs = pdev->pri_reqs_alloc;
> +	u16 control;
> +	u32 reqs;
> +	struct pci_dev *pf = pci_physfn(pdev);
>  
>  	if (!pdev->pri_enabled)
>  		return;
>  
> -	if (!pdev->pri_cap)
> +	if (!pf->pri_cap)
>  		return;
>  
> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
> +	mutex_lock(&pf->pri_lock);
> +
> +	/* If PRI is already enabled by other VF's or PF, return */
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
> +	if (control & PCI_PRI_CTRL_ENABLE)
> +		goto pri_unlock;
> +
> +	reqs = pf->pri_reqs_alloc;
> +	control = PCI_PRI_CTRL_ENABLE;
> +
> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
> +
> +pri_unlock:
> +	mutex_unlock(&pf->pri_lock);
>  }
>  EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>  
> @@ -300,18 +356,32 @@ EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>   */
>  int pci_reset_pri(struct pci_dev *pdev)
>  {
> +	struct pci_dev *pf = pci_physfn(pdev);
>  	u16 control;
> +	int ret = 0;
>  
> -	if (WARN_ON(pdev->pri_enabled))
> -		return -EBUSY;
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> -		return -EINVAL;
> +	if (WARN_ON(pdev->pri_enabled)) {
> +		ret = -EBUSY;
> +		goto done;
> +	}
> +
> +	if (!pf->pri_cap) {
> +		ret = -EINVAL;
> +		goto done;
> +	}
> +
> +	/* If PRI is already enabled by other VF's or PF, return 0 */
> +	if (pf->pri_enabled)
> +		goto done;
>  
>  	control = PCI_PRI_CTRL_RESET;
> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>  
> -	return 0;
> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
> +done:
> +	mutex_unlock(&pf->pri_lock);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(pci_reset_pri);
>  #endif /* CONFIG_PCI_PRI */
> @@ -475,11 +545,18 @@ EXPORT_SYMBOL_GPL(pci_pasid_features);
>  int pci_prg_resp_pasid_required(struct pci_dev *pdev)
>  {
>  	u16 status;
> +	struct pci_dev *pf = pci_physfn(pdev);
> +
> +	mutex_lock(&pf->pri_lock);
>  
> -	if (!pdev->pri_cap)
> +	if (!pf->pri_cap) {
> +		mutex_unlock(&pf->pri_lock);
>  		return 0;
> +	}
> +
> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
>  
> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
> +	mutex_unlock(&pf->pri_lock);
>  
>  	if (status & PCI_PRI_STATUS_PASID)
>  		return 1;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 27224c0db849..3c9c4c82be27 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -455,8 +455,10 @@ struct pci_dev {
>  	atomic_t	ats_ref_cnt;	/* Number of VFs with ATS enabled */
>  #endif
>  #ifdef CONFIG_PCI_PRI
> +	struct mutex	pri_lock;	/* PRI enable lock */
>  	u16		pri_cap;	/* PRI Capability offset */
>  	u32		pri_reqs_alloc; /* Number of PRI requests allocated */
> +	atomic_t	pri_ref_cnt;	/* Number of PF/VF PRI users */
>  #endif
>  #ifdef CONFIG_PCI_PASID
>  	u16		pasid_cap;	/* PASID Capability offset */
> -- 
> 2.21.0
>
Kuppuswamy Sathyanarayanan Aug. 15, 2019, 10:39 p.m. UTC | #5
On 8/15/19 3:20 PM, Bjorn Helgaas wrote:
> [+cc Joerg, David, iommu list: because IOMMU drivers are the only
> callers of pci_enable_pri() and pci_enable_pasid()]
>
> On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
>> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>
>> When IOMMU tries to enable Page Request Interface (PRI) for VF device
>> in iommu_enable_dev_iotlb(), it always fails because PRI support for
>> PCIe VF device is currently broken. Current implementation expects
>> the given PCIe device (PF & VF) to implement PRI capability before
>> enabling the PRI support. But this assumption is incorrect. As per PCIe
>> spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
>> PRI of the PF and not implement it. Hence we need to create exception
>> for handling the PRI support for PCIe VF device.
>>
>> Also, since PRI is a shared resource between PF/VF, following rules
>> should apply.
>>
>> 1. Use proper locking before accessing/modifying PF resources in VF
>>     PRI enable/disable call.
>> 2. Use reference count logic to track the usage of PRI resource.
>> 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
> Wait, why do we need this at all?  I agree the spec says VFs may not
> implement PRI or PASID capabilities and that VFs use the PRI and
> PASID of the PF.
>
> But why do we need to support pci_enable_pri() and pci_enable_pasid()
> for VFs?  There's nothing interesting we can *do* in the VF, and
> passing it off to the PF adds all this locking mess.  For VFs, can we
> just make them do nothing or return -EINVAL?  What functionality would
> we be missing if we did that?

Currently PRI/PASID capabilities are not enabled by default. IOMMU can
enable PRI/PASID for VF first (and not enable it for PF). In this case,
doing nothing for VF device will break the functionality.

Also the PRI/PASID config options like "PRI Outstanding Page Request 
Allocation"
or "PASID Execute Permission" or "PASID Privileged Mode" are currently 
configured
as per device feature. And hence there is a chance for VF/PF to use 
different
values for these options.

> (Obviously returning -EINVAL would require tweaks in the callers to
> either avoid the call for VFs or handle the -EINVAL gracefully.)
>
>> Cc: Ashok Raj <ashok.raj@intel.com>
>> Cc: Keith Busch <keith.busch@intel.com>
>> Suggested-by: Ashok Raj <ashok.raj@intel.com>
>> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>> ---
>>   drivers/pci/ats.c   | 143 ++++++++++++++++++++++++++++++++++----------
>>   include/linux/pci.h |   2 +
>>   2 files changed, 112 insertions(+), 33 deletions(-)
>>
>> diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
>> index 1f4be27a071d..079dc5444444 100644
>> --- a/drivers/pci/ats.c
>> +++ b/drivers/pci/ats.c
>> @@ -189,6 +189,8 @@ void pci_pri_init(struct pci_dev *pdev)
>>   	if (pdev->is_virtfn)
>>   		return;
>>   
>> +	mutex_init(&pdev->pri_lock);
>> +
>>   	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
>>   	if (!pos)
>>   		return;
>> @@ -221,29 +223,57 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
>>   {
>>   	u16 control, status;
>>   	u32 max_requests;
>> +	int ret = 0;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   
>> -	if (WARN_ON(pdev->pri_enabled))
>> -		return -EBUSY;
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> -		return -EINVAL;
>> +	if (WARN_ON(pdev->pri_enabled)) {
>> +		ret = -EBUSY;
>> +		goto pri_unlock;
>> +	}
>>   
>> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
>> -	if (!(status & PCI_PRI_STATUS_STOPPED))
>> -		return -EBUSY;
>> +	if (!pf->pri_cap) {
>> +		ret = -EINVAL;
>> +		goto pri_unlock;
>> +	}
>> +
>> +	if (pdev->is_virtfn && pf->pri_enabled)
>> +		goto update_status;
>> +
>> +	/*
>> +	 * Before updating PRI registers, make sure there is no
>> +	 * outstanding PRI requests.
>> +	 */
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
>> +	if (!(status & PCI_PRI_STATUS_STOPPED)) {
>> +		ret = -EBUSY;
>> +		goto pri_unlock;
>> +	}
>>   
>> -	pci_read_config_dword(pdev, pdev->pri_cap + PCI_PRI_MAX_REQ,
>> -			      &max_requests);
>> +	pci_read_config_dword(pf, pf->pri_cap + PCI_PRI_MAX_REQ, &max_requests);
>>   	reqs = min(max_requests, reqs);
>> -	pdev->pri_reqs_alloc = reqs;
>> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>> +	pf->pri_reqs_alloc = reqs;
>> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>>   
>>   	control = PCI_PRI_CTRL_ENABLE;
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>>   
>> -	pdev->pri_enabled = 1;
>> +	/*
>> +	 * If PRI is not already enabled in PF, increment the PF
>> +	 * pri_ref_cnt to track the usage of PRI interface.
>> +	 */
>> +	if (pdev->is_virtfn && !pf->pri_enabled) {
>> +		atomic_inc(&pf->pri_ref_cnt);
>> +		pf->pri_enabled = 1;
>> +	}
>>   
>> -	return 0;
>> +update_status:
>> +	atomic_inc(&pf->pri_ref_cnt);
>> +	pdev->pri_enabled = 1;
>> +pri_unlock:
>> +	mutex_unlock(&pf->pri_lock);
>> +	return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(pci_enable_pri);
>>   
>> @@ -256,18 +286,30 @@ EXPORT_SYMBOL_GPL(pci_enable_pri);
>>   void pci_disable_pri(struct pci_dev *pdev)
>>   {
>>   	u16 control;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   
>> -	if (WARN_ON(!pdev->pri_enabled))
>> -		return;
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> -		return;
>> +	if (WARN_ON(!pdev->pri_enabled) || !pf->pri_cap)
>> +		goto pri_unlock;
>> +
>> +	atomic_dec(&pf->pri_ref_cnt);
>>   
>> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, &control);
>> +	/*
>> +	 * If pri_ref_cnt is not zero, then don't modify hardware
>> +	 * registers.
>> +	 */
>> +	if (atomic_read(&pf->pri_ref_cnt))
>> +		goto done;
>> +
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
>>   	control &= ~PCI_PRI_CTRL_ENABLE;
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>>   
>> +done:
>>   	pdev->pri_enabled = 0;
>> +pri_unlock:
>> +	mutex_unlock(&pf->pri_lock);
>>   }
>>   EXPORT_SYMBOL_GPL(pci_disable_pri);
>>   
>> @@ -277,17 +319,31 @@ EXPORT_SYMBOL_GPL(pci_disable_pri);
>>    */
>>   void pci_restore_pri_state(struct pci_dev *pdev)
>>   {
>> -	u16 control = PCI_PRI_CTRL_ENABLE;
>> -	u32 reqs = pdev->pri_reqs_alloc;
>> +	u16 control;
>> +	u32 reqs;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   
>>   	if (!pdev->pri_enabled)
>>   		return;
>>   
>> -	if (!pdev->pri_cap)
>> +	if (!pf->pri_cap)
>>   		return;
>>   
>> -	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>> +	mutex_lock(&pf->pri_lock);
>> +
>> +	/* If PRI is already enabled by other VF's or PF, return */
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
>> +	if (control & PCI_PRI_CTRL_ENABLE)
>> +		goto pri_unlock;
>> +
>> +	reqs = pf->pri_reqs_alloc;
>> +	control = PCI_PRI_CTRL_ENABLE;
>> +
>> +	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>> +
>> +pri_unlock:
>> +	mutex_unlock(&pf->pri_lock);
>>   }
>>   EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>>   
>> @@ -300,18 +356,32 @@ EXPORT_SYMBOL_GPL(pci_restore_pri_state);
>>    */
>>   int pci_reset_pri(struct pci_dev *pdev)
>>   {
>> +	struct pci_dev *pf = pci_physfn(pdev);
>>   	u16 control;
>> +	int ret = 0;
>>   
>> -	if (WARN_ON(pdev->pri_enabled))
>> -		return -EBUSY;
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> -		return -EINVAL;
>> +	if (WARN_ON(pdev->pri_enabled)) {
>> +		ret = -EBUSY;
>> +		goto done;
>> +	}
>> +
>> +	if (!pf->pri_cap) {
>> +		ret = -EINVAL;
>> +		goto done;
>> +	}
>> +
>> +	/* If PRI is already enabled by other VF's or PF, return 0 */
>> +	if (pf->pri_enabled)
>> +		goto done;
>>   
>>   	control = PCI_PRI_CTRL_RESET;
>> -	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
>>   
>> -	return 0;
>> +	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
>> +done:
>> +	mutex_unlock(&pf->pri_lock);
>> +	return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(pci_reset_pri);
>>   #endif /* CONFIG_PCI_PRI */
>> @@ -475,11 +545,18 @@ EXPORT_SYMBOL_GPL(pci_pasid_features);
>>   int pci_prg_resp_pasid_required(struct pci_dev *pdev)
>>   {
>>   	u16 status;
>> +	struct pci_dev *pf = pci_physfn(pdev);
>> +
>> +	mutex_lock(&pf->pri_lock);
>>   
>> -	if (!pdev->pri_cap)
>> +	if (!pf->pri_cap) {
>> +		mutex_unlock(&pf->pri_lock);
>>   		return 0;
>> +	}
>> +
>> +	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
>>   
>> -	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
>> +	mutex_unlock(&pf->pri_lock);
>>   
>>   	if (status & PCI_PRI_STATUS_PASID)
>>   		return 1;
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 27224c0db849..3c9c4c82be27 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -455,8 +455,10 @@ struct pci_dev {
>>   	atomic_t	ats_ref_cnt;	/* Number of VFs with ATS enabled */
>>   #endif
>>   #ifdef CONFIG_PCI_PRI
>> +	struct mutex	pri_lock;	/* PRI enable lock */
>>   	u16		pri_cap;	/* PRI Capability offset */
>>   	u32		pri_reqs_alloc; /* Number of PRI requests allocated */
>> +	atomic_t	pri_ref_cnt;	/* Number of PF/VF PRI users */
>>   #endif
>>   #ifdef CONFIG_PCI_PASID
>>   	u16		pasid_cap;	/* PASID Capability offset */
>> -- 
>> 2.21.0
>>
Bjorn Helgaas Aug. 19, 2019, 2:15 p.m. UTC | #6
On Thu, Aug 15, 2019 at 03:39:03PM -0700, Kuppuswamy Sathyanarayanan wrote:
> On 8/15/19 3:20 PM, Bjorn Helgaas wrote:
> > [+cc Joerg, David, iommu list: because IOMMU drivers are the only
> > callers of pci_enable_pri() and pci_enable_pasid()]
> > 
> > On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> > > From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > > 
> > > When IOMMU tries to enable Page Request Interface (PRI) for VF device
> > > in iommu_enable_dev_iotlb(), it always fails because PRI support for
> > > PCIe VF device is currently broken. Current implementation expects
> > > the given PCIe device (PF & VF) to implement PRI capability before
> > > enabling the PRI support. But this assumption is incorrect. As per PCIe
> > > spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> > > PRI of the PF and not implement it. Hence we need to create exception
> > > for handling the PRI support for PCIe VF device.
> > > 
> > > Also, since PRI is a shared resource between PF/VF, following rules
> > > should apply.
> > > 
> > > 1. Use proper locking before accessing/modifying PF resources in VF
> > >     PRI enable/disable call.
> > > 2. Use reference count logic to track the usage of PRI resource.
> > > 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.

> > Wait, why do we need this at all?  I agree the spec says VFs may not
> > implement PRI or PASID capabilities and that VFs use the PRI and
> > PASID of the PF.
> > 
> > But why do we need to support pci_enable_pri() and pci_enable_pasid()
> > for VFs?  There's nothing interesting we can *do* in the VF, and
> > passing it off to the PF adds all this locking mess.  For VFs, can we
> > just make them do nothing or return -EINVAL?  What functionality would
> > we be missing if we did that?
> 
> Currently PRI/PASID capabilities are not enabled by default. IOMMU can
> enable PRI/PASID for VF first (and not enable it for PF). In this case,
> doing nothing for VF device will break the functionality.

What is the path where we can enable PRI/PASID for VF but not for the
PF?  The call chains leading to pci_enable_pri() go through the
iommu_ops.add_device interface, which makes me think this is part of
the device enumeration done by the PCI core, and in that case I would
think this it should be done for the PF before VFs.  But maybe this
path isn't exercised until a driver does a DMA map or something
similar?

Bjorn
Kuppuswamy Sathyanarayanan Aug. 19, 2019, 10:53 p.m. UTC | #7
On Mon, Aug 19, 2019 at 09:15:00AM -0500, Bjorn Helgaas wrote:
> On Thu, Aug 15, 2019 at 03:39:03PM -0700, Kuppuswamy Sathyanarayanan wrote:
> > On 8/15/19 3:20 PM, Bjorn Helgaas wrote:
> > > [+cc Joerg, David, iommu list: because IOMMU drivers are the only
> > > callers of pci_enable_pri() and pci_enable_pasid()]
> > > 
> > > On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> > > > From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > 
> > > > When IOMMU tries to enable Page Request Interface (PRI) for VF device
> > > > in iommu_enable_dev_iotlb(), it always fails because PRI support for
> > > > PCIe VF device is currently broken. Current implementation expects
> > > > the given PCIe device (PF & VF) to implement PRI capability before
> > > > enabling the PRI support. But this assumption is incorrect. As per PCIe
> > > > spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> > > > PRI of the PF and not implement it. Hence we need to create exception
> > > > for handling the PRI support for PCIe VF device.
> > > > 
> > > > Also, since PRI is a shared resource between PF/VF, following rules
> > > > should apply.
> > > > 
> > > > 1. Use proper locking before accessing/modifying PF resources in VF
> > > >     PRI enable/disable call.
> > > > 2. Use reference count logic to track the usage of PRI resource.
> > > > 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
> 
> > > Wait, why do we need this at all?  I agree the spec says VFs may not
> > > implement PRI or PASID capabilities and that VFs use the PRI and
> > > PASID of the PF.
> > > 
> > > But why do we need to support pci_enable_pri() and pci_enable_pasid()
> > > for VFs?  There's nothing interesting we can *do* in the VF, and
> > > passing it off to the PF adds all this locking mess.  For VFs, can we
> > > just make them do nothing or return -EINVAL?  What functionality would
> > > we be missing if we did that?
> > 
> > Currently PRI/PASID capabilities are not enabled by default. IOMMU can
> > enable PRI/PASID for VF first (and not enable it for PF). In this case,
> > doing nothing for VF device will break the functionality.
> 
> What is the path where we can enable PRI/PASID for VF but not for the
> PF?  The call chains leading to pci_enable_pri() go through the
> iommu_ops.add_device interface, which makes me think this is part of
> the device enumeration done by the PCI core, and in that case I would
> think this it should be done for the PF before VFs.  But maybe this
> path isn't exercised until a driver does a DMA map or something
> similar?
AFAIK, this path will only get exercised when the device does DMA and
hence there is no specific order in which PRI/PASID is enabled in PF/VF.
In fact, my v2 version of this patch set had a check to ensure PF
PRI/PASID enable is happened before VF attempts PRI/PASID
enable/disable. But I had to remove it in later version of this series
due to failure case reported by one the tester of this code. 
> 
> Bjorn
Bjorn Helgaas Aug. 19, 2019, 11:19 p.m. UTC | #8
On Mon, Aug 19, 2019 at 03:53:31PM -0700, Kuppuswamy Sathyanarayanan wrote:
> On Mon, Aug 19, 2019 at 09:15:00AM -0500, Bjorn Helgaas wrote:
> > On Thu, Aug 15, 2019 at 03:39:03PM -0700, Kuppuswamy Sathyanarayanan wrote:
> > > On 8/15/19 3:20 PM, Bjorn Helgaas wrote:
> > > > [+cc Joerg, David, iommu list: because IOMMU drivers are the only
> > > > callers of pci_enable_pri() and pci_enable_pasid()]
> > > > 
> > > > On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> > > > > From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > > 
> > > > > When IOMMU tries to enable Page Request Interface (PRI) for VF device
> > > > > in iommu_enable_dev_iotlb(), it always fails because PRI support for
> > > > > PCIe VF device is currently broken. Current implementation expects
> > > > > the given PCIe device (PF & VF) to implement PRI capability before
> > > > > enabling the PRI support. But this assumption is incorrect. As per PCIe
> > > > > spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> > > > > PRI of the PF and not implement it. Hence we need to create exception
> > > > > for handling the PRI support for PCIe VF device.
> > > > > 
> > > > > Also, since PRI is a shared resource between PF/VF, following rules
> > > > > should apply.
> > > > > 
> > > > > 1. Use proper locking before accessing/modifying PF resources in VF
> > > > >     PRI enable/disable call.
> > > > > 2. Use reference count logic to track the usage of PRI resource.
> > > > > 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
> > 
> > > > Wait, why do we need this at all?  I agree the spec says VFs may not
> > > > implement PRI or PASID capabilities and that VFs use the PRI and
> > > > PASID of the PF.
> > > > 
> > > > But why do we need to support pci_enable_pri() and pci_enable_pasid()
> > > > for VFs?  There's nothing interesting we can *do* in the VF, and
> > > > passing it off to the PF adds all this locking mess.  For VFs, can we
> > > > just make them do nothing or return -EINVAL?  What functionality would
> > > > we be missing if we did that?
> > > 
> > > Currently PRI/PASID capabilities are not enabled by default. IOMMU can
> > > enable PRI/PASID for VF first (and not enable it for PF). In this case,
> > > doing nothing for VF device will break the functionality.
> > 
> > What is the path where we can enable PRI/PASID for VF but not for the
> > PF?  The call chains leading to pci_enable_pri() go through the
> > iommu_ops.add_device interface, which makes me think this is part of
> > the device enumeration done by the PCI core, and in that case I would
> > think this it should be done for the PF before VFs.  But maybe this
> > path isn't exercised until a driver does a DMA map or something
> > similar?

> AFAIK, this path will only get exercised when the device does DMA and
> hence there is no specific order in which PRI/PASID is enabled in PF/VF.
> In fact, my v2 version of this patch set had a check to ensure PF
> PRI/PASID enable is happened before VF attempts PRI/PASID
> enable/disable. But I had to remove it in later version of this series
> due to failure case reported by one the tester of this code. 

What's the path?  And does that path make sense?

I got this far before giving up:

    iommu_go_to_state                           # AMD
      state_next
        amd_iommu_init_pci
          amd_iommu_init_api
            bus_set_iommu
              iommu_bus_init
                bus_for_each_dev(..., add_iommu_group)
                  add_iommu_group
                    iommu_probe_device
                      amd_iommu_add_device                      # amd_iommu_ops.add_device
                        init_iommu_group
                          iommu_group_get_for_dev
                            iommu_group_add_device
                              __iommu_attach_device
                                amd_iommu_attach_device         # amd_iommu_ops.attach_dev
                                  attach_device                 # amd_iommu
                                    pdev_iommuv2_enable
                                      pci_enable_pri


    iommu_probe_device
      intel_iommu_add_device                    # intel_iommu_ops.add_device
        domain_add_dev_info
          dmar_insert_one_dev_info
            domain_context_mapping
              domain_context_mapping_one
                iommu_enable_dev_iotlb
                  pci_enable_pri


These *look* like enumeration paths, not DMA setup paths.  But I could
be wrong, since I gave up before getting to the source.

I don't want to add all this complexity because we *think* we need it.
I want to think about whether it makes *sense*.  Maybe it's sensible
for the PF enumeration or a PF driver to enable the hardware it owns.

If we leave it to the VFs, then we have issues with coordinating
between VFs that want different settings, etc.

If we understand the whole picture and it needs to be in the VFs,
that's fine.  But I don't think we understand the whole picture yet.

Bjorn
Kuppuswamy Sathyanarayanan Aug. 28, 2019, 6:21 p.m. UTC | #9
On Mon, Aug 19, 2019 at 06:19:25PM -0500, Bjorn Helgaas wrote:
> On Mon, Aug 19, 2019 at 03:53:31PM -0700, Kuppuswamy Sathyanarayanan wrote:
> > On Mon, Aug 19, 2019 at 09:15:00AM -0500, Bjorn Helgaas wrote:
> > > On Thu, Aug 15, 2019 at 03:39:03PM -0700, Kuppuswamy Sathyanarayanan wrote:
> > > > On 8/15/19 3:20 PM, Bjorn Helgaas wrote:
> > > > > [+cc Joerg, David, iommu list: because IOMMU drivers are the only
> > > > > callers of pci_enable_pri() and pci_enable_pasid()]
> > > > > 
> > > > > On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> > > > > > From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > > > 
> > > > > > When IOMMU tries to enable Page Request Interface (PRI) for VF device
> > > > > > in iommu_enable_dev_iotlb(), it always fails because PRI support for
> > > > > > PCIe VF device is currently broken. Current implementation expects
> > > > > > the given PCIe device (PF & VF) to implement PRI capability before
> > > > > > enabling the PRI support. But this assumption is incorrect. As per PCIe
> > > > > > spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> > > > > > PRI of the PF and not implement it. Hence we need to create exception
> > > > > > for handling the PRI support for PCIe VF device.
> > > > > > 
> > > > > > Also, since PRI is a shared resource between PF/VF, following rules
> > > > > > should apply.
> > > > > > 
> > > > > > 1. Use proper locking before accessing/modifying PF resources in VF
> > > > > >     PRI enable/disable call.
> > > > > > 2. Use reference count logic to track the usage of PRI resource.
> > > > > > 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
> > > 
> > > > > Wait, why do we need this at all?  I agree the spec says VFs may not
> > > > > implement PRI or PASID capabilities and that VFs use the PRI and
> > > > > PASID of the PF.
> > > > > 
> > > > > But why do we need to support pci_enable_pri() and pci_enable_pasid()
> > > > > for VFs?  There's nothing interesting we can *do* in the VF, and
> > > > > passing it off to the PF adds all this locking mess.  For VFs, can we
> > > > > just make them do nothing or return -EINVAL?  What functionality would
> > > > > we be missing if we did that?
> > > > 
> > > > Currently PRI/PASID capabilities are not enabled by default. IOMMU can
> > > > enable PRI/PASID for VF first (and not enable it for PF). In this case,
> > > > doing nothing for VF device will break the functionality.
> > > 
> > > What is the path where we can enable PRI/PASID for VF but not for the
> > > PF?  The call chains leading to pci_enable_pri() go through the
> > > iommu_ops.add_device interface, which makes me think this is part of
> > > the device enumeration done by the PCI core, and in that case I would
> > > think this it should be done for the PF before VFs.  But maybe this
> > > path isn't exercised until a driver does a DMA map or something
> > > similar?
> 
> > AFAIK, this path will only get exercised when the device does DMA and
> > hence there is no specific order in which PRI/PASID is enabled in PF/VF.
> > In fact, my v2 version of this patch set had a check to ensure PF
> > PRI/PASID enable is happened before VF attempts PRI/PASID
> > enable/disable. But I had to remove it in later version of this series
> > due to failure case reported by one the tester of this code. 
> 
> What's the path?  And does that path make sense?
> 
> I got this far before giving up:
> 
>     iommu_go_to_state                           # AMD
>       state_next
>         amd_iommu_init_pci
>           amd_iommu_init_api
>             bus_set_iommu
>               iommu_bus_init
>                 bus_for_each_dev(..., add_iommu_group)
>                   add_iommu_group
>                     iommu_probe_device
>                       amd_iommu_add_device                      # amd_iommu_ops.add_device
>                         init_iommu_group
>                           iommu_group_get_for_dev
>                             iommu_group_add_device
>                               __iommu_attach_device
>                                 amd_iommu_attach_device         # amd_iommu_ops.attach_dev
>                                   attach_device                 # amd_iommu
>                                     pdev_iommuv2_enable
>                                       pci_enable_pri
> 
> 
>     iommu_probe_device
>       intel_iommu_add_device                    # intel_iommu_ops.add_device
>         domain_add_dev_info
>           dmar_insert_one_dev_info
>             domain_context_mapping
>               domain_context_mapping_one
>                 iommu_enable_dev_iotlb
>                   pci_enable_pri
> 
> 
> These *look* like enumeration paths, not DMA setup paths.  But I could
> be wrong, since I gave up before getting to the source.
> 
> I don't want to add all this complexity because we *think* we need it.
> I want to think about whether it makes *sense*.  Maybe it's sensible
> for the PF enumeration or a PF driver to enable the hardware it owns.
> 
> If we leave it to the VFs, then we have issues with coordinating
> between VFs that want different settings, etc.
> 
> If we understand the whole picture and it needs to be in the VFs,
> that's fine.  But I don't think we understand the whole picture yet.

After re-analyzing the code paths, I also could not find the use case
where PF/VF PRI/PASID is enabled in out of order(VF first and then PF).
Also, I had no luck in finding that old bug report email which triggered
me to come up with this complicated fix. As per my current analysis, as
you have mentioned, PF/VF PRI/PASID enable seems to happen only during
device creation time.

Following are some of the possible code paths:

VF PRI/PASID enable path is,

[ 8367.161880]  iommu_enable_dev_iotlb+0x83/0x180
[ 8367.168061]  domain_context_mapping_one+0x44f/0x500
[ 8367.174264]  ? domain_context_mapping_one+0x500/0x500
[ 8367.180429]  pci_for_each_dma_alias+0x30/0x170
[ 8367.186368]  dmar_insert_one_dev_info+0x43f/0x4d0
[ 8367.192288]  domain_add_dev_info+0x50/0x90
[ 8367.197973]  intel_iommu_attach_device+0x9c/0x130
[ 8367.203726]  __iommu_attach_device+0x47/0xb0
[ 8367.209292]  ? _cond_resched+0x15/0x40
[ 8367.214643]  iommu_group_add_device+0x13a/0x2c0
[ 8367.220102]  iommu_group_get_for_dev+0xa8/0x220
[ 8367.225460]  intel_iommu_add_device+0x61/0x590
[ 8367.230708]  iommu_bus_notifier+0xb1/0xe0
[ 8367.235768]  notifier_call_chain+0x47/0x70
[ 8367.240757]  blocking_notifier_call_chain+0x3e/0x60
[ 8367.245854]  device_add+0x3ec/0x690
[ 8367.250533]  pci_device_add+0x26b/0x660
[ 8367.255207]  pci_iov_add_virtfn+0x1ce/0x3b0
[ 8367.259873]  sriov_enable+0x254/0x410
[ 8367.264323]  dev_fops_ioctl+0x1378/0x1520 [sad8]
[ 8367.322115]  init_fops_ioctl+0x12c/0x150 [sad8]
[ 8367.324921]  do_vfs_ioctl+0xa4/0x630
[ 8367.327415]  ksys_ioctl+0x70/0x80
[ 8367.329822]  __x64_sys_ioctl+0x16/0x20
[ 8367.332310]  do_syscall_64+0x5b/0x1a0
[ 8367.334771]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

PF PRI/PASID enable path is,

[   11.084005] Call Trace:
[   11.084005]  dump_stack+0x5c/0x7b
[   11.084005]  iommu_enable_dev_iotlb+0x83/0x180
[   11.084005]  domain_context_mapping_one+0x44f/0x500
[   11.084005]  ? domain_context_mapping_one+0x500/0x500
[   11.084005]  pci_for_each_dma_alias+0x30/0x170
[   11.084005]  dmar_insert_one_dev_info+0x43f/0x4d0
[   11.084005]  domain_add_dev_info+0x50/0x90
[   11.084005]  intel_iommu_attach_device+0x9c/0x130
[   11.084005]  __iommu_attach_device+0x47/0xb0
[   11.084005]  ? _cond_resched+0x15/0x40
[   11.084005]  iommu_group_add_device+0x13a/0x2c0
[   11.084005]  iommu_group_get_for_dev+0xa8/0x220
[   11.084005]  intel_iommu_add_device+0x61/0x590
[   11.084005]  ? iommu_probe_device+0x40/0x40
[   11.084005]  add_iommu_group+0xa/0x20
[   11.084005]  bus_for_each_dev+0x76/0xc0
[   11.084005]  bus_set_iommu+0x85/0xc0
[   11.084005]  intel_iommu_init+0xfe5/0x11c1
[   11.084005]  ? __fput+0x134/0x220
[   11.084005]  ? set_debug_rodata+0x11/0x11
[   11.084005]  ? e820__memblock_setup+0x60/0x60
[   11.084005]  ? pci_iommu_init+0x16/0x3f
[   11.084005]  pci_iommu_init+0x16/0x3f
[   11.084005]  do_one_initcall+0x46/0x1f4
[   11.084005]  kernel_init_freeable+0x1ba/0x283
[   11.084005]  ? rest_init+0xb0/0xb0
[   11.084005]  kernel_init+0xa/0x120
[   11.084005]  ret_from_fork+0x1f/0x40

Similarly PF/VF PRI/PASID possible disable paths are,

iommu_hotplug_path->disable_dmar_iommu->__dmar_remove_one_dev_info->iommu_disable_dev_iotlb

domain_exit()->domain_remove_dev_info->iommu_disable_dev_iotlb

vfio_iommu_type1_detach_group()->iommu_detach_group()->intel_iommu_detach_device->dmar_remove_one_dev_info

But even in all of these paths, PF/VF PRI/PASID disable have to happen
in order (VF first and then PF).

So we can implement the logic of not doing anything for VF when its
related PRI/PASID calls. But my questions is, is it safe to go with
these assumptions? Since all these dependencies we have found are not
explicitly defined, if some one breaks it will also affect PRI/PASID
logic. Let me know your comments.



> 
> Bjorn
Bjorn Helgaas Aug. 28, 2019, 6:57 p.m. UTC | #10
On Wed, Aug 28, 2019 at 11:21:53AM -0700, Kuppuswamy Sathyanarayanan wrote:
> On Mon, Aug 19, 2019 at 06:19:25PM -0500, Bjorn Helgaas wrote:
> > On Mon, Aug 19, 2019 at 03:53:31PM -0700, Kuppuswamy Sathyanarayanan wrote:
> > > On Mon, Aug 19, 2019 at 09:15:00AM -0500, Bjorn Helgaas wrote:
> > > > On Thu, Aug 15, 2019 at 03:39:03PM -0700, Kuppuswamy Sathyanarayanan wrote:
> > > > > On 8/15/19 3:20 PM, Bjorn Helgaas wrote:
> > > > > > [+cc Joerg, David, iommu list: because IOMMU drivers are the only
> > > > > > callers of pci_enable_pri() and pci_enable_pasid()]
> > > > > > 
> > > > > > On Thu, Aug 01, 2019 at 05:06:01PM -0700, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> > > > > > > From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > > > > 
> > > > > > > When IOMMU tries to enable Page Request Interface (PRI) for VF device
> > > > > > > in iommu_enable_dev_iotlb(), it always fails because PRI support for
> > > > > > > PCIe VF device is currently broken. Current implementation expects
> > > > > > > the given PCIe device (PF & VF) to implement PRI capability before
> > > > > > > enabling the PRI support. But this assumption is incorrect. As per PCIe
> > > > > > > spec r4.0, sec 9.3.7.11, all VFs associated with PF can only use the
> > > > > > > PRI of the PF and not implement it. Hence we need to create exception
> > > > > > > for handling the PRI support for PCIe VF device.
> > > > > > > 
> > > > > > > Also, since PRI is a shared resource between PF/VF, following rules
> > > > > > > should apply.
> > > > > > > 
> > > > > > > 1. Use proper locking before accessing/modifying PF resources in VF
> > > > > > >     PRI enable/disable call.
> > > > > > > 2. Use reference count logic to track the usage of PRI resource.
> > > > > > > 3. Disable PRI only if the PRI reference count (pri_ref_cnt) is zero.
> > > > 
> > > > > > Wait, why do we need this at all?  I agree the spec says VFs may not
> > > > > > implement PRI or PASID capabilities and that VFs use the PRI and
> > > > > > PASID of the PF.
> > > > > > 
> > > > > > But why do we need to support pci_enable_pri() and pci_enable_pasid()
> > > > > > for VFs?  There's nothing interesting we can *do* in the VF, and
> > > > > > passing it off to the PF adds all this locking mess.  For VFs, can we
> > > > > > just make them do nothing or return -EINVAL?  What functionality would
> > > > > > we be missing if we did that?
> > > > > 
> > > > > Currently PRI/PASID capabilities are not enabled by default. IOMMU can
> > > > > enable PRI/PASID for VF first (and not enable it for PF). In this case,
> > > > > doing nothing for VF device will break the functionality.
> > > > 
> > > > What is the path where we can enable PRI/PASID for VF but not for the
> > > > PF?  The call chains leading to pci_enable_pri() go through the
> > > > iommu_ops.add_device interface, which makes me think this is part of
> > > > the device enumeration done by the PCI core, and in that case I would
> > > > think this it should be done for the PF before VFs.  But maybe this
> > > > path isn't exercised until a driver does a DMA map or something
> > > > similar?
> > 
> > > AFAIK, this path will only get exercised when the device does DMA and
> > > hence there is no specific order in which PRI/PASID is enabled in PF/VF.
> > > In fact, my v2 version of this patch set had a check to ensure PF
> > > PRI/PASID enable is happened before VF attempts PRI/PASID
> > > enable/disable. But I had to remove it in later version of this series
> > > due to failure case reported by one the tester of this code. 
> > 
> > What's the path?  And does that path make sense?
> > 
> > I got this far before giving up:
> > 
> >     iommu_go_to_state                           # AMD
> >       state_next
> >         amd_iommu_init_pci
> >           amd_iommu_init_api
> >             bus_set_iommu
> >               iommu_bus_init
> >                 bus_for_each_dev(..., add_iommu_group)
> >                   add_iommu_group
> >                     iommu_probe_device
> >                       amd_iommu_add_device                      # amd_iommu_ops.add_device
> >                         init_iommu_group
> >                           iommu_group_get_for_dev
> >                             iommu_group_add_device
> >                               __iommu_attach_device
> >                                 amd_iommu_attach_device         # amd_iommu_ops.attach_dev
> >                                   attach_device                 # amd_iommu
> >                                     pdev_iommuv2_enable
> >                                       pci_enable_pri
> > 
> > 
> >     iommu_probe_device
> >       intel_iommu_add_device                    # intel_iommu_ops.add_device
> >         domain_add_dev_info
> >           dmar_insert_one_dev_info
> >             domain_context_mapping
> >               domain_context_mapping_one
> >                 iommu_enable_dev_iotlb
> >                   pci_enable_pri
> > 
> > 
> > These *look* like enumeration paths, not DMA setup paths.  But I could
> > be wrong, since I gave up before getting to the source.
> > 
> > I don't want to add all this complexity because we *think* we need it.
> > I want to think about whether it makes *sense*.  Maybe it's sensible
> > for the PF enumeration or a PF driver to enable the hardware it owns.
> > 
> > If we leave it to the VFs, then we have issues with coordinating
> > between VFs that want different settings, etc.
> > 
> > If we understand the whole picture and it needs to be in the VFs,
> > that's fine.  But I don't think we understand the whole picture yet.
> 
> After re-analyzing the code paths, I also could not find the use case
> where PF/VF PRI/PASID is enabled in out of order(VF first and then PF).
> Also, I had no luck in finding that old bug report email which triggered
> me to come up with this complicated fix. As per my current analysis, as
> you have mentioned, PF/VF PRI/PASID enable seems to happen only during
> device creation time.
> 
> Following are some of the possible code paths:
> 
> VF PRI/PASID enable path is,
> 
> [ 8367.161880]  iommu_enable_dev_iotlb+0x83/0x180
> [ 8367.168061]  domain_context_mapping_one+0x44f/0x500
> [ 8367.174264]  ? domain_context_mapping_one+0x500/0x500
> [ 8367.180429]  pci_for_each_dma_alias+0x30/0x170
> [ 8367.186368]  dmar_insert_one_dev_info+0x43f/0x4d0
> [ 8367.192288]  domain_add_dev_info+0x50/0x90
> [ 8367.197973]  intel_iommu_attach_device+0x9c/0x130
> [ 8367.203726]  __iommu_attach_device+0x47/0xb0
> [ 8367.209292]  ? _cond_resched+0x15/0x40
> [ 8367.214643]  iommu_group_add_device+0x13a/0x2c0
> [ 8367.220102]  iommu_group_get_for_dev+0xa8/0x220
> [ 8367.225460]  intel_iommu_add_device+0x61/0x590
> [ 8367.230708]  iommu_bus_notifier+0xb1/0xe0
> [ 8367.235768]  notifier_call_chain+0x47/0x70
> [ 8367.240757]  blocking_notifier_call_chain+0x3e/0x60
> [ 8367.245854]  device_add+0x3ec/0x690
> [ 8367.250533]  pci_device_add+0x26b/0x660
> [ 8367.255207]  pci_iov_add_virtfn+0x1ce/0x3b0
> [ 8367.259873]  sriov_enable+0x254/0x410
> [ 8367.264323]  dev_fops_ioctl+0x1378/0x1520 [sad8]
> [ 8367.322115]  init_fops_ioctl+0x12c/0x150 [sad8]
> [ 8367.324921]  do_vfs_ioctl+0xa4/0x630
> [ 8367.327415]  ksys_ioctl+0x70/0x80
> [ 8367.329822]  __x64_sys_ioctl+0x16/0x20
> [ 8367.332310]  do_syscall_64+0x5b/0x1a0
> [ 8367.334771]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> PF PRI/PASID enable path is,
> 
> [   11.084005] Call Trace:
> [   11.084005]  dump_stack+0x5c/0x7b
> [   11.084005]  iommu_enable_dev_iotlb+0x83/0x180
> [   11.084005]  domain_context_mapping_one+0x44f/0x500
> [   11.084005]  ? domain_context_mapping_one+0x500/0x500
> [   11.084005]  pci_for_each_dma_alias+0x30/0x170
> [   11.084005]  dmar_insert_one_dev_info+0x43f/0x4d0
> [   11.084005]  domain_add_dev_info+0x50/0x90
> [   11.084005]  intel_iommu_attach_device+0x9c/0x130
> [   11.084005]  __iommu_attach_device+0x47/0xb0
> [   11.084005]  ? _cond_resched+0x15/0x40
> [   11.084005]  iommu_group_add_device+0x13a/0x2c0
> [   11.084005]  iommu_group_get_for_dev+0xa8/0x220
> [   11.084005]  intel_iommu_add_device+0x61/0x590
> [   11.084005]  ? iommu_probe_device+0x40/0x40
> [   11.084005]  add_iommu_group+0xa/0x20
> [   11.084005]  bus_for_each_dev+0x76/0xc0
> [   11.084005]  bus_set_iommu+0x85/0xc0
> [   11.084005]  intel_iommu_init+0xfe5/0x11c1
> [   11.084005]  ? __fput+0x134/0x220
> [   11.084005]  ? set_debug_rodata+0x11/0x11
> [   11.084005]  ? e820__memblock_setup+0x60/0x60
> [   11.084005]  ? pci_iommu_init+0x16/0x3f
> [   11.084005]  pci_iommu_init+0x16/0x3f
> [   11.084005]  do_one_initcall+0x46/0x1f4
> [   11.084005]  kernel_init_freeable+0x1ba/0x283
> [   11.084005]  ? rest_init+0xb0/0xb0
> [   11.084005]  kernel_init+0xa/0x120
> [   11.084005]  ret_from_fork+0x1f/0x40
> 
> Similarly PF/VF PRI/PASID possible disable paths are,
> 
> iommu_hotplug_path->disable_dmar_iommu->__dmar_remove_one_dev_info->iommu_disable_dev_iotlb
> 
> domain_exit()->domain_remove_dev_info->iommu_disable_dev_iotlb
> 
> vfio_iommu_type1_detach_group()->iommu_detach_group()->intel_iommu_detach_device->dmar_remove_one_dev_info
> 
> But even in all of these paths, PF/VF PRI/PASID disable have to happen
> in order (VF first and then PF).
> 
> So we can implement the logic of not doing anything for VF when its
> related PRI/PASID calls. But my questions is, is it safe to go with
> these assumptions? Since all these dependencies we have found are not
> explicitly defined, if some one breaks it will also affect PRI/PASID
> logic. Let me know your comments.

I think we should assume PRI/PASID will be controlled via the PF.
That's true today because we initialize them via the IOMMU binding
path.  If the IOMMU path changes so that's no longer feasible, we
could probably do the initialization in the PCI core.  These features
are implemented in the PF, so I think the code will be simpler if it
mirrors that instead of trying to provide the illusion that they're in
the VF.

Bjorn
diff mbox series

Patch

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 1f4be27a071d..079dc5444444 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -189,6 +189,8 @@  void pci_pri_init(struct pci_dev *pdev)
 	if (pdev->is_virtfn)
 		return;
 
+	mutex_init(&pdev->pri_lock);
+
 	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
 	if (!pos)
 		return;
@@ -221,29 +223,57 @@  int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
 {
 	u16 control, status;
 	u32 max_requests;
+	int ret = 0;
+	struct pci_dev *pf = pci_physfn(pdev);
 
-	if (WARN_ON(pdev->pri_enabled))
-		return -EBUSY;
+	mutex_lock(&pf->pri_lock);
 
-	if (!pdev->pri_cap)
-		return -EINVAL;
+	if (WARN_ON(pdev->pri_enabled)) {
+		ret = -EBUSY;
+		goto pri_unlock;
+	}
 
-	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
-	if (!(status & PCI_PRI_STATUS_STOPPED))
-		return -EBUSY;
+	if (!pf->pri_cap) {
+		ret = -EINVAL;
+		goto pri_unlock;
+	}
+
+	if (pdev->is_virtfn && pf->pri_enabled)
+		goto update_status;
+
+	/*
+	 * Before updating PRI registers, make sure there is no
+	 * outstanding PRI requests.
+	 */
+	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
+	if (!(status & PCI_PRI_STATUS_STOPPED)) {
+		ret = -EBUSY;
+		goto pri_unlock;
+	}
 
-	pci_read_config_dword(pdev, pdev->pri_cap + PCI_PRI_MAX_REQ,
-			      &max_requests);
+	pci_read_config_dword(pf, pf->pri_cap + PCI_PRI_MAX_REQ, &max_requests);
 	reqs = min(max_requests, reqs);
-	pdev->pri_reqs_alloc = reqs;
-	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
+	pf->pri_reqs_alloc = reqs;
+	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
 
 	control = PCI_PRI_CTRL_ENABLE;
-	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
+	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
 
-	pdev->pri_enabled = 1;
+	/*
+	 * If PRI is not already enabled in PF, increment the PF
+	 * pri_ref_cnt to track the usage of PRI interface.
+	 */
+	if (pdev->is_virtfn && !pf->pri_enabled) {
+		atomic_inc(&pf->pri_ref_cnt);
+		pf->pri_enabled = 1;
+	}
 
-	return 0;
+update_status:
+	atomic_inc(&pf->pri_ref_cnt);
+	pdev->pri_enabled = 1;
+pri_unlock:
+	mutex_unlock(&pf->pri_lock);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pci_enable_pri);
 
@@ -256,18 +286,30 @@  EXPORT_SYMBOL_GPL(pci_enable_pri);
 void pci_disable_pri(struct pci_dev *pdev)
 {
 	u16 control;
+	struct pci_dev *pf = pci_physfn(pdev);
 
-	if (WARN_ON(!pdev->pri_enabled))
-		return;
+	mutex_lock(&pf->pri_lock);
 
-	if (!pdev->pri_cap)
-		return;
+	if (WARN_ON(!pdev->pri_enabled) || !pf->pri_cap)
+		goto pri_unlock;
+
+	atomic_dec(&pf->pri_ref_cnt);
 
-	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, &control);
+	/*
+	 * If pri_ref_cnt is not zero, then don't modify hardware
+	 * registers.
+	 */
+	if (atomic_read(&pf->pri_ref_cnt))
+		goto done;
+
+	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
 	control &= ~PCI_PRI_CTRL_ENABLE;
-	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
+	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
 
+done:
 	pdev->pri_enabled = 0;
+pri_unlock:
+	mutex_unlock(&pf->pri_lock);
 }
 EXPORT_SYMBOL_GPL(pci_disable_pri);
 
@@ -277,17 +319,31 @@  EXPORT_SYMBOL_GPL(pci_disable_pri);
  */
 void pci_restore_pri_state(struct pci_dev *pdev)
 {
-	u16 control = PCI_PRI_CTRL_ENABLE;
-	u32 reqs = pdev->pri_reqs_alloc;
+	u16 control;
+	u32 reqs;
+	struct pci_dev *pf = pci_physfn(pdev);
 
 	if (!pdev->pri_enabled)
 		return;
 
-	if (!pdev->pri_cap)
+	if (!pf->pri_cap)
 		return;
 
-	pci_write_config_dword(pdev, pdev->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
-	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
+	mutex_lock(&pf->pri_lock);
+
+	/* If PRI is already enabled by other VF's or PF, return */
+	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, &control);
+	if (control & PCI_PRI_CTRL_ENABLE)
+		goto pri_unlock;
+
+	reqs = pf->pri_reqs_alloc;
+	control = PCI_PRI_CTRL_ENABLE;
+
+	pci_write_config_dword(pf, pf->pri_cap + PCI_PRI_ALLOC_REQ, reqs);
+	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
+
+pri_unlock:
+	mutex_unlock(&pf->pri_lock);
 }
 EXPORT_SYMBOL_GPL(pci_restore_pri_state);
 
@@ -300,18 +356,32 @@  EXPORT_SYMBOL_GPL(pci_restore_pri_state);
  */
 int pci_reset_pri(struct pci_dev *pdev)
 {
+	struct pci_dev *pf = pci_physfn(pdev);
 	u16 control;
+	int ret = 0;
 
-	if (WARN_ON(pdev->pri_enabled))
-		return -EBUSY;
+	mutex_lock(&pf->pri_lock);
 
-	if (!pdev->pri_cap)
-		return -EINVAL;
+	if (WARN_ON(pdev->pri_enabled)) {
+		ret = -EBUSY;
+		goto done;
+	}
+
+	if (!pf->pri_cap) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/* If PRI is already enabled by other VF's or PF, return 0 */
+	if (pf->pri_enabled)
+		goto done;
 
 	control = PCI_PRI_CTRL_RESET;
-	pci_write_config_word(pdev, pdev->pri_cap + PCI_PRI_CTRL, control);
 
-	return 0;
+	pci_write_config_word(pf, pf->pri_cap + PCI_PRI_CTRL, control);
+done:
+	mutex_unlock(&pf->pri_lock);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pci_reset_pri);
 #endif /* CONFIG_PCI_PRI */
@@ -475,11 +545,18 @@  EXPORT_SYMBOL_GPL(pci_pasid_features);
 int pci_prg_resp_pasid_required(struct pci_dev *pdev)
 {
 	u16 status;
+	struct pci_dev *pf = pci_physfn(pdev);
+
+	mutex_lock(&pf->pri_lock);
 
-	if (!pdev->pri_cap)
+	if (!pf->pri_cap) {
+		mutex_unlock(&pf->pri_lock);
 		return 0;
+	}
+
+	pci_read_config_word(pf, pf->pri_cap + PCI_PRI_STATUS, &status);
 
-	pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status);
+	mutex_unlock(&pf->pri_lock);
 
 	if (status & PCI_PRI_STATUS_PASID)
 		return 1;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 27224c0db849..3c9c4c82be27 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -455,8 +455,10 @@  struct pci_dev {
 	atomic_t	ats_ref_cnt;	/* Number of VFs with ATS enabled */
 #endif
 #ifdef CONFIG_PCI_PRI
+	struct mutex	pri_lock;	/* PRI enable lock */
 	u16		pri_cap;	/* PRI Capability offset */
 	u32		pri_reqs_alloc; /* Number of PRI requests allocated */
+	atomic_t	pri_ref_cnt;	/* Number of PF/VF PRI users */
 #endif
 #ifdef CONFIG_PCI_PASID
 	u16		pasid_cap;	/* PASID Capability offset */