diff mbox series

[v2] ice: Fix enabling SR-IOV with Xen

Message ID 20240429124922.2872002-1-ross.lagerwall@citrix.com (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series [v2] ice: Fix enabling SR-IOV with Xen | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Ross Lagerwall April 29, 2024, 12:49 p.m. UTC
When the PCI functions are created, Xen is informed about them and
caches the number of MSI-X entries each function has.  However, the
number of MSI-X entries is not set until after the hardware has been
configured and the VFs have been started. This prevents
PCI-passthrough from working because Xen rejects mapping MSI-X
interrupts to domains because it thinks the MSI-X interrupts don't
exist.

Fix this by moving the call to pci_enable_sriov() later so that the
number of MSI-X entries is set correctly in hardware by the time Xen
reads it.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Signed-off-by: Javi Merino <javi.merino@kernel.org>
---

In v2:
* Fix cleanup if pci_enable_sriov() fails.

 drivers/net/ethernet/intel/ice/ice_sriov.c | 23 +++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

Comments

Paul Menzel April 29, 2024, 1:04 p.m. UTC | #1
Dear Ross,


Thank you for your patch.

Am 29.04.24 um 14:49 schrieb Ross Lagerwall:
> When the PCI functions are created, Xen is informed about them and
> caches the number of MSI-X entries each function has.  However, the
> number of MSI-X entries is not set until after the hardware has been
> configured and the VFs have been started. This prevents
> PCI-passthrough from working because Xen rejects mapping MSI-X
> interrupts to domains because it thinks the MSI-X interrupts don't
> exist.

Thank you for this great problem description. Is there any log message 
shown, you could paste, so people can find this commit when searching 
for the log message?

Do you have a minimal test case, so the maintainers can reproduce this 
to test the fix?

> Fix this by moving the call to pci_enable_sriov() later so that the
> number of MSI-X entries is set correctly in hardware by the time Xen
> reads it.

It’d be great if you could be more specific on “later”, and why this is 
the correct place.

> Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
> Signed-off-by: Javi Merino <javi.merino@kernel.org>
> ---
> 
> In v2:
> * Fix cleanup on if pci_enable_sriov() fails.
> 
>   drivers/net/ethernet/intel/ice/ice_sriov.c | 23 +++++++++++++---------
>   1 file changed, 14 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
> index a958fcf3e6be..bc97493046a8 100644
> --- a/drivers/net/ethernet/intel/ice/ice_sriov.c
> +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
> @@ -864,6 +864,8 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
>   	int total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors;
>   	struct device *dev = ice_pf_to_dev(pf);
>   	struct ice_hw *hw = &pf->hw;
> +	struct ice_vf *vf;
> +	unsigned int bkt;
>   	int ret;
>   
>   	pf->sriov_irq_bm = bitmap_zalloc(total_vectors, GFP_KERNEL);
> @@ -877,24 +879,20 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
>   	set_bit(ICE_OICR_INTR_DIS, pf->state);
>   	ice_flush(hw);
>   
> -	ret = pci_enable_sriov(pf->pdev, num_vfs);
> -	if (ret)
> -		goto err_unroll_intr;
> -
>   	mutex_lock(&pf->vfs.table_lock);
>   
>   	ret = ice_set_per_vf_res(pf, num_vfs);
>   	if (ret) {
>   		dev_err(dev, "Not enough resources for %d VFs, err %d. Try with fewer number of VFs\n",
>   			num_vfs, ret);
> -		goto err_unroll_sriov;
> +		goto err_unroll_intr;
>   	}
>   
>   	ret = ice_create_vf_entries(pf, num_vfs);
>   	if (ret) {
>   		dev_err(dev, "Failed to allocate VF entries for %d VFs\n",
>   			num_vfs);
> -		goto err_unroll_sriov;
> +		goto err_unroll_intr;
>   	}
>   
>   	ice_eswitch_reserve_cp_queues(pf, num_vfs);
> @@ -905,6 +903,10 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
>   		goto err_unroll_vf_entries;
>   	}
>   
> +	ret = pci_enable_sriov(pf->pdev, num_vfs);
> +	if (ret)
> +		goto err_unroll_start_vfs;
> +
>   	clear_bit(ICE_VF_DIS, pf->state);
>   
>   	/* rearm global interrupts */
> @@ -915,12 +917,15 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
>   
>   	return 0;
>   
> +err_unroll_start_vfs:
> +	ice_for_each_vf(pf, bkt, vf) {
> +		ice_dis_vf_mappings(vf);
> +		ice_vf_vsi_release(vf);
> +	}

Why wasn’t this needed with `pci_enable_sriov()` done earlier?

>   err_unroll_vf_entries:
>   	ice_free_vf_entries(pf);
> -err_unroll_sriov:
> -	mutex_unlock(&pf->vfs.table_lock);
> -	pci_disable_sriov(pf->pdev);
>   err_unroll_intr:
> +	mutex_unlock(&pf->vfs.table_lock);
>   	/* rearm interrupts here */
>   	ice_irq_dynamic_ena(hw, NULL, NULL);
>   	clear_bit(ICE_OICR_INTR_DIS, pf->state);


Kind regards,

Paul
Ross Lagerwall April 30, 2024, 9:03 a.m. UTC | #2
On Mon, Apr 29, 2024 at 2:04 PM Paul Menzel <pmenzel@molgen.mpg.de> wrote:
>
> Dear Ross,
>
>
> Thank you for your patch.
>
> Am 29.04.24 um 14:49 schrieb Ross Lagerwall:
> > When the PCI functions are created, Xen is informed about them and
> > caches the number of MSI-X entries each function has.  However, the
> > number of MSI-X entries is not set until after the hardware has been
> > configured and the VFs have been started. This prevents
> > PCI-passthrough from working because Xen rejects mapping MSI-X
> > interrupts to domains because it thinks the MSI-X interrupts don't
> > exist.
>
> Thank you for this great problem description. Is there any log message
> shown, you could paste, so people can find this commit when searching
> for the log message?

When this issue occurs, QEMU repeatedly reports:

msi_msix_setup: Error: Mapping of MSI-X (err: 22, vec: 0, entry 0x1)

>
> Do you have a minimal test case, so the maintainers can reproduce this
> to test the fix?

Testing this requires setting up Xen which I wouldn't expect anyone to
do unless they already have an environment set up.

In any case, a "minimal" test would be something like:

1. Set up Xen with dom0 and another VM running Linux.
2. Pass through a VF to the VM. See that QEMU reports the above message
   and the VF is not usable within the VM.
3. Rebuild the dom0 kernel with the attached patch.
4. Pass through a VF to the VM. See that the VF is usable within the
   VM.

>
> > Fix this by moving the call to pci_enable_sriov() later so that the
> > number of MSI-X entries is set correctly in hardware by the time Xen
> > reads it.
>
> It’d be great if you could be more specific on “later”, and why this is
> the correct place.

"later" in this case means after ice_start_vfs() since it is at that
point that the hardware sets the number of MSI-X entries.
I expect that a maintainer or someone with more knowledge of the
hardware could explain why the hardware only sets the number of MSI-X
entries at this point.

>
> > Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
> > Signed-off-by: Javi Merino <javi.merino@kernel.org>
> > ---
> >
> > In v2:
> > * Fix cleanup on if pci_enable_sriov() fails.
> >
> >   drivers/net/ethernet/intel/ice/ice_sriov.c | 23 +++++++++++++---------
> >   1 file changed, 14 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
> > index a958fcf3e6be..bc97493046a8 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_sriov.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
> > @@ -864,6 +864,8 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
> >       int total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors;
> >       struct device *dev = ice_pf_to_dev(pf);
> >       struct ice_hw *hw = &pf->hw;
> > +     struct ice_vf *vf;
> > +     unsigned int bkt;
> >       int ret;
> >
> >       pf->sriov_irq_bm = bitmap_zalloc(total_vectors, GFP_KERNEL);
> > @@ -877,24 +879,20 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
> >       set_bit(ICE_OICR_INTR_DIS, pf->state);
> >       ice_flush(hw);
> >
> > -     ret = pci_enable_sriov(pf->pdev, num_vfs);
> > -     if (ret)
> > -             goto err_unroll_intr;
> > -
> >       mutex_lock(&pf->vfs.table_lock);
> >
> >       ret = ice_set_per_vf_res(pf, num_vfs);
> >       if (ret) {
> >               dev_err(dev, "Not enough resources for %d VFs, err %d. Try with fewer number of VFs\n",
> >                       num_vfs, ret);
> > -             goto err_unroll_sriov;
> > +             goto err_unroll_intr;
> >       }
> >
> >       ret = ice_create_vf_entries(pf, num_vfs);
> >       if (ret) {
> >               dev_err(dev, "Failed to allocate VF entries for %d VFs\n",
> >                       num_vfs);
> > -             goto err_unroll_sriov;
> > +             goto err_unroll_intr;
> >       }
> >
> >       ice_eswitch_reserve_cp_queues(pf, num_vfs);
> > @@ -905,6 +903,10 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
> >               goto err_unroll_vf_entries;
> >       }
> >
> > +     ret = pci_enable_sriov(pf->pdev, num_vfs);
> > +     if (ret)
> > +             goto err_unroll_start_vfs;
> > +
> >       clear_bit(ICE_VF_DIS, pf->state);
> >
> >       /* rearm global interrupts */
> > @@ -915,12 +917,15 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
> >
> >       return 0;
> >
> > +err_unroll_start_vfs:
> > +     ice_for_each_vf(pf, bkt, vf) {
> > +             ice_dis_vf_mappings(vf);
> > +             ice_vf_vsi_release(vf);
> > +     }
>
> Why wasn’t this needed with `pci_enable_sriov()` done earlier?

Previously ice_start_vfs() was the last function call that could fail
in this function. That is no longer the case, so when
pci_enable_sriov() fails, it needs to undo what was done in
ice_start_vfs().

Thanks,
Ross
Tony Nguyen May 6, 2024, 9:27 p.m. UTC | #3
On 4/29/2024 5:49 AM, Ross Lagerwall wrote:
> When the PCI functions are created, Xen is informed about them and
> caches the number of MSI-X entries each function has.  However, the
> number of MSI-X entries is not set until after the hardware has been
> configured and the VFs have been started. This prevents
> PCI-passthrough from working because Xen rejects mapping MSI-X
> interrupts to domains because it thinks the MSI-X interrupts don't
> exist.
> 
> Fix this by moving the call to pci_enable_sriov() later so that the
> number of MSI-X entries is set correctly in hardware by the time Xen
> reads it.
> 

Sorry, I missed this on initial review, but bug fixes should have a 
Fixes: tag

I assume you are targeting this for net, if so, can you mark it as 
'PATCH iwl-net'.

> Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
> Signed-off-by: Javi Merino <javi.merino@kernel.org>

Also, sender should be the last sign-off.

Thanks,
Tony
Sergey Temerkhanov May 8, 2024, 9:23 a.m. UTC | #4
This patch makes sense since VFs need to be assigned resources (especially MSI-X interrupt count)
before making these VFs visible, so that the kernel PCI enumeration code reads correct MSI-X
interrupt count for the VFs.

Regards,
Sergey
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
index a958fcf3e6be..bc97493046a8 100644
--- a/drivers/net/ethernet/intel/ice/ice_sriov.c
+++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
@@ -864,6 +864,8 @@  static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
 	int total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors;
 	struct device *dev = ice_pf_to_dev(pf);
 	struct ice_hw *hw = &pf->hw;
+	struct ice_vf *vf;
+	unsigned int bkt;
 	int ret;
 
 	pf->sriov_irq_bm = bitmap_zalloc(total_vectors, GFP_KERNEL);
@@ -877,24 +879,20 @@  static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
 	set_bit(ICE_OICR_INTR_DIS, pf->state);
 	ice_flush(hw);
 
-	ret = pci_enable_sriov(pf->pdev, num_vfs);
-	if (ret)
-		goto err_unroll_intr;
-
 	mutex_lock(&pf->vfs.table_lock);
 
 	ret = ice_set_per_vf_res(pf, num_vfs);
 	if (ret) {
 		dev_err(dev, "Not enough resources for %d VFs, err %d. Try with fewer number of VFs\n",
 			num_vfs, ret);
-		goto err_unroll_sriov;
+		goto err_unroll_intr;
 	}
 
 	ret = ice_create_vf_entries(pf, num_vfs);
 	if (ret) {
 		dev_err(dev, "Failed to allocate VF entries for %d VFs\n",
 			num_vfs);
-		goto err_unroll_sriov;
+		goto err_unroll_intr;
 	}
 
 	ice_eswitch_reserve_cp_queues(pf, num_vfs);
@@ -905,6 +903,10 @@  static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
 		goto err_unroll_vf_entries;
 	}
 
+	ret = pci_enable_sriov(pf->pdev, num_vfs);
+	if (ret)
+		goto err_unroll_start_vfs;
+
 	clear_bit(ICE_VF_DIS, pf->state);
 
 	/* rearm global interrupts */
@@ -915,12 +917,15 @@  static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs)
 
 	return 0;
 
+err_unroll_start_vfs:
+	ice_for_each_vf(pf, bkt, vf) {
+		ice_dis_vf_mappings(vf);
+		ice_vf_vsi_release(vf);
+	}
 err_unroll_vf_entries:
 	ice_free_vf_entries(pf);
-err_unroll_sriov:
-	mutex_unlock(&pf->vfs.table_lock);
-	pci_disable_sriov(pf->pdev);
 err_unroll_intr:
+	mutex_unlock(&pf->vfs.table_lock);
 	/* rearm interrupts here */
 	ice_irq_dynamic_ena(hw, NULL, NULL);
 	clear_bit(ICE_OICR_INTR_DIS, pf->state);