diff mbox series

[v9,03/12] net: mana: Handle vport sharing between devices

Message ID 1666396889-31288-4-git-send-email-longli@linuxonhyperv.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series Introduce Microsoft Azure Network Adapter (MANA) RDMA driver | expand

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 108 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Long Li Oct. 22, 2022, 12:01 a.m. UTC
From: Long Li <longli@microsoft.com>

For outgoing packets, the PF requires the VF to configure the vport with
corresponding protection domain and doorbell ID for the kernel or user
context. The vport can't be shared between different contexts.

Implement the logic to exclusively take over the vport by either the
Ethernet device or RDMA device.

Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
Acked-by: Haiyang Zhang <haiyangz@microsoft.com>
---
Change log:
v2: use refcount instead of directly using atomic variables
v4: change to mutex to avoid possible race with refcount
v5: add detailed comments explaining vport sharing, use EXPORT_SYMBOL_NS
v6: rebased to rdma-next

 drivers/net/ethernet/microsoft/mana/mana.h    |  7 +++
 drivers/net/ethernet/microsoft/mana/mana_en.c | 53 ++++++++++++++++++-
 2 files changed, 58 insertions(+), 2 deletions(-)

Comments

Yunsheng Lin Oct. 24, 2022, 1:20 a.m. UTC | #1
On 2022/10/22 8:01, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> For outgoing packets, the PF requires the VF to configure the vport with
> corresponding protection domain and doorbell ID for the kernel or user
> context. The vport can't be shared between different contexts.
> 
> Implement the logic to exclusively take over the vport by either the
> Ethernet device or RDMA device.
> 
> Reviewed-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Long Li <longli@microsoft.com>
> Acked-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
> Change log:
> v2: use refcount instead of directly using atomic variables
> v4: change to mutex to avoid possible race with refcount
> v5: add detailed comments explaining vport sharing, use EXPORT_SYMBOL_NS
> v6: rebased to rdma-next
> 
>  drivers/net/ethernet/microsoft/mana/mana.h    |  7 +++
>  drivers/net/ethernet/microsoft/mana/mana_en.c | 53 ++++++++++++++++++-
>  2 files changed, 58 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/mana.h b/drivers/net/ethernet/microsoft/mana/mana.h
> index d58be64374c8..2883a08dbfb5 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana.h
> +++ b/drivers/net/ethernet/microsoft/mana/mana.h
> @@ -380,6 +380,10 @@ struct mana_port_context {
>  	mana_handle_t port_handle;
>  	mana_handle_t pf_filter_handle;
>  
> +	/* Mutex for sharing access to vport_use_count */
> +	struct mutex vport_mutex;
> +	int vport_use_count;
> +
>  	u16 port_idx;
>  
>  	bool port_is_up;
> @@ -631,4 +635,7 @@ struct mana_tx_package {
>  	struct gdma_posted_wqe_info wqe_info;
>  };
>  
> +int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
> +		   u32 doorbell_pg_id);
> +void mana_uncfg_vport(struct mana_port_context *apc);
>  #endif /* _MANA_H */
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 8751e475d1ba..efe14a343fd1 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -646,13 +646,48 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
>  	return 0;
>  }
>  
> -static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
> -			  u32 doorbell_pg_id)
> +void mana_uncfg_vport(struct mana_port_context *apc)
> +{
> +	mutex_lock(&apc->vport_mutex);
> +	apc->vport_use_count--;
> +	WARN_ON(apc->vport_use_count < 0);
> +	mutex_unlock(&apc->vport_mutex);
> +}
> +EXPORT_SYMBOL_NS(mana_uncfg_vport, NET_MANA);
> +
> +int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
> +		   u32 doorbell_pg_id)
>  {
>  	struct mana_config_vport_resp resp = {};
>  	struct mana_config_vport_req req = {};
>  	int err;
>  
> +	/* This function is used to program the Ethernet port in the hardware
> +	 * table. It can be called from the Ethernet driver or the RDMA driver.
> +	 *
> +	 * For Ethernet usage, the hardware supports only one active user on a
> +	 * physical port. The driver checks on the port usage before programming
> +	 * the hardware when creating the RAW QP (RDMA driver) or exposing the
> +	 * device to kernel NET layer (Ethernet driver).
> +	 *
> +	 * Because the RDMA driver doesn't know in advance which QP type the
> +	 * user will create, it exposes the device with all its ports. The user
> +	 * may not be able to create RAW QP on a port if this port is already
> +	 * in used by the Ethernet driver from the kernel.
> +	 *
> +	 * This physical port limitation only applies to the RAW QP. For RC QP,
> +	 * the hardware doesn't have this limitation. The user can create RC
> +	 * QPs on a physical port up to the hardware limits independent of the
> +	 * Ethernet usage on the same port.
> +	 */
> +	mutex_lock(&apc->vport_mutex);
> +	if (apc->vport_use_count > 0) {
> +		mutex_unlock(&apc->vport_mutex);
> +		return -EBUSY;
> +	}
> +	apc->vport_use_count++;
> +	mutex_unlock(&apc->vport_mutex);
> +
>  	mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
>  			     sizeof(req), sizeof(resp));
>  	req.vport = apc->port_handle;
> @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
>  
>  	apc->tx_shortform_allowed = resp.short_form_allowed;
>  	apc->tx_vp_offset = resp.tx_vport_offset;
> +
> +	netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
> +		    apc->port_handle, protection_dom_id, doorbell_pg_id);
>  out:
> +	if (err)
> +		mana_uncfg_vport(apc);

There seems to be a similar race between error handling here and the
"apc->vport_use_count > 0" checking above as pointed out in v7.

> +
>  	return err;
>  }
> +EXPORT_SYMBOL_NS(mana_cfg_vport, NET_MANA);
>  
>  static int mana_cfg_vport_steering(struct mana_port_context *apc,
>  				   enum TRI_STATE rx,
> @@ -742,6 +784,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
>  			   resp.hdr.status);
>  		err = -EPROTO;
>  	}
> +
> +	netdev_info(ndev, "Configured steering vPort %llu entries %u\n",
> +		    apc->port_handle, num_entries);
>  out:
>  	kfree(req);
>  	return err;
> @@ -1804,6 +1849,7 @@ static void mana_destroy_vport(struct mana_port_context *apc)
>  	}
>  
>  	mana_destroy_txq(apc);
> +	mana_uncfg_vport(apc);
>  
>  	if (gd->gdma_context->is_pf)
>  		mana_pf_deregister_hw_vport(apc);
> @@ -2076,6 +2122,9 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
>  	apc->pf_filter_handle = INVALID_MANA_HANDLE;
>  	apc->port_idx = port_idx;
>  
> +	mutex_init(&apc->vport_mutex);
> +	apc->vport_use_count = 0;
> +
>  	ndev->netdev_ops = &mana_devops;
>  	ndev->ethtool_ops = &mana_ethtool_ops;
>  	ndev->mtu = ETH_DATA_LEN;
>
Long Li Oct. 24, 2022, 6:45 p.m. UTC | #2
> > +int mana_cfg_vport(struct mana_port_context *apc, u32
> protection_dom_id,
> > +		   u32 doorbell_pg_id)
> >  {
> >  	struct mana_config_vport_resp resp = {};
> >  	struct mana_config_vport_req req = {};
> >  	int err;
> >
> > +	/* This function is used to program the Ethernet port in the hardware
> > +	 * table. It can be called from the Ethernet driver or the RDMA driver.
> > +	 *
> > +	 * For Ethernet usage, the hardware supports only one active user on
> a
> > +	 * physical port. The driver checks on the port usage before
> programming
> > +	 * the hardware when creating the RAW QP (RDMA driver) or
> exposing the
> > +	 * device to kernel NET layer (Ethernet driver).
> > +	 *
> > +	 * Because the RDMA driver doesn't know in advance which QP type
> the
> > +	 * user will create, it exposes the device with all its ports. The user
> > +	 * may not be able to create RAW QP on a port if this port is already
> > +	 * in used by the Ethernet driver from the kernel.
> > +	 *
> > +	 * This physical port limitation only applies to the RAW QP. For RC QP,
> > +	 * the hardware doesn't have this limitation. The user can create RC
> > +	 * QPs on a physical port up to the hardware limits independent of
> the
> > +	 * Ethernet usage on the same port.
> > +	 */
> > +	mutex_lock(&apc->vport_mutex);
> > +	if (apc->vport_use_count > 0) {
> > +		mutex_unlock(&apc->vport_mutex);
> > +		return -EBUSY;
> > +	}
> > +	apc->vport_use_count++;
> > +	mutex_unlock(&apc->vport_mutex);
> > +
> >  	mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
> >  			     sizeof(req), sizeof(resp));
> >  	req.vport = apc->port_handle;
> > @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct
> > mana_port_context *apc, u32 protection_dom_id,
> >
> >  	apc->tx_shortform_allowed = resp.short_form_allowed;
> >  	apc->tx_vp_offset = resp.tx_vport_offset;
> > +
> > +	netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
> > +		    apc->port_handle, protection_dom_id, doorbell_pg_id);
> >  out:
> > +	if (err)
> > +		mana_uncfg_vport(apc);
> 
> There seems to be a similar race between error handling here and the "apc-
> >vport_use_count > 0" checking above as pointed out in v7.

Thanks for looking into this.

This is different to the locking bug in mana_ib_cfg_vport(). The vport sharing
between Ethernet and RDMA is exclusive, not shared. If another driver tries
to take the vport while it is being configured, it will fail immediately. It is by
design to prevent possible deadlock.
Yunsheng Lin Oct. 25, 2022, 1:08 a.m. UTC | #3
On 2022/10/25 2:45, Long Li wrote:
>>> +int mana_cfg_vport(struct mana_port_context *apc, u32
>> protection_dom_id,
>>> +		   u32 doorbell_pg_id)
>>>  {
>>>  	struct mana_config_vport_resp resp = {};
>>>  	struct mana_config_vport_req req = {};
>>>  	int err;
>>>
>>> +	/* This function is used to program the Ethernet port in the hardware
>>> +	 * table. It can be called from the Ethernet driver or the RDMA driver.
>>> +	 *
>>> +	 * For Ethernet usage, the hardware supports only one active user on
>> a
>>> +	 * physical port. The driver checks on the port usage before
>> programming
>>> +	 * the hardware when creating the RAW QP (RDMA driver) or
>> exposing the
>>> +	 * device to kernel NET layer (Ethernet driver).
>>> +	 *
>>> +	 * Because the RDMA driver doesn't know in advance which QP type
>> the
>>> +	 * user will create, it exposes the device with all its ports. The user
>>> +	 * may not be able to create RAW QP on a port if this port is already
>>> +	 * in used by the Ethernet driver from the kernel.
>>> +	 *
>>> +	 * This physical port limitation only applies to the RAW QP. For RC QP,
>>> +	 * the hardware doesn't have this limitation. The user can create RC
>>> +	 * QPs on a physical port up to the hardware limits independent of
>> the
>>> +	 * Ethernet usage on the same port.
>>> +	 */
>>> +	mutex_lock(&apc->vport_mutex);
>>> +	if (apc->vport_use_count > 0) {
>>> +		mutex_unlock(&apc->vport_mutex);
>>> +		return -EBUSY;
>>> +	}
>>> +	apc->vport_use_count++;
>>> +	mutex_unlock(&apc->vport_mutex);
>>> +
>>>  	mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
>>>  			     sizeof(req), sizeof(resp));
>>>  	req.vport = apc->port_handle;
>>> @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct
>>> mana_port_context *apc, u32 protection_dom_id,
>>>
>>>  	apc->tx_shortform_allowed = resp.short_form_allowed;
>>>  	apc->tx_vp_offset = resp.tx_vport_offset;
>>> +
>>> +	netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
>>> +		    apc->port_handle, protection_dom_id, doorbell_pg_id);
>>>  out:
>>> +	if (err)
>>> +		mana_uncfg_vport(apc);
>>
>> There seems to be a similar race between error handling here and the "apc-
>>> vport_use_count > 0" checking above as pointed out in v7.
> 
> Thanks for looking into this.
> 
> This is different to the locking bug in mana_ib_cfg_vport(). The vport sharing
> between Ethernet and RDMA is exclusive, not shared. If another driver tries
> to take the vport while it is being configured, it will fail immediately. It is by

Suppose the following steps:
1. Ethernet driver take the lock first and do a "apc->vport_use_count++", and
   release the lock;
2. RDMA driver take the lock, do "apc->vport_use_count > 0" checking and return
   -EBUSY;
3. mana_send_request() or mana_verify_resp_hdr() return error to Ethernet driver.

It seems that vport is left unused when above happens, if that is what you wanted?


> design to prevent possible deadlock.

I am not sure I understand the deadlock here.

>
Long Li Oct. 25, 2022, 1:43 a.m. UTC | #4
> >>> @@ -679,9 +714,16 @@ static int mana_cfg_vport(struct
> >>> mana_port_context *apc, u32 protection_dom_id,
> >>>
> >>>  	apc->tx_shortform_allowed = resp.short_form_allowed;
> >>>  	apc->tx_vp_offset = resp.tx_vport_offset;
> >>> +
> >>> +	netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
> >>> +		    apc->port_handle, protection_dom_id, doorbell_pg_id);
> >>>  out:
> >>> +	if (err)
> >>> +		mana_uncfg_vport(apc);
> >>
> >> There seems to be a similar race between error handling here and the
> >> "apc-
> >>> vport_use_count > 0" checking above as pointed out in v7.
> >
> > Thanks for looking into this.
> >
> > This is different to the locking bug in mana_ib_cfg_vport(). The vport
> > sharing between Ethernet and RDMA is exclusive, not shared. If another
> > driver tries to take the vport while it is being configured, it will
> > fail immediately. It is by
> 
> Suppose the following steps:
> 1. Ethernet driver take the lock first and do a "apc->vport_use_count++",
> and
>    release the lock;
> 2. RDMA driver take the lock, do "apc->vport_use_count > 0" checking and
> return
>    -EBUSY;
> 3. mana_send_request() or mana_verify_resp_hdr() return error to
> Ethernet driver.
> 
> It seems that vport is left unused when above happens, if that is what you
> wanted?

Yes, in this case the vport is left unused. There is no resource leak.
This is expected.

> 
> 
> > design to prevent possible deadlock.
> 
> I am not sure I understand the deadlock here.

Because we are dealing with two drivers. I don't want to block as
mana_send_request() is a blocking call.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/microsoft/mana/mana.h b/drivers/net/ethernet/microsoft/mana/mana.h
index d58be64374c8..2883a08dbfb5 100644
--- a/drivers/net/ethernet/microsoft/mana/mana.h
+++ b/drivers/net/ethernet/microsoft/mana/mana.h
@@ -380,6 +380,10 @@  struct mana_port_context {
 	mana_handle_t port_handle;
 	mana_handle_t pf_filter_handle;
 
+	/* Mutex for sharing access to vport_use_count */
+	struct mutex vport_mutex;
+	int vport_use_count;
+
 	u16 port_idx;
 
 	bool port_is_up;
@@ -631,4 +635,7 @@  struct mana_tx_package {
 	struct gdma_posted_wqe_info wqe_info;
 };
 
+int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
+		   u32 doorbell_pg_id);
+void mana_uncfg_vport(struct mana_port_context *apc);
 #endif /* _MANA_H */
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 8751e475d1ba..efe14a343fd1 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -646,13 +646,48 @@  static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
 	return 0;
 }
 
-static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
-			  u32 doorbell_pg_id)
+void mana_uncfg_vport(struct mana_port_context *apc)
+{
+	mutex_lock(&apc->vport_mutex);
+	apc->vport_use_count--;
+	WARN_ON(apc->vport_use_count < 0);
+	mutex_unlock(&apc->vport_mutex);
+}
+EXPORT_SYMBOL_NS(mana_uncfg_vport, NET_MANA);
+
+int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
+		   u32 doorbell_pg_id)
 {
 	struct mana_config_vport_resp resp = {};
 	struct mana_config_vport_req req = {};
 	int err;
 
+	/* This function is used to program the Ethernet port in the hardware
+	 * table. It can be called from the Ethernet driver or the RDMA driver.
+	 *
+	 * For Ethernet usage, the hardware supports only one active user on a
+	 * physical port. The driver checks on the port usage before programming
+	 * the hardware when creating the RAW QP (RDMA driver) or exposing the
+	 * device to kernel NET layer (Ethernet driver).
+	 *
+	 * Because the RDMA driver doesn't know in advance which QP type the
+	 * user will create, it exposes the device with all its ports. The user
+	 * may not be able to create RAW QP on a port if this port is already
+	 * in used by the Ethernet driver from the kernel.
+	 *
+	 * This physical port limitation only applies to the RAW QP. For RC QP,
+	 * the hardware doesn't have this limitation. The user can create RC
+	 * QPs on a physical port up to the hardware limits independent of the
+	 * Ethernet usage on the same port.
+	 */
+	mutex_lock(&apc->vport_mutex);
+	if (apc->vport_use_count > 0) {
+		mutex_unlock(&apc->vport_mutex);
+		return -EBUSY;
+	}
+	apc->vport_use_count++;
+	mutex_unlock(&apc->vport_mutex);
+
 	mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
 			     sizeof(req), sizeof(resp));
 	req.vport = apc->port_handle;
@@ -679,9 +714,16 @@  static int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
 
 	apc->tx_shortform_allowed = resp.short_form_allowed;
 	apc->tx_vp_offset = resp.tx_vport_offset;
+
+	netdev_info(apc->ndev, "Configured vPort %llu PD %u DB %u\n",
+		    apc->port_handle, protection_dom_id, doorbell_pg_id);
 out:
+	if (err)
+		mana_uncfg_vport(apc);
+
 	return err;
 }
+EXPORT_SYMBOL_NS(mana_cfg_vport, NET_MANA);
 
 static int mana_cfg_vport_steering(struct mana_port_context *apc,
 				   enum TRI_STATE rx,
@@ -742,6 +784,9 @@  static int mana_cfg_vport_steering(struct mana_port_context *apc,
 			   resp.hdr.status);
 		err = -EPROTO;
 	}
+
+	netdev_info(ndev, "Configured steering vPort %llu entries %u\n",
+		    apc->port_handle, num_entries);
 out:
 	kfree(req);
 	return err;
@@ -1804,6 +1849,7 @@  static void mana_destroy_vport(struct mana_port_context *apc)
 	}
 
 	mana_destroy_txq(apc);
+	mana_uncfg_vport(apc);
 
 	if (gd->gdma_context->is_pf)
 		mana_pf_deregister_hw_vport(apc);
@@ -2076,6 +2122,9 @@  static int mana_probe_port(struct mana_context *ac, int port_idx,
 	apc->pf_filter_handle = INVALID_MANA_HANDLE;
 	apc->port_idx = port_idx;
 
+	mutex_init(&apc->vport_mutex);
+	apc->vport_use_count = 0;
+
 	ndev->netdev_ops = &mana_devops;
 	ndev->ethtool_ops = &mana_ethtool_ops;
 	ndev->mtu = ETH_DATA_LEN;