diff mbox series

[net-next] virtio-net: switch napi_tx without downing nic

Message ID f9f7d28624f8084ef07842ee569c22b324ee4055.1703059341.git.hengqi@linux.alibaba.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next] virtio-net: switch napi_tx without downing nic | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1115 this patch: 1115
netdev/cc_maintainers warning 1 maintainers not CCed: virtualization@lists.linux.dev
netdev/build_clang fail Errors and warnings before: 12 this patch: 12
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1144 this patch: 1144
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 117 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Heng Qi Dec. 20, 2023, 8:07 a.m. UTC
virtio-net has two ways to switch napi_tx: one is through the
module parameter, and the other is through coalescing parameter
settings (provided that the nic status is down).

Sometimes we face performance regression caused by napi_tx,
then we need to switch napi_tx when debugging. However, the
existing methods are a bit troublesome, such as needing to
reload the driver or turn off the network card. So try to make
this update.

Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/virtio_net.c | 81 ++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 44 deletions(-)

Comments

Willem de Bruijn Dec. 20, 2023, 2:45 p.m. UTC | #1
Heng Qi wrote:
> virtio-net has two ways to switch napi_tx: one is through the
> module parameter, and the other is through coalescing parameter
> settings (provided that the nic status is down).
> 
> Sometimes we face performance regression caused by napi_tx,
> then we need to switch napi_tx when debugging. However, the
> existing methods are a bit troublesome, such as needing to
> reload the driver or turn off the network card. So try to make
> this update.
> 
> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

The commit does not explain why it is safe to do so.

The tx-napi weights are not really weights: it is a boolean whether
napi is used for transmit cleaning, or whether packets are cleaned
in ndo_start_xmit.

There certainly are some subtle issues with regard to pausing/waking
queues when switching between modes.

Calling napi_enable/napi_disable without bringing down the device is
allowed. The actual napi.weight field is only updated when neither
napi nor ndo_start_xmit is running. So I don't see an immediate issue.


> ---
>  drivers/net/virtio_net.c | 81 ++++++++++++++++++----------------------
>  1 file changed, 37 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 10614e9f7cad..12f8e1f9971c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
>  	return 0;
>  }
>  
> -static int virtnet_should_update_vq_weight(int dev_flags, int weight,
> -					   int vq_weight, bool *should_update)
> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart,
> +				   u32 qend, u32 tx_frames)
>  {
> -	if (weight ^ vq_weight) {
> -		if (dev_flags & IFF_UP)
> -			return -EBUSY;
> -		*should_update = true;
> -	}
> +	struct net_device *dev = vi->dev;
> +	int new_weight, cur_weight;
> +	struct netdev_queue *txq;
> +	struct send_queue *sq;
>  
> -	return 0;
> +	new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0;
> +	for (; qstart < qend; qstart++) {
> +		sq = &vi->sq[qstart];
> +		cur_weight = sq->napi.weight;
> +		if (!(new_weight ^ cur_weight))
> +			continue;
> +
> +		if (!(dev->flags & IFF_UP)) {
> +			sq->napi.weight = new_weight;
> +			continue;
> +		}
> +
> +		if (cur_weight)
> +			virtnet_napi_tx_disable(&sq->napi);
> +
> +		txq = netdev_get_tx_queue(dev, qstart);
> +		__netif_tx_lock_bh(txq);
> +		sq->napi.weight = new_weight;
> +		__netif_tx_unlock_bh(txq);
> +
> +		if (!cur_weight)
> +			virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
> +	}
>  }
>  
>  static int virtnet_set_coalesce(struct net_device *dev,
> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev,
>  				struct netlink_ext_ack *extack)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> -	int ret, queue_number, napi_weight;
> -	bool update_napi = false;
> -
> -	/* Can't change NAPI weight if the link is up */
> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> -	for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) {
> -		ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
> -						      vi->sq[queue_number].napi.weight,
> -						      &update_napi);
> -		if (ret)
> -			return ret;
> -
> -		if (update_napi) {
> -			/* All queues that belong to [queue_number, vi->max_queue_pairs] will be
> -			 * updated for the sake of simplicity, which might not be necessary
> -			 */
> -			break;
> -		}
> -	}
> +	int ret;
> +
> +	/* Param tx_frames can be used to switch napi_tx */
> +	virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs,
> +			       ec->tx_max_coalesced_frames);
>  
>  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
>  		ret = virtnet_send_notf_coal_cmds(vi, ec);
> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev,
>  	if (ret)
>  		return ret;
>  
> -	if (update_napi) {
> -		for (; queue_number < vi->max_queue_pairs; queue_number++)
> -			vi->sq[queue_number].napi.weight = napi_weight;
> -	}
> -
>  	return ret;
>  }
>  
> @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
>  					  struct ethtool_coalesce *ec)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> -	int ret, napi_weight;
> -	bool update_napi = false;
> +	int ret;
>  
>  	if (queue >= vi->max_queue_pairs)
>  		return -EINVAL;
>  
> -	/* Can't change NAPI weight if the link is up */
> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> -	ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
> -					      vi->sq[queue].napi.weight,
> -					      &update_napi);
> -	if (ret)
> -		return ret;
> +	/* Param tx_frames can be used to switch napi_tx */
> +	virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames);
>  
>  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
>  		ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue);
> @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
>  	if (ret)
>  		return ret;
>  
> -	if (update_napi)
> -		vi->sq[queue].napi.weight = napi_weight;
> -
>  	return 0;
>  }
>  
> -- 
> 2.19.1.6.gb485710b
>
Zhu Yanjun Dec. 21, 2023, 3:02 a.m. UTC | #2
在 2023/12/20 16:07, Heng Qi 写道:
> virtio-net has two ways to switch napi_tx: one is through the
> module parameter, and the other is through coalescing parameter
> settings (provided that the nic status is down).
> 
> Sometimes we face performance regression caused by napi_tx,
> then we need to switch napi_tx when debugging. However, the
> existing methods are a bit troublesome, such as needing to
> reload the driver or turn off the network card. So try to make
> this update.

What scenario can trigger this? We want to make tests on our device.

Zhu Yanjun

> 
> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>   drivers/net/virtio_net.c | 81 ++++++++++++++++++----------------------
>   1 file changed, 37 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 10614e9f7cad..12f8e1f9971c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
>   	return 0;
>   }
>   
> -static int virtnet_should_update_vq_weight(int dev_flags, int weight,
> -					   int vq_weight, bool *should_update)
> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart,
> +				   u32 qend, u32 tx_frames)
>   {
> -	if (weight ^ vq_weight) {
> -		if (dev_flags & IFF_UP)
> -			return -EBUSY;
> -		*should_update = true;
> -	}
> +	struct net_device *dev = vi->dev;
> +	int new_weight, cur_weight;
> +	struct netdev_queue *txq;
> +	struct send_queue *sq;
>   
> -	return 0;
> +	new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0;
> +	for (; qstart < qend; qstart++) {
> +		sq = &vi->sq[qstart];
> +		cur_weight = sq->napi.weight;
> +		if (!(new_weight ^ cur_weight))
> +			continue;
> +
> +		if (!(dev->flags & IFF_UP)) {
> +			sq->napi.weight = new_weight;
> +			continue;
> +		}
> +
> +		if (cur_weight)
> +			virtnet_napi_tx_disable(&sq->napi);
> +
> +		txq = netdev_get_tx_queue(dev, qstart);
> +		__netif_tx_lock_bh(txq);
> +		sq->napi.weight = new_weight;
> +		__netif_tx_unlock_bh(txq);
> +
> +		if (!cur_weight)
> +			virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
> +	}
>   }
>   
>   static int virtnet_set_coalesce(struct net_device *dev,
> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev,
>   				struct netlink_ext_ack *extack)
>   {
>   	struct virtnet_info *vi = netdev_priv(dev);
> -	int ret, queue_number, napi_weight;
> -	bool update_napi = false;
> -
> -	/* Can't change NAPI weight if the link is up */
> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> -	for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) {
> -		ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
> -						      vi->sq[queue_number].napi.weight,
> -						      &update_napi);
> -		if (ret)
> -			return ret;
> -
> -		if (update_napi) {
> -			/* All queues that belong to [queue_number, vi->max_queue_pairs] will be
> -			 * updated for the sake of simplicity, which might not be necessary
> -			 */
> -			break;
> -		}
> -	}
> +	int ret;
> +
> +	/* Param tx_frames can be used to switch napi_tx */
> +	virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs,
> +			       ec->tx_max_coalesced_frames);
>   
>   	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
>   		ret = virtnet_send_notf_coal_cmds(vi, ec);
> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev,
>   	if (ret)
>   		return ret;
>   
> -	if (update_napi) {
> -		for (; queue_number < vi->max_queue_pairs; queue_number++)
> -			vi->sq[queue_number].napi.weight = napi_weight;
> -	}
> -
>   	return ret;
>   }
>   
> @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
>   					  struct ethtool_coalesce *ec)
>   {
>   	struct virtnet_info *vi = netdev_priv(dev);
> -	int ret, napi_weight;
> -	bool update_napi = false;
> +	int ret;
>   
>   	if (queue >= vi->max_queue_pairs)
>   		return -EINVAL;
>   
> -	/* Can't change NAPI weight if the link is up */
> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> -	ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
> -					      vi->sq[queue].napi.weight,
> -					      &update_napi);
> -	if (ret)
> -		return ret;
> +	/* Param tx_frames can be used to switch napi_tx */
> +	virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames);
>   
>   	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
>   		ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue);
> @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
>   	if (ret)
>   		return ret;
>   
> -	if (update_napi)
> -		vi->sq[queue].napi.weight = napi_weight;
> -
>   	return 0;
>   }
>
Heng Qi Dec. 21, 2023, 5:18 a.m. UTC | #3
在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> Heng Qi wrote:
>> virtio-net has two ways to switch napi_tx: one is through the
>> module parameter, and the other is through coalescing parameter
>> settings (provided that the nic status is down).
>>
>> Sometimes we face performance regression caused by napi_tx,
>> then we need to switch napi_tx when debugging. However, the
>> existing methods are a bit troublesome, such as needing to
>> reload the driver or turn off the network card. So try to make
>> this update.
>>
>> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
>> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> The commit does not explain why it is safe to do so.

virtnet_napi_tx_disable ensures that already scheduled tx napi ends and 
no new tx napi will be scheduled.

Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot 
send the packet.

Then we can safely toggle the weight to indicate where to clear the buffers.

>
> The tx-napi weights are not really weights: it is a boolean whether
> napi is used for transmit cleaning, or whether packets are cleaned
> in ndo_start_xmit.

Right.

>
> There certainly are some subtle issues with regard to pausing/waking
> queues when switching between modes.

What are "subtle issues" and if there are any, we find them.
So far my test results show it's working fine.

>
> Calling napi_enable/napi_disable without bringing down the device is
> allowed. The actual napi.weight field is only updated when neither
> napi nor ndo_start_xmit is running.

YES.

> So I don't see an immediate issue.

Switching napi_tx requires reloading the driver or downing the nic, 
which is troublesome.
I think it would be better if we could find a better way.

Thanks!

>
>
>> ---
>>   drivers/net/virtio_net.c | 81 ++++++++++++++++++----------------------
>>   1 file changed, 37 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 10614e9f7cad..12f8e1f9971c 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
>>   	return 0;
>>   }
>>   
>> -static int virtnet_should_update_vq_weight(int dev_flags, int weight,
>> -					   int vq_weight, bool *should_update)
>> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart,
>> +				   u32 qend, u32 tx_frames)
>>   {
>> -	if (weight ^ vq_weight) {
>> -		if (dev_flags & IFF_UP)
>> -			return -EBUSY;
>> -		*should_update = true;
>> -	}
>> +	struct net_device *dev = vi->dev;
>> +	int new_weight, cur_weight;
>> +	struct netdev_queue *txq;
>> +	struct send_queue *sq;
>>   
>> -	return 0;
>> +	new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0;
>> +	for (; qstart < qend; qstart++) {
>> +		sq = &vi->sq[qstart];
>> +		cur_weight = sq->napi.weight;
>> +		if (!(new_weight ^ cur_weight))
>> +			continue;
>> +
>> +		if (!(dev->flags & IFF_UP)) {
>> +			sq->napi.weight = new_weight;
>> +			continue;
>> +		}
>> +
>> +		if (cur_weight)
>> +			virtnet_napi_tx_disable(&sq->napi);
>> +
>> +		txq = netdev_get_tx_queue(dev, qstart);
>> +		__netif_tx_lock_bh(txq);
>> +		sq->napi.weight = new_weight;
>> +		__netif_tx_unlock_bh(txq);
>> +
>> +		if (!cur_weight)
>> +			virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
>> +	}
>>   }
>>   
>>   static int virtnet_set_coalesce(struct net_device *dev,
>> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev,
>>   				struct netlink_ext_ack *extack)
>>   {
>>   	struct virtnet_info *vi = netdev_priv(dev);
>> -	int ret, queue_number, napi_weight;
>> -	bool update_napi = false;
>> -
>> -	/* Can't change NAPI weight if the link is up */
>> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>> -	for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) {
>> -		ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
>> -						      vi->sq[queue_number].napi.weight,
>> -						      &update_napi);
>> -		if (ret)
>> -			return ret;
>> -
>> -		if (update_napi) {
>> -			/* All queues that belong to [queue_number, vi->max_queue_pairs] will be
>> -			 * updated for the sake of simplicity, which might not be necessary
>> -			 */
>> -			break;
>> -		}
>> -	}
>> +	int ret;
>> +
>> +	/* Param tx_frames can be used to switch napi_tx */
>> +	virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs,
>> +			       ec->tx_max_coalesced_frames);
>>   
>>   	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
>>   		ret = virtnet_send_notf_coal_cmds(vi, ec);
>> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev,
>>   	if (ret)
>>   		return ret;
>>   
>> -	if (update_napi) {
>> -		for (; queue_number < vi->max_queue_pairs; queue_number++)
>> -			vi->sq[queue_number].napi.weight = napi_weight;
>> -	}
>> -
>>   	return ret;
>>   }
>>   
>> @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
>>   					  struct ethtool_coalesce *ec)
>>   {
>>   	struct virtnet_info *vi = netdev_priv(dev);
>> -	int ret, napi_weight;
>> -	bool update_napi = false;
>> +	int ret;
>>   
>>   	if (queue >= vi->max_queue_pairs)
>>   		return -EINVAL;
>>   
>> -	/* Can't change NAPI weight if the link is up */
>> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>> -	ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
>> -					      vi->sq[queue].napi.weight,
>> -					      &update_napi);
>> -	if (ret)
>> -		return ret;
>> +	/* Param tx_frames can be used to switch napi_tx */
>> +	virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames);
>>   
>>   	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
>>   		ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue);
>> @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
>>   	if (ret)
>>   		return ret;
>>   
>> -	if (update_napi)
>> -		vi->sq[queue].napi.weight = napi_weight;
>> -
>>   	return 0;
>>   }
>>   
>> -- 
>> 2.19.1.6.gb485710b
>>
Heng Qi Dec. 21, 2023, 5:20 a.m. UTC | #4
在 2023/12/21 上午11:02, Zhu Yanjun 写道:
> 在 2023/12/20 16:07, Heng Qi 写道:
>> virtio-net has two ways to switch napi_tx: one is through the
>> module parameter, and the other is through coalescing parameter
>> settings (provided that the nic status is down).
>>
>> Sometimes we face performance regression caused by napi_tx,
>> then we need to switch napi_tx when debugging. However, the
>> existing methods are a bit troublesome, such as needing to
>> reload the driver or turn off the network card. So try to make
>> this update.
>
> What scenario can trigger this? We want to make tests on our device.

Hi Zhu Yanjun, you can use the following cmds:

ethtool -C tx-frames 0, to disable napi_tx
ethtool -C tx-frames 1, to enable napi_tx

Thanks.

>
> Zhu Yanjun
>
>>
>> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
>> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>> ---
>>   drivers/net/virtio_net.c | 81 ++++++++++++++++++----------------------
>>   1 file changed, 37 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 10614e9f7cad..12f8e1f9971c 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -3559,16 +3559,37 @@ static int 
>> virtnet_coal_params_supported(struct ethtool_coalesce *ec)
>>       return 0;
>>   }
>>   -static int virtnet_should_update_vq_weight(int dev_flags, int weight,
>> -                       int vq_weight, bool *should_update)
>> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart,
>> +                   u32 qend, u32 tx_frames)
>>   {
>> -    if (weight ^ vq_weight) {
>> -        if (dev_flags & IFF_UP)
>> -            return -EBUSY;
>> -        *should_update = true;
>> -    }
>> +    struct net_device *dev = vi->dev;
>> +    int new_weight, cur_weight;
>> +    struct netdev_queue *txq;
>> +    struct send_queue *sq;
>>   -    return 0;
>> +    new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0;
>> +    for (; qstart < qend; qstart++) {
>> +        sq = &vi->sq[qstart];
>> +        cur_weight = sq->napi.weight;
>> +        if (!(new_weight ^ cur_weight))
>> +            continue;
>> +
>> +        if (!(dev->flags & IFF_UP)) {
>> +            sq->napi.weight = new_weight;
>> +            continue;
>> +        }
>> +
>> +        if (cur_weight)
>> +            virtnet_napi_tx_disable(&sq->napi);
>> +
>> +        txq = netdev_get_tx_queue(dev, qstart);
>> +        __netif_tx_lock_bh(txq);
>> +        sq->napi.weight = new_weight;
>> +        __netif_tx_unlock_bh(txq);
>> +
>> +        if (!cur_weight)
>> +            virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
>> +    }
>>   }
>>     static int virtnet_set_coalesce(struct net_device *dev,
>> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct 
>> net_device *dev,
>>                   struct netlink_ext_ack *extack)
>>   {
>>       struct virtnet_info *vi = netdev_priv(dev);
>> -    int ret, queue_number, napi_weight;
>> -    bool update_napi = false;
>> -
>> -    /* Can't change NAPI weight if the link is up */
>> -    napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>> -    for (queue_number = 0; queue_number < vi->max_queue_pairs; 
>> queue_number++) {
>> -        ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
>> - vi->sq[queue_number].napi.weight,
>> -                              &update_napi);
>> -        if (ret)
>> -            return ret;
>> -
>> -        if (update_napi) {
>> -            /* All queues that belong to [queue_number, 
>> vi->max_queue_pairs] will be
>> -             * updated for the sake of simplicity, which might not 
>> be necessary
>> -             */
>> -            break;
>> -        }
>> -    }
>> +    int ret;
>> +
>> +    /* Param tx_frames can be used to switch napi_tx */
>> +    virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs,
>> +                   ec->tx_max_coalesced_frames);
>>         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
>>           ret = virtnet_send_notf_coal_cmds(vi, ec);
>> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct 
>> net_device *dev,
>>       if (ret)
>>           return ret;
>>   -    if (update_napi) {
>> -        for (; queue_number < vi->max_queue_pairs; queue_number++)
>> -            vi->sq[queue_number].napi.weight = napi_weight;
>> -    }
>> -
>>       return ret;
>>   }
>>   @@ -3641,19 +3643,13 @@ static int 
>> virtnet_set_per_queue_coalesce(struct net_device *dev,
>>                         struct ethtool_coalesce *ec)
>>   {
>>       struct virtnet_info *vi = netdev_priv(dev);
>> -    int ret, napi_weight;
>> -    bool update_napi = false;
>> +    int ret;
>>         if (queue >= vi->max_queue_pairs)
>>           return -EINVAL;
>>   -    /* Can't change NAPI weight if the link is up */
>> -    napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>> -    ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
>> -                          vi->sq[queue].napi.weight,
>> -                          &update_napi);
>> -    if (ret)
>> -        return ret;
>> +    /* Param tx_frames can be used to switch napi_tx */
>> +    virtnet_switch_napi_tx(vi, queue, queue, 
>> ec->tx_max_coalesced_frames);
>>         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
>>           ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue);
>> @@ -3663,9 +3659,6 @@ static int 
>> virtnet_set_per_queue_coalesce(struct net_device *dev,
>>       if (ret)
>>           return ret;
>>   -    if (update_napi)
>> -        vi->sq[queue].napi.weight = napi_weight;
>> -
>>       return 0;
>>   }
Zhu Yanjun Dec. 21, 2023, 2:34 p.m. UTC | #5
在 2023/12/21 13:20, Heng Qi 写道:
>
>
> 在 2023/12/21 上午11:02, Zhu Yanjun 写道:
>> 在 2023/12/20 16:07, Heng Qi 写道:
>>> virtio-net has two ways to switch napi_tx: one is through the
>>> module parameter, and the other is through coalescing parameter
>>> settings (provided that the nic status is down).
>>>
>>> Sometimes we face performance regression caused by napi_tx,
>>> then we need to switch napi_tx when debugging. However, the
>>> existing methods are a bit troublesome, such as needing to
>>> reload the driver or turn off the network card. So try to make
>>> this update.
>>
>> What scenario can trigger this? We want to make tests on our device.
>
> Hi Zhu Yanjun, you can use the following cmds:
>
> ethtool -C tx-frames 0, to disable napi_tx
> ethtool -C tx-frames 1, to enable napi_tx


Thanks a lot. Just now I made tests on our device. I confirmed that the 
virtio_net driver can work well after running "ethtool -C NIC tx-frames 
0 && sleep 3 && ethtool -C NIC tx-frames 1".

You can add "Reviewed-and-tested-by: Zhu Yanjun <yanjun.zhu@linux.dev>"

Thanks,

Zhu Yanjun

>
> Thanks.
>
>>
>> Zhu Yanjun
>>
>>>
>>> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
>>> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>>> ---
>>>   drivers/net/virtio_net.c | 81 
>>> ++++++++++++++++++----------------------
>>>   1 file changed, 37 insertions(+), 44 deletions(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 10614e9f7cad..12f8e1f9971c 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -3559,16 +3559,37 @@ static int 
>>> virtnet_coal_params_supported(struct ethtool_coalesce *ec)
>>>       return 0;
>>>   }
>>>   -static int virtnet_should_update_vq_weight(int dev_flags, int 
>>> weight,
>>> -                       int vq_weight, bool *should_update)
>>> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 
>>> qstart,
>>> +                   u32 qend, u32 tx_frames)
>>>   {
>>> -    if (weight ^ vq_weight) {
>>> -        if (dev_flags & IFF_UP)
>>> -            return -EBUSY;
>>> -        *should_update = true;
>>> -    }
>>> +    struct net_device *dev = vi->dev;
>>> +    int new_weight, cur_weight;
>>> +    struct netdev_queue *txq;
>>> +    struct send_queue *sq;
>>>   -    return 0;
>>> +    new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0;
>>> +    for (; qstart < qend; qstart++) {
>>> +        sq = &vi->sq[qstart];
>>> +        cur_weight = sq->napi.weight;
>>> +        if (!(new_weight ^ cur_weight))
>>> +            continue;
>>> +
>>> +        if (!(dev->flags & IFF_UP)) {
>>> +            sq->napi.weight = new_weight;
>>> +            continue;
>>> +        }
>>> +
>>> +        if (cur_weight)
>>> +            virtnet_napi_tx_disable(&sq->napi);
>>> +
>>> +        txq = netdev_get_tx_queue(dev, qstart);
>>> +        __netif_tx_lock_bh(txq);
>>> +        sq->napi.weight = new_weight;
>>> +        __netif_tx_unlock_bh(txq);
>>> +
>>> +        if (!cur_weight)
>>> +            virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
>>> +    }
>>>   }
>>>     static int virtnet_set_coalesce(struct net_device *dev,
>>> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct 
>>> net_device *dev,
>>>                   struct netlink_ext_ack *extack)
>>>   {
>>>       struct virtnet_info *vi = netdev_priv(dev);
>>> -    int ret, queue_number, napi_weight;
>>> -    bool update_napi = false;
>>> -
>>> -    /* Can't change NAPI weight if the link is up */
>>> -    napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>>> -    for (queue_number = 0; queue_number < vi->max_queue_pairs; 
>>> queue_number++) {
>>> -        ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
>>> - vi->sq[queue_number].napi.weight,
>>> -                              &update_napi);
>>> -        if (ret)
>>> -            return ret;
>>> -
>>> -        if (update_napi) {
>>> -            /* All queues that belong to [queue_number, 
>>> vi->max_queue_pairs] will be
>>> -             * updated for the sake of simplicity, which might not 
>>> be necessary
>>> -             */
>>> -            break;
>>> -        }
>>> -    }
>>> +    int ret;
>>> +
>>> +    /* Param tx_frames can be used to switch napi_tx */
>>> +    virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs,
>>> +                   ec->tx_max_coalesced_frames);
>>>         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
>>>           ret = virtnet_send_notf_coal_cmds(vi, ec);
>>> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct 
>>> net_device *dev,
>>>       if (ret)
>>>           return ret;
>>>   -    if (update_napi) {
>>> -        for (; queue_number < vi->max_queue_pairs; queue_number++)
>>> -            vi->sq[queue_number].napi.weight = napi_weight;
>>> -    }
>>> -
>>>       return ret;
>>>   }
>>>   @@ -3641,19 +3643,13 @@ static int 
>>> virtnet_set_per_queue_coalesce(struct net_device *dev,
>>>                         struct ethtool_coalesce *ec)
>>>   {
>>>       struct virtnet_info *vi = netdev_priv(dev);
>>> -    int ret, napi_weight;
>>> -    bool update_napi = false;
>>> +    int ret;
>>>         if (queue >= vi->max_queue_pairs)
>>>           return -EINVAL;
>>>   -    /* Can't change NAPI weight if the link is up */
>>> -    napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>>> -    ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
>>> -                          vi->sq[queue].napi.weight,
>>> -                          &update_napi);
>>> -    if (ret)
>>> -        return ret;
>>> +    /* Param tx_frames can be used to switch napi_tx */
>>> +    virtnet_switch_napi_tx(vi, queue, queue, 
>>> ec->tx_max_coalesced_frames);
>>>         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
>>>           ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue);
>>> @@ -3663,9 +3659,6 @@ static int 
>>> virtnet_set_per_queue_coalesce(struct net_device *dev,
>>>       if (ret)
>>>           return ret;
>>>   -    if (update_napi)
>>> -        vi->sq[queue].napi.weight = napi_weight;
>>> -
>>>       return 0;
>>>   }
>
Willem de Bruijn Dec. 21, 2023, 3:05 p.m. UTC | #6
Heng Qi wrote:
> 
> 
> 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > Heng Qi wrote:
> >> virtio-net has two ways to switch napi_tx: one is through the
> >> module parameter, and the other is through coalescing parameter
> >> settings (provided that the nic status is down).
> >>
> >> Sometimes we face performance regression caused by napi_tx,
> >> then we need to switch napi_tx when debugging. However, the
> >> existing methods are a bit troublesome, such as needing to
> >> reload the driver or turn off the network card. So try to make
> >> this update.
> >>
> >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > The commit does not explain why it is safe to do so.
> 
> virtnet_napi_tx_disable ensures that already scheduled tx napi ends and 
> no new tx napi will be scheduled.
> 
> Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot 
> send the packet.
> 
> Then we can safely toggle the weight to indicate where to clear the buffers.
> 
> >
> > The tx-napi weights are not really weights: it is a boolean whether
> > napi is used for transmit cleaning, or whether packets are cleaned
> > in ndo_start_xmit.
> 
> Right.
> 
> >
> > There certainly are some subtle issues with regard to pausing/waking
> > queues when switching between modes.
> 
> What are "subtle issues" and if there are any, we find them.

A single runtime test is not sufficient to exercise all edge cases.

Please don't leave it to reviewers to establish the correctness of a
patch.

The napi_tx and non-napi code paths differ in how they handle at least
the following structures:

1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
needed as delay until the next ndo_start_xmit and thus completion is
unbounded.

When switching to napi mode, orphaned skbs may now be cleaned by the
napi handler. This is indeed safe.

When switching from napi to non-napi, the unbounded latency resurfaces.
It is a small edge case, and I think a potentially acceptable risk, if
the user of this knob is aware of the risk.

2. virtqueue callback ("interrupt" masking). The non-napi path enables
the interrupt (disables the mask) when available descriptors falls
beneath a low watermark, and reenables when it recovers above a high
watermark. Napi disables when napi is scheduled, and reenables on
napi complete.

3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
a low watermark, the driver stops the stack for queuing more packets.
In napi mode, it schedules napi to clean packets. See the calls to
netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
netif_tx_wake_queue.

Some of this can be assumed safe by looking at existing analogous
code, such as the queue stop/start in virtnet_tx_resize.

But that all virtqueue callback and dev_queue->state transitions are
correct when switching between modes at runtime is not trivial to
establish, deserves some thought and explanation in the commit
message.

> So far my test results show it's working fine.
> 
> >
> > Calling napi_enable/napi_disable without bringing down the device is
> > allowed. The actually napi.weight field is only updated when neither
> > napi nor ndo_start_xmit is running.
> 
> YES.
> 
> > So I don't see an immediate issue.
> 
> Switching napi_tx requires reloading the driver or downing the nic, 
> which is troublesome.
> I think it would be better if we could find a better way.
> 
> Thanks!
> 
> >
> >
> >> ---
> >>   drivers/net/virtio_net.c | 81 ++++++++++++++++++----------------------
> >>   1 file changed, 37 insertions(+), 44 deletions(-)
> >>
> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> index 10614e9f7cad..12f8e1f9971c 100644
> >> --- a/drivers/net/virtio_net.c
> >> +++ b/drivers/net/virtio_net.c
> >> @@ -3559,16 +3559,37 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
> >>   	return 0;
> >>   }
> >>   
> >> -static int virtnet_should_update_vq_weight(int dev_flags, int weight,
> >> -					   int vq_weight, bool *should_update)
> >> +static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart,
> >> +				   u32 qend, u32 tx_frames)
> >>   {
> >> -	if (weight ^ vq_weight) {
> >> -		if (dev_flags & IFF_UP)
> >> -			return -EBUSY;
> >> -		*should_update = true;
> >> -	}
> >> +	struct net_device *dev = vi->dev;
> >> +	int new_weight, cur_weight;
> >> +	struct netdev_queue *txq;
> >> +	struct send_queue *sq;
> >>   
> >> -	return 0;
> >> +	new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0;
> >> +	for (; qstart < qend; qstart++) {
> >> +		sq = &vi->sq[qstart];
> >> +		cur_weight = sq->napi.weight;
> >> +		if (!(new_weight ^ cur_weight))
> >> +			continue;
> >> +
> >> +		if (!(dev->flags & IFF_UP)) {
> >> +			sq->napi.weight = new_weight;
> >> +			continue;
> >> +		}
> >> +
> >> +		if (cur_weight)
> >> +			virtnet_napi_tx_disable(&sq->napi);
> >> +
> >> +		txq = netdev_get_tx_queue(dev, qstart);
> >> +		__netif_tx_lock_bh(txq);
> >> +		sq->napi.weight = new_weight;
> >> +		__netif_tx_unlock_bh(txq);
> >> +
> >> +		if (!cur_weight)
> >> +			virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
> >> +	}
> >>   }
> >>   
> >>   static int virtnet_set_coalesce(struct net_device *dev,
> >> @@ -3577,25 +3598,11 @@ static int virtnet_set_coalesce(struct net_device *dev,
> >>   				struct netlink_ext_ack *extack)
> >>   {
> >>   	struct virtnet_info *vi = netdev_priv(dev);
> >> -	int ret, queue_number, napi_weight;
> >> -	bool update_napi = false;
> >> -
> >> -	/* Can't change NAPI weight if the link is up */
> >> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> >> -	for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) {
> >> -		ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
> >> -						      vi->sq[queue_number].napi.weight,
> >> -						      &update_napi);
> >> -		if (ret)
> >> -			return ret;
> >> -
> >> -		if (update_napi) {
> >> -			/* All queues that belong to [queue_number, vi->max_queue_pairs] will be
> >> -			 * updated for the sake of simplicity, which might not be necessary
> >> -			 */
> >> -			break;
> >> -		}
> >> -	}
> >> +	int ret;
> >> +
> >> +	/* Param tx_frames can be used to switch napi_tx */
> >> +	virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs,
> >> +			       ec->tx_max_coalesced_frames);
> >>   
> >>   	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
> >>   		ret = virtnet_send_notf_coal_cmds(vi, ec);
> >> @@ -3605,11 +3612,6 @@ static int virtnet_set_coalesce(struct net_device *dev,
> >>   	if (ret)
> >>   		return ret;
> >>   
> >> -	if (update_napi) {
> >> -		for (; queue_number < vi->max_queue_pairs; queue_number++)
> >> -			vi->sq[queue_number].napi.weight = napi_weight;
> >> -	}
> >> -
> >>   	return ret;
> >>   }
> >>   
> >> @@ -3641,19 +3643,13 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
> >>   					  struct ethtool_coalesce *ec)
> >>   {
> >>   	struct virtnet_info *vi = netdev_priv(dev);
> >> -	int ret, napi_weight;
> >> -	bool update_napi = false;
> >> +	int ret;
> >>   
> >>   	if (queue >= vi->max_queue_pairs)
> >>   		return -EINVAL;
> >>   
> >> -	/* Can't change NAPI weight if the link is up */
> >> -	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> >> -	ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
> >> -					      vi->sq[queue].napi.weight,
> >> -					      &update_napi);
> >> -	if (ret)
> >> -		return ret;
> >> +	/* Param tx_frames can be used to switch napi_tx */
> >> +	virtnet_switch_napi_tx(vi, queue, queue, ec->tx_max_coalesced_frames);
> >>   
> >>   	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
> >>   		ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue);
> >> @@ -3663,9 +3659,6 @@ static int virtnet_set_per_queue_coalesce(struct net_device *dev,
> >>   	if (ret)
> >>   		return ret;
> >>   
> >> -	if (update_napi)
> >> -		vi->sq[queue].napi.weight = napi_weight;
> >> -
> >>   	return 0;
> >>   }
> >>   
> >> -- 
> >> 2.19.1.6.gb485710b
> >>
>
Jason Wang Dec. 22, 2023, 2:35 a.m. UTC | #7
On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Heng Qi wrote:
> >
> >
> > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > Heng Qi wrote:
> > >> virtio-net has two ways to switch napi_tx: one is through the
> > >> module parameter, and the other is through coalescing parameter
> > >> settings (provided that the nic status is down).
> > >>
> > >> Sometimes we face performance regression caused by napi_tx,
> > >> then we need to switch napi_tx when debugging. However, the
> > >> existing methods are a bit troublesome, such as needing to
> > >> reload the driver or turn off the network card.

Why is this troublesome? We don't need to turn off the card, it's just
a toggling of the interface.

This ends up with pretty simple code.

> So try to make
> > >> this update.
> > >>
> > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > The commit does not explain why it is safe to do so.
> >
> > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > no new tx napi will be scheduled.
> >
> > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > send the packet.
> >
> > Then we can safely toggle the weight to indicate where to clear the buffers.
> >
> > >
> > > The tx-napi weights are not really weights: it is a boolean whether
> > > napi is used for transmit cleaning, or whether packets are cleaned
> > > in ndo_start_xmit.
> >
> > Right.
> >
> > >
> > > There certainly are some subtle issues with regard to pausing/waking
> > > queues when switching between modes.
> >
> > What are "subtle issues" and if there are any, we find them.
>
> A single runtime test is not sufficient to exercise all edge cases.
>
> Please don't leave it to reviewers to establish the correctness of a
> patch.

+1

And instead of trying to do this, it would be much better to optimize
the NAPI performance. Then we can drop the orphan mode.

>
> The napi_tx and non-napi code paths differ in how they handle at least
> the following structures:
>
> 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> needed as delay until the next ndo_start_xmit and thus completion is
> unbounded.
>
> When switching to napi mode, orphaned skbs may now be cleaned by the
> napi handler. This is indeed safe.
>
> When switching from napi to non-napi, the unbound latency resurfaces.
> It is a small edge case, and I think a potentially acceptable risk, if
> the user of this knob is aware of the risk.
>
> 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> the interrupt (disables the mask) when available descriptors falls
> beneath a low watermark, and reenables when it recovers above a high
> watermark. Napi disables when napi is scheduled, and reenables on
> napi complete.
>
> 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> a low watermark, the driver stops the stack for queuing more packets.
> In napi mode, it schedules napi to clean packets. See the calls to
> netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> netif_tx_wake_queue.
>
> Some if this can be assumed safe by looking at existing analogous
> code, such as the queue stop/start in virtnet_tx_resize.
>
> But that all virtqueue callback and dev_queue->state transitions are
> correct when switching between modes at runtime is not trivial to
> establish, deserves some thought and explanation in the commit
> message.

Thanks
Michael S. Tsirkin Dec. 22, 2023, 8:14 a.m. UTC | #8
On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote:
> On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Heng Qi wrote:
> > >
> > >
> > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > Heng Qi wrote:
> > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > >> module parameter, and the other is through coalescing parameter
> > > >> settings (provided that the nic status is down).
> > > >>
> > > >> Sometimes we face performance regression caused by napi_tx,
> > > >> then we need to switch napi_tx when debugging. However, the
> > > >> existing methods are a bit troublesome, such as needing to
> > > >> reload the driver or turn off the network card.
> 
> Why is this troublesome? We don't need to turn off the card, it's just
> a toggling of the interface.
> 
> This ends up with pretty simple code.
> 
> > So try to make
> > > >> this update.
> > > >>
> > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > The commit does not explain why it is safe to do so.
> > >
> > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > no new tx napi will be scheduled.
> > >
> > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > send the packet.
> > >
> > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > >
> > > >
> > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > in ndo_start_xmit.
> > >
> > > Right.
> > >
> > > >
> > > > There certainly are some subtle issues with regard to pausing/waking
> > > > queues when switching between modes.
> > >
> > > What are "subtle issues" and if there are any, we find them.
> >
> > A single runtime test is not sufficient to exercise all edge cases.
> >
> > Please don't leave it to reviewers to establish the correctness of a
> > patch.
> 
> +1
> 
> And instead of trying to do this, it would be much better to optimize
> the NAPI performance. Then we can drop the orphan mode.

"To address your problem, optimize our code to the level which we
couldn't achieve in more than 10 years".  That's not a reasonable
requirement. Not getting an interrupt will always be better for some
workloads.


> >
> > The napi_tx and non-napi code paths differ in how they handle at least
> > the following structures:
> >
> > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > needed as delay until the next ndo_start_xmit and thus completion is
> > unbounded.
> >
> > When switching to napi mode, orphaned skbs may now be cleaned by the
> > napi handler. This is indeed safe.
> >
> > When switching from napi to non-napi, the unbound latency resurfaces.
> > It is a small edge case, and I think a potentially acceptable risk, if
> > the user of this knob is aware of the risk.
> >
> > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > the interrupt (disables the mask) when available descriptors falls
> > beneath a low watermark, and reenables when it recovers above a high
> > watermark. Napi disables when napi is scheduled, and reenables on
> > napi complete.
> >
> > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > a low watermark, the driver stops the stack for queuing more packets.
> > In napi mode, it schedules napi to clean packets. See the calls to
> > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > netif_tx_wake_queue.
> >
> > Some if this can be assumed safe by looking at existing analogous
> > code, such as the queue stop/start in virtnet_tx_resize.
> >
> > But that all virtqueue callback and dev_queue->state transitions are
> > correct when switching between modes at runtime is not trivial to
> > establish, deserves some thought and explanation in the commit
> > message.
> 
> Thanks
Jason Xing Dec. 25, 2023, 2:24 a.m. UTC | #9
Hello Jason,
On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Heng Qi wrote:
> > >
> > >
> > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > Heng Qi wrote:
> > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > >> module parameter, and the other is through coalescing parameter
> > > >> settings (provided that the nic status is down).
> > > >>
> > > >> Sometimes we face performance regression caused by napi_tx,
> > > >> then we need to switch napi_tx when debugging. However, the
> > > >> existing methods are a bit troublesome, such as needing to
> > > >> reload the driver or turn off the network card.
>
> Why is this troublesome? We don't need to turn off the card, it's just
> a toggling of the interface.
>
> This ends up with pretty simple code.
>
> > So try to make
> > > >> this update.
> > > >>
> > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > The commit does not explain why it is safe to do so.
> > >
> > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > no new tx napi will be scheduled.
> > >
> > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > send the packet.
> > >
> > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > >
> > > >
> > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > in ndo_start_xmit.
> > >
> > > Right.
> > >
> > > >
> > > > There certainly are some subtle issues with regard to pausing/waking
> > > > queues when switching between modes.
> > >
> > > What are "subtle issues" and if there are any, we find them.
> >
> > A single runtime test is not sufficient to exercise all edge cases.
> >
> > Please don't leave it to reviewers to establish the correctness of a
> > patch.
>
> +1
>
[...]
> And instead of trying to do this, it would be much better to optimize
> the NAPI performance. Then we can drop the orphan mode.

Do you mean when to call skb_orphan()? If yes, I just want to provide
more information that we also have some performance issues where the
flow control has a bad effect, especially under some small
throughput in our production environment.
What strikes me as odd is if I restart the network, then the issue
will go with the wind. I cannot reproduce it in my testing machine.
One more thing: if I force skb_orphan() the current skb in every
start_xmit(), it could also solve the issue but not in a proper way.
After all, it drops the flow control... :S

Thanks,
Jason
>
> >
> > The napi_tx and non-napi code paths differ in how they handle at least
> > the following structures:
> >
> > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > needed as delay until the next ndo_start_xmit and thus completion is
> > unbounded.
> >
> > When switching to napi mode, orphaned skbs may now be cleaned by the
> > napi handler. This is indeed safe.
> >
> > When switching from napi to non-napi, the unbound latency resurfaces.
> > It is a small edge case, and I think a potentially acceptable risk, if
> > the user of this knob is aware of the risk.
> >
> > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > the interrupt (disables the mask) when available descriptors falls
> > beneath a low watermark, and reenables when it recovers above a high
> > watermark. Napi disables when napi is scheduled, and reenables on
> > napi complete.
> >
> > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > a low watermark, the driver stops the stack for queuing more packets.
> > In napi mode, it schedules napi to clean packets. See the calls to
> > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > netif_tx_wake_queue.
> >
> > Some if this can be assumed safe by looking at existing analogous
> > code, such as the queue stop/start in virtnet_tx_resize.
> >
> > But that all virtqueue callback and dev_queue->state transitions are
> > correct when switching between modes at runtime is not trivial to
> > establish, deserves some thought and explanation in the commit
> > message.
>
> Thanks
>
>
Jason Wang Dec. 25, 2023, 4:12 a.m. UTC | #10
On Fri, Dec 22, 2023 at 4:14 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote:
> > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Heng Qi wrote:
> > > >
> > > >
> > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > Heng Qi wrote:
> > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > >> module parameter, and the other is through coalescing parameter
> > > > >> settings (provided that the nic status is down).
> > > > >>
> > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > >> then we need to switch napi_tx when debugging. However, the
> > > > >> existing methods are a bit troublesome, such as needing to
> > > > >> reload the driver or turn off the network card.
> >
> > Why is this troublesome? We don't need to turn off the card, it's just
> > a toggling of the interface.
> >
> > This ends up with pretty simple code.
> >
> > > So try to make
> > > > >> this update.
> > > > >>
> > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > The commit does not explain why it is safe to do so.
> > > >
> > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > no new tx napi will be scheduled.
> > > >
> > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > send the packet.
> > > >
> > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > >
> > > > >
> > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > in ndo_start_xmit.
> > > >
> > > > Right.
> > > >
> > > > >
> > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > queues when switching between modes.
> > > >
> > > > What are "subtle issues" and if there are any, we find them.
> > >
> > > A single runtime test is not sufficient to exercise all edge cases.
> > >
> > > Please don't leave it to reviewers to establish the correctness of a
> > > patch.
> >
> > +1
> >
> > And instead of trying to do this, it would be much better to optimize
> > the NAPI performance. Then we can drop the orphan mode.
>
> "To address your problem, optimize our code to the level which we
> couldn't achieve in more than 10 years".

Last time QE didn't report any issue for TCP. For others, the code
might just need some optimization if it really matters; it's just
that nobody has worked on this part in the past years.

The ethtool trick is just for debugging purposes, I can hardly believe
it is used by any management layer software.

> That's not a reasonable
> requirement. Not getting an interrupt will always be better for some
> workloads.

So NAPI has been enabled by default for many years. And most of the
NIC doesn't do orphans. Orphan has known issues like pktgen and
others. Keeping two modes may result in tricky code and complicate the
features like BQL [1]. We would have interrupt coalescing and dim
soon. Admin can enable the heuristic or tweak it according to the
workload which is much more user friendly than orphaning.

I don't see a strong point to keep the orphan mode any more
considering the advantages of NAPI.

Thanks

[1] https://lore.kernel.org/lkml/20181205225323.12555-2-mst@redhat.com/



>
>
> > >
> > > The napi_tx and non-napi code paths differ in how they handle at least
> > > the following structures:
> > >
> > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > > needed as delay until the next ndo_start_xmit and thus completion is
> > > unbounded.
> > >
> > > When switching to napi mode, orphaned skbs may now be cleaned by the
> > > napi handler. This is indeed safe.
> > >
> > > When switching from napi to non-napi, the unbound latency resurfaces.
> > > It is a small edge case, and I think a potentially acceptable risk, if
> > > the user of this knob is aware of the risk.
> > >
> > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > > the interrupt (disables the mask) when available descriptors falls
> > > beneath a low watermark, and reenables when it recovers above a high
> > > watermark. Napi disables when napi is scheduled, and reenables on
> > > napi complete.
> > >
> > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > > a low watermark, the driver stops the stack for queuing more packets.
> > > In napi mode, it schedules napi to clean packets. See the calls to
> > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > > netif_tx_wake_queue.
> > >
> > > Some if this can be assumed safe by looking at existing analogous
> > > code, such as the queue stop/start in virtnet_tx_resize.
> > >
> > > But that all virtqueue callback and dev_queue->state transitions are
> > > correct when switching between modes at runtime is not trivial to
> > > establish, deserves some thought and explanation in the commit
> > > message.
> >
> > Thanks
>
Jason Wang Dec. 25, 2023, 4:14 a.m. UTC | #11
On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> Hello Jason,
> On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Heng Qi wrote:
> > > >
> > > >
> > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > Heng Qi wrote:
> > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > >> module parameter, and the other is through coalescing parameter
> > > > >> settings (provided that the nic status is down).
> > > > >>
> > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > >> then we need to switch napi_tx when debugging. However, the
> > > > >> existing methods are a bit troublesome, such as needing to
> > > > >> reload the driver or turn off the network card.
> >
> > Why is this troublesome? We don't need to turn off the card, it's just
> > a toggling of the interface.
> >
> > This ends up with pretty simple code.
> >
> > > So try to make
> > > > >> this update.
> > > > >>
> > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > The commit does not explain why it is safe to do so.
> > > >
> > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > no new tx napi will be scheduled.
> > > >
> > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > send the packet.
> > > >
> > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > >
> > > > >
> > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > in ndo_start_xmit.
> > > >
> > > > Right.
> > > >
> > > > >
> > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > queues when switching between modes.
> > > >
> > > > What are "subtle issues" and if there are any, we find them.
> > >
> > > A single runtime test is not sufficient to exercise all edge cases.
> > >
> > > Please don't leave it to reviewers to establish the correctness of a
> > > patch.
> >
> > +1
> >
> [...]
> > And instead of trying to do this, it would be much better to optimize
> > the NAPI performance. Then we can drop the orphan mode.
>
> Do you mean when to call skb_orphan()? If yes, I just want to provide
> more information that we also have some performance issues where the
> flow control takes a bad effect, especially under some small
> throughput in our production environment.

I think you need to describe it in detail.

> What strikes me as odd is if I restart the network, then the issue
> will go with the wind. I cannot reproduce it in my testing machine.
> One more thing: if I force skb_orphan() the current skb in every
> start_xmit(), it could also solve the issue but not in a proper way.
> After all, it drops the flow control... :S

Yes, that's the known issue.

Thanks

>
> Thanks,
> Jason
> >
> > >
> > > The napi_tx and non-napi code paths differ in how they handle at least
> > > the following structures:
> > >
> > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > > needed as delay until the next ndo_start_xmit and thus completion is
> > > unbounded.
> > >
> > > When switching to napi mode, orphaned skbs may now be cleaned by the
> > > napi handler. This is indeed safe.
> > >
> > > When switching from napi to non-napi, the unbound latency resurfaces.
> > > It is a small edge case, and I think a potentially acceptable risk, if
> > > the user of this knob is aware of the risk.
> > >
> > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > > the interrupt (disables the mask) when available descriptors falls
> > > beneath a low watermark, and reenables when it recovers above a high
> > > watermark. Napi disables when napi is scheduled, and reenables on
> > > napi complete.
> > >
> > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > > a low watermark, the driver stops the stack for queuing more packets.
> > > In napi mode, it schedules napi to clean packets. See the calls to
> > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > > netif_tx_wake_queue.
> > >
> > > Some if this can be assumed safe by looking at existing analogous
> > > code, such as the queue stop/start in virtnet_tx_resize.
> > >
> > > But that all virtqueue callback and dev_queue->state transitions are
> > > correct when switching between modes at runtime is not trivial to
> > > establish, deserves some thought and explanation in the commit
> > > message.
> >
> > Thanks
> >
> >
>
Jason Xing Dec. 25, 2023, 6:33 a.m. UTC | #12
On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > Hello Jason,
> > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > Heng Qi wrote:
> > > > >
> > > > >
> > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > > Heng Qi wrote:
> > > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > > >> module parameter, and the other is through coalescing parameter
> > > > > >> settings (provided that the nic status is down).
> > > > > >>
> > > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > > >> then we need to switch napi_tx when debugging. However, the
> > > > > >> existing methods are a bit troublesome, such as needing to
> > > > > >> reload the driver or turn off the network card.
> > >
> > > Why is this troublesome? We don't need to turn off the card, it's just
> > > a toggling of the interface.
> > >
> > > This ends up with pretty simple code.
> > >
> > > > So try to make
> > > > > >> this update.
> > > > > >>
> > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > The commit does not explain why it is safe to do so.
> > > > >
> > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > > no new tx napi will be scheduled.
> > > > >
> > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > > send the packet.
> > > > >
> > > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > > >
> > > > > >
> > > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > > in ndo_start_xmit.
> > > > >
> > > > > Right.
> > > > >
> > > > > >
> > > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > > queues when switching between modes.
> > > > >
> > > > > What are "subtle issues" and if there are any, we find them.
> > > >
> > > > A single runtime test is not sufficient to exercise all edge cases.
> > > >
> > > > Please don't leave it to reviewers to establish the correctness of a
> > > > patch.
> > >
> > > +1
> > >
> > [...]
> > > And instead of trying to do this, it would be much better to optimize
> > > the NAPI performance. Then we can drop the orphan mode.
> >
[...]
> > Do you mean when to call skb_orphan()? If yes, I just want to provide
> > more information that we also have some performance issues where the
> > flow control takes a bad effect, especially under some small
> > throughput in our production environment.
>
> I think you need to describe it in detail.

Some of the details were described below in the last email. The
decreased performance happened because of flow control: delaying the
skb free delays the decrease of sk_wmem_alloc, which then hits the
limit of the TSQ mechanism, finally causing slow transmission in
the TCP layer.

>
> > What strikes me as odd is if I restart the network, then the issue
> > will go with the wind. I cannot reproduce it in my testing machine.
> > One more thing: if I force skb_orphan() the current skb in every
> > start_xmit(), it could also solve the issue but not in a proper way.
> > After all, it drops the flow control... :S
>
> Yes, that's the known issue.

Really? Did you have some numbers or have some discussion links to
share? I failed to reproduce on my testing machine, probably the short
rtt is the key/obstacle.

@Eric, it seems it still exists.

Thanks,
Jason

>
> Thanks
>
> >
> > Thanks,
> > Jason
> > >
> > > >
> > > > The napi_tx and non-napi code paths differ in how they handle at least
> > > > the following structures:
> > > >
> > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > > > needed as delay until the next ndo_start_xmit and thus completion is
> > > > unbounded.
> > > >
> > > > When switching to napi mode, orphaned skbs may now be cleaned by the
> > > > napi handler. This is indeed safe.
> > > >
> > > > When switching from napi to non-napi, the unbound latency resurfaces.
> > > > It is a small edge case, and I think a potentially acceptable risk, if
> > > > the user of this knob is aware of the risk.
> > > >
> > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > > > the interrupt (disables the mask) when available descriptors falls
> > > > beneath a low watermark, and reenables when it recovers above a high
> > > > watermark. Napi disables when napi is scheduled, and reenables on
> > > > napi complete.
> > > >
> > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > > > a low watermark, the driver stops the stack for queuing more packets.
> > > > In napi mode, it schedules napi to clean packets. See the calls to
> > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > > > netif_tx_wake_queue.
> > > >
> > > > > Some of this can be assumed safe by looking at existing analogous
> > > > code, such as the queue stop/start in virtnet_tx_resize.
> > > >
> > > > But that all virtqueue callback and dev_queue->state transitions are
> > > > correct when switching between modes at runtime is not trivial to
> > > > establish, deserves some thought and explanation in the commit
> > > > message.
> > >
> > > Thanks
> > >
> > >
> >
>
Jason Wang Dec. 25, 2023, 6:44 a.m. UTC | #13
On Mon, Dec 25, 2023 at 2:34 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> > >
> > > Hello Jason,
> > > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > >
> > > > > Heng Qi wrote:
> > > > > >
> > > > > >
> > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > > > Heng Qi wrote:
> > > > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > > > >> module parameter, and the other is through coalescing parameter
> > > > > > >> settings (provided that the nic status is down).
> > > > > > >>
> > > > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > > > >> then we need to switch napi_tx when debugging. However, the
> > > > > > >> existing methods are a bit troublesome, such as needing to
> > > > > > >> reload the driver or turn off the network card.
> > > >
> > > > Why is this troublesome? We don't need to turn off the card, it's just
> > > > a toggling of the interface.
> > > >
> > > > This ends up with pretty simple code.
> > > >
> > > > > So try to make
> > > > > > >> this update.
> > > > > > >>
> > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > The commit does not explain why it is safe to do so.
> > > > > >
> > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > > > no new tx napi will be scheduled.
> > > > > >
> > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > > > send the packet.
> > > > > >
> > > > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > > > >
> > > > > > >
> > > > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > > > in ndo_start_xmit.
> > > > > >
> > > > > > Right.
> > > > > >
> > > > > > >
> > > > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > > > queues when switching between modes.
> > > > > >
> > > > > > What are "subtle issues" and if there are any, we find them.
> > > > >
> > > > > A single runtime test is not sufficient to exercise all edge cases.
> > > > >
> > > > > Please don't leave it to reviewers to establish the correctness of a
> > > > > patch.
> > > >
> > > > +1
> > > >
> > > [...]
> > > > And instead of trying to do this, it would be much better to optimize
> > > > the NAPI performance. Then we can drop the orphan mode.
> > >
> [...]
> > > Do you mean when to call skb_orphan()? If yes, I just want to provide
> > > more information that we also have some performance issues where the
> > > flow control takes a bad effect, especially under some small
> > > throughput in our production environment.
> >
> > I think you need to describe it in detail.
>
> Some of the details were described below in the last email. The
> decreased performance happened because of flow control: the delay of
> skb free means the delay

What do you mean by delay here? Is it an interrupt delay? If yes, does
it work better if you simply replace

virtqueue_enable_cb_delayed() with virtqueue_enable_cb()? The
former may delay the interrupt more or less depending on the traffic.

> of decreasing of sk_wmem_alloc, then it will
> hit the limit of TSQ mechanism, finally causing transmitting slowly in
> the TCP layer.

TSQ might work better with BQL which virtio-net doesn't have right now.

>
> >
> > > What strikes me as odd is if I restart the network, then the issue
> > > will go with the wind. I cannot reproduce it in my testing machine.
> > > One more thing: if I force skb_orphan() the current skb in every
> > > start_xmit(), it could also solve the issue but not in a proper way.
> > > After all, it drops the flow control... :S
> >
> > Yes, that's the known issue.
>
> Really? Did you have some numbers or have some discussion links to
> share? I failed to reproduce on my testing machine, probably the short
> rtt is the key/obstacle.

I basically mean it is a known side effect of skb_orphan() as it might
decrease sk_wmem_alloc too early.

Thanks


>
> @Eric, it seems it still exists.
>
> Thanks,
> Jason
>
> >
> > Thanks
> >
> > >
> > > Thanks,
> > > Jason
> > > >
> > > > >
> > > > > The napi_tx and non-napi code paths differ in how they handle at least
> > > > > the following structures:
> > > > >
> > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > > > > needed as delay until the next ndo_start_xmit and thus completion is
> > > > > unbounded.
> > > > >
> > > > > When switching to napi mode, orphaned skbs may now be cleaned by the
> > > > > napi handler. This is indeed safe.
> > > > >
> > > > > When switching from napi to non-napi, the unbound latency resurfaces.
> > > > > It is a small edge case, and I think a potentially acceptable risk, if
> > > > > the user of this knob is aware of the risk.
> > > > >
> > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > > > > the interrupt (disables the mask) when available descriptors falls
> > > > > beneath a low watermark, and reenables when it recovers above a high
> > > > > watermark. Napi disables when napi is scheduled, and reenables on
> > > > > napi complete.
> > > > >
> > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > > > > a low watermark, the driver stops the stack for queuing more packets.
> > > > > In napi mode, it schedules napi to clean packets. See the calls to
> > > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > > > > netif_tx_wake_queue.
> > > > >
> > > > > Some of this can be assumed safe by looking at existing analogous
> > > > > code, such as the queue stop/start in virtnet_tx_resize.
> > > > >
> > > > > But that all virtqueue callback and dev_queue->state transitions are
> > > > > correct when switching between modes at runtime is not trivial to
> > > > > establish, deserves some thought and explanation in the commit
> > > > > message.
> > > >
> > > > Thanks
> > > >
> > > >
> > >
> >
>
Jason Wang Dec. 25, 2023, 6:47 a.m. UTC | #14
On Mon, Dec 25, 2023 at 2:44 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Mon, Dec 25, 2023 at 2:34 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> > > >
> > > > Hello Jason,
> > > > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > > >
> > > > > > Heng Qi wrote:
> > > > > > >
> > > > > > >
> > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > > > > Heng Qi wrote:
> > > > > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > > > > >> module parameter, and the other is through coalescing parameter
> > > > > > > >> settings (provided that the nic status is down).
> > > > > > > >>
> > > > > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > > > > >> then we need to switch napi_tx when debugging. However, the
> > > > > > > >> existing methods are a bit troublesome, such as needing to
> > > > > > > >> reload the driver or turn off the network card.
> > > > >
> > > > > Why is this troublesome? We don't need to turn off the card, it's just
> > > > > a toggling of the interface.
> > > > >
> > > > > This ends up with pretty simple code.
> > > > >
> > > > > > So try to make
> > > > > > > >> this update.
> > > > > > > >>
> > > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > The commit does not explain why it is safe to do so.
> > > > > > >
> > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > > > > no new tx napi will be scheduled.
> > > > > > >
> > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > > > > send the packet.
> > > > > > >
> > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > > > > >
> > > > > > > >
> > > > > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > > > > in ndo_start_xmit.
> > > > > > >
> > > > > > > Right.
> > > > > > >
> > > > > > > >
> > > > > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > > > > queues when switching between modes.
> > > > > > >
> > > > > > > What are "subtle issues" and if there are any, we find them.
> > > > > >
> > > > > > A single runtime test is not sufficient to exercise all edge cases.
> > > > > >
> > > > > > Please don't leave it to reviewers to establish the correctness of a
> > > > > > patch.
> > > > >
> > > > > +1
> > > > >
> > > > [...]
> > > > > And instead of trying to do this, it would be much better to optimize
> > > > > the NAPI performance. Then we can drop the orphan mode.
> > > >
> > [...]
> > > > Do you mean when to call skb_orphan()? If yes, I just want to provide
> > > > more information that we also have some performance issues where the
> > > > flow control takes a bad effect, especially under some small
> > > > throughput in our production environment.
> > >
> > > I think you need to describe it in detail.
> >
> > Some of the details were described below in the last email. The
> > decreased performance happened because of flow control: the delay of
> > skb free means the delay
>
> What do you mean by delay here? Is it an interrupt delay? If yes, Does
> it work better if you simply remove
>
> virtqueue_enable_cb_delayed() with virtqueue_enable_cb()? As the
> former may delay the interrupt more or less depend on the traffic.
>
> > of decreasing of sk_wmem_alloc, then it will
> > hit the limit of TSQ mechanism, finally causing transmitting slowly in
> > the TCP layer.
>
> TSQ might work better with BQL which virtio-net doesn't have right now.
>
> >
> > >
> > > > What strikes me as odd is if I restart the network, then the issue
> > > > will go with the wind. I cannot reproduce it in my testing machine.
> > > > One more thing: if I force skb_orphan() the current skb in every
> > > > start_xmit(), it could also solve the issue but not in a proper way.
> > > > After all, it drops the flow control... :S
> > >
> > > Yes, that's the known issue.

Just to clarify, I mean that skb_orphan() dropping flow control is a
known issue, not the restart...

Thanks

> >
> > Really? Did you have some numbers or have some discussion links to
> > share? I failed to reproduce on my testing machine, probably the short
> > rtt is the key/obstacle.
>
> I basically mean it is a known side effect of skb_orphan() as it might
> decrease sk_wmem_alloc too early.
>
> Thanks
>
>
> >
> > @Eric, it seems it still exists.
> >
> > Thanks,
> > Jason
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks,
> > > > Jason
> > > > >
> > > > > >
> > > > > > The napi_tx and non-napi code paths differ in how they handle at least
> > > > > > the following structures:
> > > > > >
> > > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > > > > > needed as delay until the next ndo_start_xmit and thus completion is
> > > > > > unbounded.
> > > > > >
> > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the
> > > > > > napi handler. This is indeed safe.
> > > > > >
> > > > > > When switching from napi to non-napi, the unbound latency resurfaces.
> > > > > > It is a small edge case, and I think a potentially acceptable risk, if
> > > > > > the user of this knob is aware of the risk.
> > > > > >
> > > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > > > > > the interrupt (disables the mask) when available descriptors falls
> > > > > > beneath a low watermark, and reenables when it recovers above a high
> > > > > > watermark. Napi disables when napi is scheduled, and reenables on
> > > > > > napi complete.
> > > > > >
> > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > > > > > a low watermark, the driver stops the stack for queuing more packets.
> > > > > > In napi mode, it schedules napi to clean packets. See the calls to
> > > > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > > > > > netif_tx_wake_queue.
> > > > > >
> > > > > > Some of this can be assumed safe by looking at existing analogous
> > > > > > code, such as the queue stop/start in virtnet_tx_resize.
> > > > > >
> > > > > > But that all virtqueue callback and dev_queue->state transitions are
> > > > > > correct when switching between modes at runtime is not trivial to
> > > > > > establish, deserves some thought and explanation in the commit
> > > > > > message.
> > > > >
> > > > > Thanks
> > > > >
> > > > >
> > > >
> > >
> >
Jason Xing Dec. 25, 2023, 7:52 a.m. UTC | #15
On Mon, Dec 25, 2023 at 2:45 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Mon, Dec 25, 2023 at 2:34 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > On Mon, Dec 25, 2023 at 12:14 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Mon, Dec 25, 2023 at 10:25 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> > > >
> > > > Hello Jason,
> > > > On Fri, Dec 22, 2023 at 10:36 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > > >
> > > > > > Heng Qi wrote:
> > > > > > >
> > > > > > >
> > > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > > > > Heng Qi wrote:
> > > > > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > > > > >> module parameter, and the other is through coalescing parameter
> > > > > > > >> settings (provided that the nic status is down).
> > > > > > > >>
> > > > > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > > > > >> then we need to switch napi_tx when debugging. However, the
> > > > > > > >> existing methods are a bit troublesome, such as needing to
> > > > > > > >> reload the driver or turn off the network card.
> > > > >
> > > > > Why is this troublesome? We don't need to turn off the card, it's just
> > > > > a toggling of the interface.
> > > > >
> > > > > This ends up with pretty simple code.
> > > > >
> > > > > > So try to make
> > > > > > > >> this update.
> > > > > > > >>
> > > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > The commit does not explain why it is safe to do so.
> > > > > > >
> > > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > > > > no new tx napi will be scheduled.
> > > > > > >
> > > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > > > > send the packet.
> > > > > > >
> > > > > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > > > > >
> > > > > > > >
> > > > > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > > > > in ndo_start_xmit.
> > > > > > >
> > > > > > > Right.
> > > > > > >
> > > > > > > >
> > > > > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > > > > queues when switching between modes.
> > > > > > >
> > > > > > > What are "subtle issues" and if there are any, we find them.
> > > > > >
> > > > > > A single runtime test is not sufficient to exercise all edge cases.
> > > > > >
> > > > > > Please don't leave it to reviewers to establish the correctness of a
> > > > > > patch.
> > > > >
> > > > > +1
> > > > >
> > > > [...]
> > > > > And instead of trying to do this, it would be much better to optimize
> > > > > the NAPI performance. Then we can drop the orphan mode.
> > > >
> > [...]
> > > > Do you mean when to call skb_orphan()? If yes, I just want to provide
> > > > more information that we also have some performance issues where the
> > > > flow control takes a bad effect, especially under some small
> > > > throughput in our production environment.
> > >
> > > I think you need to describe it in detail.
> >
> > Some of the details were described below in the last email. The
> > decreased performance happened because of flow control: the delay of
> > skb free means the delay
>
> What do you mean by delay here? Is it an interrupt delay? If yes, Does
> it work better if you simply remove

Delay means the interval from start_xmit() to the skb free. I collected
some numbers and found that some of them have very long intervals,
which might be normal?

$ sudo bpftrace txcompletion.bt
Attaching 4 probes...
trace from virtio start_xmit() to tcp_wfree() longer than 0 ns
^C

@average: 384386
@count: 348302
@hist:
[1K, 2K)            9551 |@@@@                                                |
[2K, 4K)          116513 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K)           88295 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@             |
[8K, 16K)          39965 |@@@@@@@@@@@@@@@@@                                   |
[16K, 32K)         39584 |@@@@@@@@@@@@@@@@@                                   |
[32K, 64K)         53970 |@@@@@@@@@@@@@@@@@@@@@@@@                            |
[64K, 128K)          415 |                                                    |
[128K, 256K)           0 |                                                    |
[256K, 512K)           0 |                                                    |
[512K, 1M)             0 |                                                    |
[1M, 2M)               0 |                                                    |
[2M, 4M)               0 |                                                    |
[4M, 8M)               0 |                                                    |
[8M, 16M)              0 |                                                    |
[16M, 32M)             0 |                                                    |
[32M, 64M)             0 |                                                    |
[64M, 128M)            0 |                                                    |
[128M, 256M)           0 |                                                    |
[256M, 512M)           0 |                                                    |
[512M, 1G)             0 |                                                    |
[1G, 2G)               0 |                                                    |
[2G, 4G)               9 |                                                    |

>
> virtqueue_enable_cb_delayed() with virtqueue_enable_cb()? As the
> former may delay the interrupt more or less depend on the traffic.

Due to the complexity of the production environment, I suspect the
interrupt could be delayed on the host.

Thanks for the suggestion.

>
> > of decreasing of sk_wmem_alloc, then it will
> > hit the limit of TSQ mechanism, finally causing transmitting slowly in
> > the TCP layer.
>
> TSQ might work better with BQL which virtio-net doesn't have right now.
>
> >
> > >
> > > > What strikes me as odd is if I restart the network, then the issue
> > > > will go with the wind. I cannot reproduce it in my testing machine.
> > > > One more thing: if I force skb_orphan() the current skb in every
> > > > start_xmit(), it could also solve the issue but not in a proper way.
> > > > After all, it drops the flow control... :S
> > >
> > > Yes, that's the known issue.
> >
> > Really? Did you have some numbers or have some discussion links to
> > share? I failed to reproduce on my testing machine, probably the short
> > rtt is the key/obstacle.
>
> I basically mean it is a known side effect of skb_orphan() as it might
> decrease sk_wmem_alloc too early.

Oh, I got it wrong :(

>
> Thanks
>
>
> >
> > @Eric, it seems it still exists.
> >
> > Thanks,
> > Jason
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks,
> > > > Jason
> > > > >
> > > > > >
> > > > > > The napi_tx and non-napi code paths differ in how they handle at least
> > > > > > the following structures:
> > > > > >
> > > > > > 1. skb: non-napi orphans these in ndo_start_xmit. Without napi this is
> > > > > > needed as delay until the next ndo_start_xmit and thus completion is
> > > > > > unbounded.
> > > > > >
> > > > > > When switching to napi mode, orphaned skbs may now be cleaned by the
> > > > > > napi handler. This is indeed safe.
> > > > > >
> > > > > > When switching from napi to non-napi, the unbound latency resurfaces.
> > > > > > It is a small edge case, and I think a potentially acceptable risk, if
> > > > > > the user of this knob is aware of the risk.
> > > > > >
> > > > > > 2. virtqueue callback ("interrupt" masking). The non-napi path enables
> > > > > > the interrupt (disables the mask) when available descriptors falls
> > > > > > beneath a low watermark, and reenables when it recovers above a high
> > > > > > watermark. Napi disables when napi is scheduled, and reenables on
> > > > > > napi complete.
> > > > > >
> > > > > > 3. dev_queue->state (QUEUE_STATE_DRV_XOFF). if the ring falls below
> > > > > > a low watermark, the driver stops the stack for queuing more packets.
> > > > > > In napi mode, it schedules napi to clean packets. See the calls to
> > > > > > netif_xmit_stopped, netif_stop_subqueue, netif_start_subqueue and
> > > > > > netif_tx_wake_queue.
> > > > > >
> > > > > > Some of this can be assumed safe by looking at existing analogous
> > > > > > code, such as the queue stop/start in virtnet_tx_resize.
> > > > > >
> > > > > > But that all virtqueue callback and dev_queue->state transitions are
> > > > > > correct when switching between modes at runtime is not trivial to
> > > > > > establish, deserves some thought and explanation in the commit
> > > > > > message.
> > > > >
> > > > > Thanks
> > > > >
> > > > >
> > > >
> > >
> >
>
Michael S. Tsirkin Dec. 25, 2023, 8:03 a.m. UTC | #16
On Mon, Dec 25, 2023 at 12:12:48PM +0800, Jason Wang wrote:
> On Fri, Dec 22, 2023 at 4:14 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote:
> > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > Heng Qi wrote:
> > > > >
> > > > >
> > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > > Heng Qi wrote:
> > > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > > >> module parameter, and the other is through coalescing parameter
> > > > > >> settings (provided that the nic status is down).
> > > > > >>
> > > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > > >> then we need to switch napi_tx when debugging. However, the
> > > > > >> existing methods are a bit troublesome, such as needing to
> > > > > >> reload the driver or turn off the network card.
> > >
> > > Why is this troublesome? We don't need to turn off the card, it's just
> > > a toggling of the interface.
> > >
> > > This ends up with pretty simple code.
> > >
> > > > So try to make
> > > > > >> this update.
> > > > > >>
> > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > The commit does not explain why it is safe to do so.
> > > > >
> > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > > no new tx napi will be scheduled.
> > > > >
> > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > > send the packet.
> > > > >
> > > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > > >
> > > > > >
> > > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > > in ndo_start_xmit.
> > > > >
> > > > > Right.
> > > > >
> > > > > >
> > > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > > queues when switching between modes.
> > > > >
> > > > > What are "subtle issues" and if there are any, we find them.
> > > >
> > > > A single runtime test is not sufficient to exercise all edge cases.
> > > >
> > > > Please don't leave it to reviewers to establish the correctness of a
> > > > patch.
> > >
> > > +1
> > >
> > > And instead of trying to do this, it would be much better to optimize
> > > the NAPI performance. Then we can drop the orphan mode.
> >
> > "To address your problem, optimize our code to the level which we
> > couldn't achieve in more than 10 years".
> 
> Last time QE didn't report any issue for TCP. For others, the code
> might just need some optimization if it really matters, it's just
> because nobody has worked on this part in the past years.

You think nobody worked on performance of virtio net because nobody
could bother? I think it's just micro optimized to a level where
progress is difficult.

> The ethtool trick is just for debugging purposes, I can hardly believe
> it is used by any management layer software.

It's UAPI - someone somewhere uses it, management layer or not.
Jason Wang Dec. 26, 2023, 3:32 a.m. UTC | #17
On Mon, Dec 25, 2023 at 4:03 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Dec 25, 2023 at 12:12:48PM +0800, Jason Wang wrote:
> > On Fri, Dec 22, 2023 at 4:14 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, Dec 22, 2023 at 10:35:07AM +0800, Jason Wang wrote:
> > > > On Thu, Dec 21, 2023 at 11:06 PM Willem de Bruijn
> > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > >
> > > > > Heng Qi wrote:
> > > > > >
> > > > > >
> > > > > > 在 2023/12/20 下午10:45, Willem de Bruijn 写道:
> > > > > > > Heng Qi wrote:
> > > > > > >> virtio-net has two ways to switch napi_tx: one is through the
> > > > > > >> module parameter, and the other is through coalescing parameter
> > > > > > >> settings (provided that the nic status is down).
> > > > > > >>
> > > > > > >> Sometimes we face performance regression caused by napi_tx,
> > > > > > >> then we need to switch napi_tx when debugging. However, the
> > > > > > >> existing methods are a bit troublesome, such as needing to
> > > > > > >> reload the driver or turn off the network card.
> > > >
> > > > Why is this troublesome? We don't need to turn off the card, it's just
> > > > a toggling of the interface.
> > > >
> > > > This ends up with pretty simple code.
> > > >
> > > > > So try to make
> > > > > > >> this update.
> > > > > > >>
> > > > > > >> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > > > > > >> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > The commit does not explain why it is safe to do so.
> > > > > >
> > > > > > virtnet_napi_tx_disable ensures that already scheduled tx napi ends and
> > > > > > no new tx napi will be scheduled.
> > > > > >
> > > > > > Afterwards, if the __netif_tx_lock_bh lock is held, the stack cannot
> > > > > > send the packet.
> > > > > >
> > > > > > Then we can safely toggle the weight to indicate where to clear the buffers.
> > > > > >
> > > > > > >
> > > > > > > The tx-napi weights are not really weights: it is a boolean whether
> > > > > > > napi is used for transmit cleaning, or whether packets are cleaned
> > > > > > > in ndo_start_xmit.
> > > > > >
> > > > > > Right.
> > > > > >
> > > > > > >
> > > > > > > There certainly are some subtle issues with regard to pausing/waking
> > > > > > > queues when switching between modes.
> > > > > >
> > > > > > What are "subtle issues" and if there are any, we find them.
> > > > >
> > > > > A single runtime test is not sufficient to exercise all edge cases.
> > > > >
> > > > > Please don't leave it to reviewers to establish the correctness of a
> > > > > patch.
> > > >
> > > > +1
> > > >
> > > > And instead of trying to do this, it would be much better to optimize
> > > > the NAPI performance. Then we can drop the orphan mode.
> > >
> > > "To address your problem, optimize our code to the level which we
> > > couldn't achieve in more than 10 years".
> >
> > Last time QE didn't report any issue for TCP. For others, the code
> > might just need some optimization if it really matters, it's just
> > because nobody has worked on this part in the past years.
>
> You think nobody worked on performance of virtio net because nobody
> could bother?

No, I just describe what I've seen from the list. No patches were
posted for performance optimization in recent years.

> I think it's just micro optimized to a level where
> progress is difficult.
>
>

Maybe.

Let me clarify what I meant.

If people can help to optimize the NAPI to be as fast as orphans or
something like 80%-90% of orphans. It should be sufficient to remove
the orphan path completely. This allows a lot of good features to be
built easily on top.

To me, it seems fine to remove it even now as it breaks socket
accounting which is actually a bug and NAPI has been enabled for years
but if you don't wish to do that, that's fine.  We can wait for the
coalescing and dim to be used for a while and revisit this.

Thanks
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 10614e9f7cad..12f8e1f9971c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3559,16 +3559,37 @@  static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
 	return 0;
 }
 
-static int virtnet_should_update_vq_weight(int dev_flags, int weight,
-					   int vq_weight, bool *should_update)
+static void virtnet_switch_napi_tx(struct virtnet_info *vi, u32 qstart,
+				   u32 qend, u32 tx_frames)
 {
-	if (weight ^ vq_weight) {
-		if (dev_flags & IFF_UP)
-			return -EBUSY;
-		*should_update = true;
-	}
+	struct net_device *dev = vi->dev;
+	int new_weight, cur_weight;
+	struct netdev_queue *txq;
+	struct send_queue *sq;
 
-	return 0;
+	new_weight = tx_frames ? NAPI_POLL_WEIGHT : 0;	/* 0 disables tx NAPI (orphan mode) */
+	for (; qstart < qend; qstart++) {
+		sq = &vi->sq[qstart];
+		cur_weight = sq->napi.weight;
+		if (new_weight == cur_weight)
+			continue;
+
+		if (!(dev->flags & IFF_UP)) {	/* device down: no poller, flip directly */
+			sq->napi.weight = new_weight;
+			continue;
+		}
+
+		if (cur_weight)
+			virtnet_napi_tx_disable(&sq->napi);
+
+		txq = netdev_get_tx_queue(dev, qstart);
+		__netif_tx_lock_bh(txq);
+		sq->napi.weight = new_weight;	/* publish under tx lock vs. xmit path */
+		__netif_tx_unlock_bh(txq);
+
+		if (!cur_weight)
+			virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
+	}
 }
 
 static int virtnet_set_coalesce(struct net_device *dev,
@@ -3577,25 +3598,11 @@  static int virtnet_set_coalesce(struct net_device *dev,
 				struct netlink_ext_ack *extack)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	int ret, queue_number, napi_weight;
-	bool update_napi = false;
-
-	/* Can't change NAPI weight if the link is up */
-	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
-	for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) {
-		ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
-						      vi->sq[queue_number].napi.weight,
-						      &update_napi);
-		if (ret)
-			return ret;
-
-		if (update_napi) {
-			/* All queues that belong to [queue_number, vi->max_queue_pairs] will be
-			 * updated for the sake of simplicity, which might not be necessary
-			 */
-			break;
-		}
-	}
+	int ret;
+
+	/* tx_max_coalesced_frames also switches napi_tx on all queues (0 = orphan mode) */
+	virtnet_switch_napi_tx(vi, 0, vi->max_queue_pairs,
+			       ec->tx_max_coalesced_frames);
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
 		ret = virtnet_send_notf_coal_cmds(vi, ec);
@@ -3605,11 +3612,6 @@  static int virtnet_set_coalesce(struct net_device *dev,
 	if (ret)
 		return ret;
 
-	if (update_napi) {
-		for (; queue_number < vi->max_queue_pairs; queue_number++)
-			vi->sq[queue_number].napi.weight = napi_weight;
-	}
-
 	return ret;
 }
 
@@ -3641,19 +3643,13 @@  static int virtnet_set_per_queue_coalesce(struct net_device *dev,
 					  struct ethtool_coalesce *ec)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	int ret, napi_weight;
-	bool update_napi = false;
+	int ret;
 
 	if (queue >= vi->max_queue_pairs)
 		return -EINVAL;
 
-	/* Can't change NAPI weight if the link is up */
-	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
-	ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
-					      vi->sq[queue].napi.weight,
-					      &update_napi);
-	if (ret)
-		return ret;
+	/* tx_frames switches napi_tx for this queue; range is [queue, queue + 1) */
+	virtnet_switch_napi_tx(vi, queue, queue + 1, ec->tx_max_coalesced_frames);
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
 		ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue);
@@ -3663,9 +3659,6 @@  static int virtnet_set_per_queue_coalesce(struct net_device *dev,
 	if (ret)
 		return ret;
 
-	if (update_napi)
-		vi->sq[queue].napi.weight = napi_weight;
-
 	return 0;
 }